/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 *  being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in fused_iovs. */
	int fused_iovpos;

	/** Offset in current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track of whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track of whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};
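
/* The structure above is the per-I/O driver context. The bdev layer reserves
 * bdev_nvme_get_ctx_size() bytes in every spdk_bdev_io it hands to this module,
 * and the code below recovers it with a cast that appears throughout this file:
 *
 *     struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
 */
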
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}
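
/* A struct nvme_bdev_ctrlr groups every nvme_ctrlr (one per connected path/trid)
 * that was created under the same name, which is how multipath configurations are
 * represented in this module. The helpers above resolve a member controller by
 * transport ID or by controller ID (CNTLID); the helper below resolves a namespace
 * bdev by NSID.
 */
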
static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}
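
/* Teardown order, for reference: nvme_ctrlr_release() drops the last reference and,
 * once nvme_ctrlr_can_be_unregistered() agrees, nvme_ctrlr_unregister() unregisters the
 * io_device. Its callback calls nvme_ctrlr_delete() above, which starts an asynchronous
 * detach driven by nvme_detach_poller(); _nvme_ctrlr_delete() finally frees the object.
 */
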
static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_qpair->ctrlr->ctrlr) !=
	    SPDK_NVME_QPAIR_FAILURE_NONE) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}
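
/* The path selectors below consider an io_path usable only if its qpair is connected
 * and its namespace reports an accessible ANA state (optimized or non-optimized).
 * Optimized paths are always preferred; non-optimized paths are kept as a fallback,
 * and inaccessible or transitioning paths are skipped entirely.
 */
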
/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}
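
/* Summary of the dispatch above (descriptive only):
 *   ACTIVE_PASSIVE               - reuse the cached path until it is cleared.
 *   ACTIVE_ACTIVE + ROUND_ROBIN  - reuse the cached path for rr_min_io I/Os, then
 *                                  advance through the circular io_path list.
 *   ACTIVE_ACTIVE + QUEUE_DEPTH  - pick the path with the fewest outstanding
 *                                  requests on every submission; nothing is cached.
 */
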
/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				delay_us);
	}

	return SPDK_POLLER_BUSY;
}
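
/* retry_io_list is kept sorted by retry_ticks: the poller above drains every entry
 * whose deadline has passed and then re-arms itself for the earliest remaining
 * deadline, converting ticks back to microseconds. For example, with a 1 GHz tick
 * rate and a deadline 2,000,000 ticks away, delay_us works out to roughly 2000.
 * The helper below preserves the ordering by walking the list from the tail.
 */
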
static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
			delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io_to_abort;

	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			__bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}
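
/* err_stat above is a per-bdev histogram of failed completions, bucketed first by
 * status code type (SCT) and then by status code. It is only collected when the
 * structure has been allocated, which (as suggested by the nvme_error_stat flag in
 * g_opts) appears to be an opt-in feature; when disabled the function returns
 * immediately.
 */
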
static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}
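
/* When a failed command reports a Command Retry Delay (CRD) value, the helper above
 * converts the controller's corresponding CRDT entry into milliseconds; CRDT fields
 * are defined by the NVMe specification in units of 100 ms, hence the multiplication
 * by 100 before the retry is queued.
 */
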
1363 */ 1364 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1365 1366 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1367 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1368 goto complete; 1369 } 1370 1371 /* At this point we don't know whether the sequence was successfully executed or not, so we 1372 * cannot retry the IO */ 1373 if (bdev_io->u.bdev.accel_sequence != NULL) { 1374 goto complete; 1375 } 1376 1377 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1378 1379 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1380 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1381 return; 1382 } 1383 1384 complete: 1385 bio->retry_count = 0; 1386 bio->submit_tsc = 0; 1387 bdev_io->u.bdev.accel_sequence = NULL; 1388 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1389 } 1390 1391 static inline void 1392 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1393 { 1394 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1395 struct nvme_bdev_channel *nbdev_ch; 1396 enum spdk_bdev_io_status io_status; 1397 1398 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1399 1400 switch (rc) { 1401 case 0: 1402 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1403 break; 1404 case -ENOMEM: 1405 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1406 break; 1407 case -ENXIO: 1408 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1409 1410 bdev_nvme_clear_current_io_path(nbdev_ch); 1411 bio->io_path = NULL; 1412 1413 if (any_io_path_may_become_available(nbdev_ch)) { 1414 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1415 return; 1416 } 1417 1418 /* fallthrough */ 1419 default: 1420 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1421 bdev_io->u.bdev.accel_sequence = NULL; 1422 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1423 break; 1424 } 1425 1426 bio->retry_count = 0; 1427 bio->submit_tsc = 0; 1428 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1429 } 1430 1431 static inline void 1432 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1433 { 1434 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1435 enum spdk_bdev_io_status io_status; 1436 1437 switch (rc) { 1438 case 0: 1439 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1440 break; 1441 case -ENOMEM: 1442 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1443 break; 1444 case -ENXIO: 1445 /* fallthrough */ 1446 default: 1447 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1448 break; 1449 } 1450 1451 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1452 } 1453 1454 static void 1455 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1456 { 1457 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1458 1459 pthread_mutex_lock(&nvme_ctrlr->mutex); 1460 1461 assert(nvme_ctrlr->io_path_cache_clearing == true); 1462 nvme_ctrlr->io_path_cache_clearing = false; 1463 1464 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1465 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1466 return; 1467 } 1468 1469 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1470 1471 nvme_ctrlr_unregister(nvme_ctrlr); 1472 } 1473 1474 static void 1475 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1476 { 1477 struct nvme_io_path *io_path; 1478 1479 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1480 if (io_path->nbdev_ch == NULL) { 1481 continue; 1482 } 1483 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1484 } 1485 } 1486 1487 static void 1488 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter 
static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr, false);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct spdk_bdev_io *bdev_io;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, status, NULL);
	}

	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid to the active trid within a controller if one exists.
 *
 * The purpose of the boolean return value is to request the caller to disconnect
 * the current trid now to try connecting the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. It means the trid is failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
		 * or used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff has passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds. */
	return false;
}
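
/* Illustrative example of the rotation above (hypothetical trids): with the list
 * A -> B -> C and A active, a failover that keeps the old path ("remove" == false)
 * produces B -> C -> A with B active, so repeated failovers round-robin through all
 * registered trids. With "remove" == true the old entry A is freed instead.
 */
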
static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
	OP_FAILOVER,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;
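
/* How the values above are acted upon (see the switch in _bdev_nvme_reset_ctrlr_complete()):
 *   OP_NONE                      - nothing further to do.
 *   OP_COMPLETE_PENDING_DESTRUCT - finish a destruct that was deferred by the reset.
 *   OP_DESTRUCT                  - the ctrlr-loss timeout expired; delete the controller.
 *   OP_DELAYED_RECONNECT         - disconnect now and retry after reconnect_delay_sec.
 *   OP_FAILOVER                  - a failover was requested while the reset was running.
 */
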
*/ 1939 return OP_COMPLETE_PENDING_DESTRUCT; 1940 } else if (nvme_ctrlr->pending_failover) { 1941 nvme_ctrlr->pending_failover = false; 1942 nvme_ctrlr->reset_start_tsc = 0; 1943 return OP_FAILOVER; 1944 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1945 nvme_ctrlr->reset_start_tsc = 0; 1946 return OP_NONE; 1947 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1948 return OP_DESTRUCT; 1949 } else { 1950 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1951 nvme_ctrlr->fast_io_fail_timedout = true; 1952 } 1953 return OP_DELAYED_RECONNECT; 1954 } 1955 } 1956 1957 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1958 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1959 1960 static int 1961 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1962 { 1963 struct nvme_ctrlr *nvme_ctrlr = ctx; 1964 1965 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1966 pthread_mutex_lock(&nvme_ctrlr->mutex); 1967 1968 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1969 1970 if (!nvme_ctrlr->reconnect_is_delayed) { 1971 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1972 return SPDK_POLLER_BUSY; 1973 } 1974 1975 nvme_ctrlr->reconnect_is_delayed = false; 1976 1977 if (nvme_ctrlr->destruct) { 1978 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1979 return SPDK_POLLER_BUSY; 1980 } 1981 1982 assert(nvme_ctrlr->resetting == false); 1983 nvme_ctrlr->resetting = true; 1984 1985 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1986 1987 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1988 1989 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1990 return SPDK_POLLER_BUSY; 1991 } 1992 1993 static void 1994 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1995 { 1996 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1997 1998 assert(nvme_ctrlr->reconnect_is_delayed == false); 1999 nvme_ctrlr->reconnect_is_delayed = true; 2000 2001 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2002 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2003 nvme_ctrlr, 2004 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2005 } 2006 2007 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2008 2009 static void 2010 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2011 { 2012 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2013 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2014 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2015 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2016 enum bdev_nvme_op_after_reset op_after_reset; 2017 2018 assert(nvme_ctrlr->thread == spdk_get_thread()); 2019 2020 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2021 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2022 2023 if (!success) { 2024 SPDK_ERRLOG("Resetting controller failed.\n"); 2025 } else { 2026 SPDK_NOTICELOG("Resetting controller successful.\n"); 2027 } 2028 2029 pthread_mutex_lock(&nvme_ctrlr->mutex); 2030 nvme_ctrlr->resetting = false; 2031 nvme_ctrlr->dont_retry = false; 2032 nvme_ctrlr->in_failover = false; 2033 2034 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2035 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2036 2037 if (ctrlr_op_cb_fn) { 2038 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2039 } 2040 2041 switch (op_after_reset) { 2042 case OP_COMPLETE_PENDING_DESTRUCT: 2043 nvme_ctrlr_unregister(nvme_ctrlr); 2044 break; 2045 case OP_DESTRUCT: 2046 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2047 remove_discovery_entry(nvme_ctrlr); 2048 break; 2049 case OP_DELAYED_RECONNECT: 2050 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2051 break; 2052 case OP_FAILOVER: 2053 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 2054 break; 2055 default: 2056 break; 2057 } 2058 } 2059 2060 static void 2061 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2062 { 2063 pthread_mutex_lock(&nvme_ctrlr->mutex); 2064 if (!success) { 2065 /* Connecting the active trid failed. Set the next alternate trid to the 2066 * active trid if it exists. 2067 */ 2068 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2069 /* The next alternate trid exists and is ready to try. Try it now. */ 2070 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2071 2072 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2073 return; 2074 } 2075 2076 /* We came here if there is no alternate trid or if the next trid exists but 2077 * is not ready to try. We will try the active trid after reconnect_delay_sec 2078 * seconds if it is non-zero or at the next reset call otherwise. 2079 */ 2080 } else { 2081 /* Connecting the active trid succeeded. Clear the last failed time because it 2082 * means the trid is failed if its last failed time is non-zero. 2083 */ 2084 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2085 } 2086 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2087 2088 /* Make sure we clear any pending resets before returning. */ 2089 spdk_for_each_channel(nvme_ctrlr, 2090 bdev_nvme_complete_pending_resets, 2091 success ? NULL : (void *)0x1, 2092 _bdev_nvme_reset_ctrlr_complete); 2093 } 2094 2095 static void 2096 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2097 { 2098 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2099 2100 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2101 } 2102 2103 static void 2104 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2105 { 2106 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2107 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2108 struct nvme_qpair *nvme_qpair; 2109 2110 nvme_qpair = ctrlr_ch->qpair; 2111 assert(nvme_qpair != NULL); 2112 2113 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2114 2115 if (nvme_qpair->qpair != NULL) { 2116 if (nvme_qpair->ctrlr->dont_retry) { 2117 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2118 } 2119 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2120 2121 /* The current full reset sequence will move to the next 2122 * ctrlr_channel after the qpair is actually disconnected. 2123 */ 2124 assert(ctrlr_ch->reset_iter == NULL); 2125 ctrlr_ch->reset_iter = i; 2126 } else { 2127 spdk_for_each_channel_continue(i, 0); 2128 } 2129 } 2130 2131 static void 2132 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2133 { 2134 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2135 2136 if (status == 0) { 2137 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2138 } else { 2139 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
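 * The teardown reuses bdev_nvme_reset_destroy_qpair(), and bdev_nvme_reset_create_qpairs_failed()
 * then completes the reset sequence with failure.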
 */
2140 		spdk_for_each_channel(nvme_ctrlr,
2141 				      bdev_nvme_reset_destroy_qpair,
2142 				      NULL,
2143 				      bdev_nvme_reset_create_qpairs_failed);
2144 	}
2145 }
2146
2147 static int
2148 bdev_nvme_reset_check_qpair_connected(void *ctx)
2149 {
2150 	struct nvme_ctrlr_channel *ctrlr_ch = ctx;
2151
2152 	if (ctrlr_ch->reset_iter == NULL) {
2153 		/* The qpair already failed to connect and the reset sequence is being aborted. */
2154 		assert(ctrlr_ch->connect_poller == NULL);
2155 		assert(ctrlr_ch->qpair->qpair == NULL);
2156 		return SPDK_POLLER_BUSY;
2157 	}
2158
2159 	assert(ctrlr_ch->qpair->qpair != NULL);
2160
2161 	if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) {
2162 		return SPDK_POLLER_BUSY;
2163 	}
2164
2165 	spdk_poller_unregister(&ctrlr_ch->connect_poller);
2166
2167 	/* The qpair finished connecting. Move to the next ctrlr_channel. */
2168 	spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
2169 	ctrlr_ch->reset_iter = NULL;
2170
2171 	return SPDK_POLLER_BUSY;
2172 }
2173
2174 static void
2175 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
2176 {
2177 	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2178 	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
2179 	int rc;
2180
2181 	rc = bdev_nvme_create_qpair(ctrlr_ch->qpair);
2182 	if (rc == 0) {
2183 		ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
2184 								ctrlr_ch, 0);
2185
2186 		/* The current full reset sequence will move to the next
2187 		 * ctrlr_channel after the qpair is actually connected.
2188 		 */
2189 		assert(ctrlr_ch->reset_iter == NULL);
2190 		ctrlr_ch->reset_iter = i;
2191 	} else {
2192 		spdk_for_each_channel_continue(i, rc);
2193 	}
2194 }
2195
2196 static int
2197 bdev_nvme_reconnect_ctrlr_poll(void *arg)
2198 {
2199 	struct nvme_ctrlr *nvme_ctrlr = arg;
2200 	int rc = -ETIMEDOUT;
2201
2202 	if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2203 		rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
2204 		if (rc == -EAGAIN) {
2205 			return SPDK_POLLER_BUSY;
2206 		}
2207 	}
2208
2209 	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
2210 	if (rc == 0) {
2211 		/* Recreate all of the I/O queue pairs */
2212 		spdk_for_each_channel(nvme_ctrlr,
2213 				      bdev_nvme_reset_create_qpair,
2214 				      NULL,
2215 				      bdev_nvme_reset_create_qpairs_done);
2216 	} else {
2217 		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2218 	}
2219 	return SPDK_POLLER_BUSY;
2220 }
2221
2222 static void
2223 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2224 {
2225 	spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
2226
2227 	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
2228 	assert(nvme_ctrlr->reset_detach_poller == NULL);
2229 	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
2230 							       nvme_ctrlr, 0);
2231 }
2232
2233 static void
2234 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status)
2235 {
2236 	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
2237
2238 	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
2239 	assert(status == 0);
2240
2241 	if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2242 		bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2243 	} else {
2244 		nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2245 	}
2246 }
2247
2248 static void
2249 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2250 {
2251 	spdk_for_each_channel(nvme_ctrlr,
2252 			      bdev_nvme_reset_destroy_qpair,
2253 			      NULL,
2254 			      bdev_nvme_reset_destroy_qpair_done);
2255 } 2256 2257 static void 2258 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2259 { 2260 struct nvme_ctrlr *nvme_ctrlr = ctx; 2261 2262 assert(nvme_ctrlr->resetting == true); 2263 assert(nvme_ctrlr->thread == spdk_get_thread()); 2264 2265 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2266 2267 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2268 2269 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2270 } 2271 2272 static void 2273 _bdev_nvme_reset_ctrlr(void *ctx) 2274 { 2275 struct nvme_ctrlr *nvme_ctrlr = ctx; 2276 2277 assert(nvme_ctrlr->resetting == true); 2278 assert(nvme_ctrlr->thread == spdk_get_thread()); 2279 2280 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2281 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2282 } else { 2283 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2284 } 2285 } 2286 2287 static int 2288 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2289 { 2290 spdk_msg_fn msg_fn; 2291 2292 pthread_mutex_lock(&nvme_ctrlr->mutex); 2293 if (nvme_ctrlr->destruct) { 2294 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2295 return -ENXIO; 2296 } 2297 2298 if (nvme_ctrlr->resetting) { 2299 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2300 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2301 return -EBUSY; 2302 } 2303 2304 if (nvme_ctrlr->disabled) { 2305 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2306 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2307 return -EALREADY; 2308 } 2309 2310 nvme_ctrlr->resetting = true; 2311 nvme_ctrlr->dont_retry = true; 2312 2313 if (nvme_ctrlr->reconnect_is_delayed) { 2314 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2315 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2316 nvme_ctrlr->reconnect_is_delayed = false; 2317 } else { 2318 msg_fn = _bdev_nvme_reset_ctrlr; 2319 assert(nvme_ctrlr->reset_start_tsc == 0); 2320 } 2321 2322 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2323 2324 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2325 2326 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2327 return 0; 2328 } 2329 2330 static int 2331 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2332 { 2333 pthread_mutex_lock(&nvme_ctrlr->mutex); 2334 if (nvme_ctrlr->destruct) { 2335 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2336 return -ENXIO; 2337 } 2338 2339 if (nvme_ctrlr->resetting) { 2340 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2341 return -EBUSY; 2342 } 2343 2344 if (!nvme_ctrlr->disabled) { 2345 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2346 return -EALREADY; 2347 } 2348 2349 nvme_ctrlr->disabled = false; 2350 nvme_ctrlr->resetting = true; 2351 2352 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2353 2354 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2355 2356 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2357 return 0; 2358 } 2359 2360 static void 2361 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2362 { 2363 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2364 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2365 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2366 enum bdev_nvme_op_after_reset op_after_disable; 2367 2368 assert(nvme_ctrlr->thread == spdk_get_thread()); 2369 2370 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2371 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2372 2373 pthread_mutex_lock(&nvme_ctrlr->mutex); 2374 2375 nvme_ctrlr->resetting = false; 2376 nvme_ctrlr->dont_retry = false; 2377 2378 op_after_disable = 
bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2379 2380 nvme_ctrlr->disabled = true; 2381 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2382 2383 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2384 2385 if (ctrlr_op_cb_fn) { 2386 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2387 } 2388 2389 switch (op_after_disable) { 2390 case OP_COMPLETE_PENDING_DESTRUCT: 2391 nvme_ctrlr_unregister(nvme_ctrlr); 2392 break; 2393 default: 2394 break; 2395 } 2396 2397 } 2398 2399 static void 2400 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2401 { 2402 /* Make sure we clear any pending resets before returning. */ 2403 spdk_for_each_channel(nvme_ctrlr, 2404 bdev_nvme_complete_pending_resets, 2405 NULL, 2406 _bdev_nvme_disable_ctrlr_complete); 2407 } 2408 2409 static void 2410 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2411 { 2412 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2413 2414 assert(status == 0); 2415 2416 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2417 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2418 } else { 2419 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2420 } 2421 } 2422 2423 static void 2424 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2425 { 2426 spdk_for_each_channel(nvme_ctrlr, 2427 bdev_nvme_reset_destroy_qpair, 2428 NULL, 2429 bdev_nvme_disable_destroy_qpairs_done); 2430 } 2431 2432 static void 2433 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2434 { 2435 struct nvme_ctrlr *nvme_ctrlr = ctx; 2436 2437 assert(nvme_ctrlr->resetting == true); 2438 assert(nvme_ctrlr->thread == spdk_get_thread()); 2439 2440 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2441 2442 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2443 } 2444 2445 static void 2446 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2447 { 2448 struct nvme_ctrlr *nvme_ctrlr = ctx; 2449 2450 assert(nvme_ctrlr->resetting == true); 2451 assert(nvme_ctrlr->thread == spdk_get_thread()); 2452 2453 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2454 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2455 } else { 2456 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2457 } 2458 } 2459 2460 static int 2461 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2462 { 2463 spdk_msg_fn msg_fn; 2464 2465 pthread_mutex_lock(&nvme_ctrlr->mutex); 2466 if (nvme_ctrlr->destruct) { 2467 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2468 return -ENXIO; 2469 } 2470 2471 if (nvme_ctrlr->resetting) { 2472 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2473 return -EBUSY; 2474 } 2475 2476 if (nvme_ctrlr->disabled) { 2477 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2478 return -EALREADY; 2479 } 2480 2481 nvme_ctrlr->resetting = true; 2482 nvme_ctrlr->dont_retry = true; 2483 2484 if (nvme_ctrlr->reconnect_is_delayed) { 2485 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2486 nvme_ctrlr->reconnect_is_delayed = false; 2487 } else { 2488 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2489 } 2490 2491 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2492 2493 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2494 2495 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2496 return 0; 2497 } 2498 2499 static int 2500 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2501 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2502 { 2503 int rc; 2504 2505 switch (op) { 2506 case NVME_CTRLR_OP_RESET: 2507 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2508 break; 2509 case 
NVME_CTRLR_OP_ENABLE: 2510 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2511 break; 2512 case NVME_CTRLR_OP_DISABLE: 2513 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2514 break; 2515 default: 2516 rc = -EINVAL; 2517 break; 2518 } 2519 2520 if (rc == 0) { 2521 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2522 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2523 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2524 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2525 } 2526 return rc; 2527 } 2528 2529 struct nvme_ctrlr_op_rpc_ctx { 2530 struct nvme_ctrlr *nvme_ctrlr; 2531 struct spdk_thread *orig_thread; 2532 enum nvme_ctrlr_op op; 2533 int rc; 2534 bdev_nvme_ctrlr_op_cb cb_fn; 2535 void *cb_arg; 2536 }; 2537 2538 static void 2539 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2540 { 2541 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2542 2543 assert(ctx != NULL); 2544 assert(ctx->cb_fn != NULL); 2545 2546 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2547 2548 free(ctx); 2549 } 2550 2551 static void 2552 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2553 { 2554 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2555 2556 ctx->rc = rc; 2557 2558 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2559 } 2560 2561 void 2562 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2563 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2564 { 2565 struct nvme_ctrlr_op_rpc_ctx *ctx; 2566 int rc; 2567 2568 assert(cb_fn != NULL); 2569 2570 ctx = calloc(1, sizeof(*ctx)); 2571 if (ctx == NULL) { 2572 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2573 cb_fn(cb_arg, -ENOMEM); 2574 return; 2575 } 2576 2577 ctx->orig_thread = spdk_get_thread(); 2578 ctx->cb_fn = cb_fn; 2579 ctx->cb_arg = cb_arg; 2580 2581 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2582 if (rc == 0) { 2583 return; 2584 } else if (rc == -EALREADY) { 2585 rc = 0; 2586 } 2587 2588 nvme_ctrlr_op_rpc_complete(ctx, rc); 2589 } 2590 2591 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2592 2593 static void 2594 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2595 { 2596 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2597 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2598 int rc; 2599 2600 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2601 ctx->nvme_ctrlr = NULL; 2602 2603 if (ctx->rc != 0) { 2604 goto complete; 2605 } 2606 2607 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2608 if (next_nvme_ctrlr == NULL) { 2609 goto complete; 2610 } 2611 2612 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2613 if (rc == 0) { 2614 ctx->nvme_ctrlr = next_nvme_ctrlr; 2615 return; 2616 } else if (rc == -EALREADY) { 2617 ctx->nvme_ctrlr = next_nvme_ctrlr; 2618 rc = 0; 2619 } 2620 2621 ctx->rc = rc; 2622 2623 complete: 2624 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2625 free(ctx); 2626 } 2627 2628 static void 2629 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2630 { 2631 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2632 2633 ctx->rc = rc; 2634 2635 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2636 } 2637 2638 void 2639 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2640 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2641 { 2642 struct nvme_ctrlr_op_rpc_ctx *ctx; 2643 struct nvme_ctrlr *nvme_ctrlr; 2644 int rc; 2645 2646 assert(cb_fn != NULL); 2647 2648 ctx = calloc(1, sizeof(*ctx)); 2649 if (ctx == NULL) { 2650 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2651 cb_fn(cb_arg, -ENOMEM); 2652 return; 2653 } 2654 2655 
ctx->orig_thread = spdk_get_thread(); 2656 ctx->op = op; 2657 ctx->cb_fn = cb_fn; 2658 ctx->cb_arg = cb_arg; 2659 2660 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2661 assert(nvme_ctrlr != NULL); 2662 2663 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2664 if (rc == 0) { 2665 ctx->nvme_ctrlr = nvme_ctrlr; 2666 return; 2667 } else if (rc == -EALREADY) { 2668 ctx->nvme_ctrlr = nvme_ctrlr; 2669 rc = 0; 2670 } 2671 2672 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2673 } 2674 2675 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2676 2677 static void 2678 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2679 { 2680 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2681 enum spdk_bdev_io_status io_status; 2682 2683 if (bio->cpl.cdw0 == 0) { 2684 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2685 } else { 2686 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2687 } 2688 2689 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2690 } 2691 2692 static void 2693 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2694 { 2695 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2696 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2697 2698 bdev_nvme_abort_retry_ios(nbdev_ch); 2699 2700 spdk_for_each_channel_continue(i, 0); 2701 } 2702 2703 static void 2704 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2705 { 2706 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2707 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2708 2709 /* Abort all queued I/Os for retry. */ 2710 spdk_for_each_channel(nbdev, 2711 bdev_nvme_abort_bdev_channel, 2712 bio, 2713 _bdev_nvme_reset_io_complete); 2714 } 2715 2716 static void 2717 _bdev_nvme_reset_io_continue(void *ctx) 2718 { 2719 struct nvme_bdev_io *bio = ctx; 2720 struct nvme_io_path *prev_io_path, *next_io_path; 2721 int rc; 2722 2723 prev_io_path = bio->io_path; 2724 bio->io_path = NULL; 2725 2726 if (bio->cpl.cdw0 != 0) { 2727 goto complete; 2728 } 2729 2730 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2731 if (next_io_path == NULL) { 2732 goto complete; 2733 } 2734 2735 rc = _bdev_nvme_reset_io(next_io_path, bio); 2736 if (rc == 0) { 2737 return; 2738 } 2739 2740 bio->cpl.cdw0 = 1; 2741 2742 complete: 2743 bdev_nvme_reset_io_complete(bio); 2744 } 2745 2746 static void 2747 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2748 { 2749 struct nvme_bdev_io *bio = cb_arg; 2750 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2751 2752 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2753 2754 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2755 } 2756 2757 static int 2758 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2759 { 2760 struct nvme_ctrlr_channel *ctrlr_ch; 2761 struct spdk_bdev_io *bdev_io; 2762 int rc; 2763 2764 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2765 bdev_nvme_reset_io_continue, bio); 2766 if (rc == 0) { 2767 assert(bio->io_path == NULL); 2768 bio->io_path = io_path; 2769 } else if (rc == -EBUSY) { 2770 ctrlr_ch = io_path->qpair->ctrlr_ch; 2771 assert(ctrlr_ch != NULL); 2772 /* 2773 * Reset call is queued only if it is from the app framework. This is on purpose so that 2774 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2775 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
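 * The queued bdev_io is picked up again by bdev_nvme_complete_pending_resets() once the
 * in-progress reset finishes.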
2776 */ 2777 bdev_io = spdk_bdev_io_from_ctx(bio); 2778 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2779 rc = 0; 2780 } 2781 2782 return rc; 2783 } 2784 2785 static void 2786 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2787 { 2788 struct nvme_io_path *io_path; 2789 int rc; 2790 2791 bio->cpl.cdw0 = 0; 2792 2793 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2794 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2795 assert(io_path != NULL); 2796 2797 rc = _bdev_nvme_reset_io(io_path, bio); 2798 if (rc != 0) { 2799 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2800 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2801 } 2802 } 2803 2804 static int 2805 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2806 { 2807 if (nvme_ctrlr->destruct) { 2808 /* Don't bother resetting if the controller is in the process of being destructed. */ 2809 return -ENXIO; 2810 } 2811 2812 if (nvme_ctrlr->resetting) { 2813 if (!nvme_ctrlr->in_failover) { 2814 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2815 2816 /* Defer failover until reset completes. */ 2817 nvme_ctrlr->pending_failover = true; 2818 return -EINPROGRESS; 2819 } else { 2820 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2821 return -EBUSY; 2822 } 2823 } 2824 2825 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2826 2827 if (nvme_ctrlr->reconnect_is_delayed) { 2828 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2829 2830 /* We rely on the next reconnect for the failover. */ 2831 return -EALREADY; 2832 } 2833 2834 if (nvme_ctrlr->disabled) { 2835 SPDK_NOTICELOG("Controller is disabled.\n"); 2836 2837 /* We rely on the enablement for the failover. 
*/ 2838 return -EALREADY; 2839 } 2840 2841 nvme_ctrlr->resetting = true; 2842 nvme_ctrlr->in_failover = true; 2843 2844 assert(nvme_ctrlr->reset_start_tsc == 0); 2845 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2846 2847 return 0; 2848 } 2849 2850 static int 2851 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2852 { 2853 int rc; 2854 2855 pthread_mutex_lock(&nvme_ctrlr->mutex); 2856 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, remove); 2857 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2858 2859 if (rc == 0) { 2860 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2861 } else if (rc == -EALREADY) { 2862 rc = 0; 2863 } 2864 2865 return rc; 2866 } 2867 2868 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2869 uint64_t num_blocks); 2870 2871 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2872 uint64_t num_blocks); 2873 2874 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2875 uint64_t src_offset_blocks, 2876 uint64_t num_blocks); 2877 2878 static void 2879 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2880 bool success) 2881 { 2882 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2883 struct spdk_bdev *bdev = bdev_io->bdev; 2884 int ret; 2885 2886 if (!success) { 2887 ret = -EINVAL; 2888 goto exit; 2889 } 2890 2891 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2892 ret = -ENXIO; 2893 goto exit; 2894 } 2895 2896 ret = bdev_nvme_readv(bio, 2897 bdev_io->u.bdev.iovs, 2898 bdev_io->u.bdev.iovcnt, 2899 bdev_io->u.bdev.md_buf, 2900 bdev_io->u.bdev.num_blocks, 2901 bdev_io->u.bdev.offset_blocks, 2902 bdev->dif_check_flags, 2903 bdev_io->u.bdev.memory_domain, 2904 bdev_io->u.bdev.memory_domain_ctx, 2905 bdev_io->u.bdev.accel_sequence); 2906 2907 exit: 2908 if (spdk_unlikely(ret != 0)) { 2909 bdev_nvme_io_complete(bio, ret); 2910 } 2911 } 2912 2913 static inline void 2914 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2915 { 2916 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2917 struct spdk_bdev *bdev = bdev_io->bdev; 2918 struct nvme_bdev_io *nbdev_io_to_abort; 2919 int rc = 0; 2920 2921 switch (bdev_io->type) { 2922 case SPDK_BDEV_IO_TYPE_READ: 2923 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2924 rc = bdev_nvme_readv(nbdev_io, 2925 bdev_io->u.bdev.iovs, 2926 bdev_io->u.bdev.iovcnt, 2927 bdev_io->u.bdev.md_buf, 2928 bdev_io->u.bdev.num_blocks, 2929 bdev_io->u.bdev.offset_blocks, 2930 bdev->dif_check_flags, 2931 bdev_io->u.bdev.memory_domain, 2932 bdev_io->u.bdev.memory_domain_ctx, 2933 bdev_io->u.bdev.accel_sequence); 2934 } else { 2935 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2936 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2937 rc = 0; 2938 } 2939 break; 2940 case SPDK_BDEV_IO_TYPE_WRITE: 2941 rc = bdev_nvme_writev(nbdev_io, 2942 bdev_io->u.bdev.iovs, 2943 bdev_io->u.bdev.iovcnt, 2944 bdev_io->u.bdev.md_buf, 2945 bdev_io->u.bdev.num_blocks, 2946 bdev_io->u.bdev.offset_blocks, 2947 bdev->dif_check_flags, 2948 bdev_io->u.bdev.memory_domain, 2949 bdev_io->u.bdev.memory_domain_ctx, 2950 bdev_io->u.bdev.accel_sequence); 2951 break; 2952 case SPDK_BDEV_IO_TYPE_COMPARE: 2953 rc = bdev_nvme_comparev(nbdev_io, 2954 bdev_io->u.bdev.iovs, 2955 bdev_io->u.bdev.iovcnt, 2956 bdev_io->u.bdev.md_buf, 2957 bdev_io->u.bdev.num_blocks, 2958 bdev_io->u.bdev.offset_blocks, 2959 
bdev->dif_check_flags); 2960 break; 2961 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2962 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2963 bdev_io->u.bdev.iovs, 2964 bdev_io->u.bdev.iovcnt, 2965 bdev_io->u.bdev.fused_iovs, 2966 bdev_io->u.bdev.fused_iovcnt, 2967 bdev_io->u.bdev.md_buf, 2968 bdev_io->u.bdev.num_blocks, 2969 bdev_io->u.bdev.offset_blocks, 2970 bdev->dif_check_flags); 2971 break; 2972 case SPDK_BDEV_IO_TYPE_UNMAP: 2973 rc = bdev_nvme_unmap(nbdev_io, 2974 bdev_io->u.bdev.offset_blocks, 2975 bdev_io->u.bdev.num_blocks); 2976 break; 2977 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2978 rc = bdev_nvme_write_zeroes(nbdev_io, 2979 bdev_io->u.bdev.offset_blocks, 2980 bdev_io->u.bdev.num_blocks); 2981 break; 2982 case SPDK_BDEV_IO_TYPE_RESET: 2983 nbdev_io->io_path = NULL; 2984 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2985 return; 2986 2987 case SPDK_BDEV_IO_TYPE_FLUSH: 2988 bdev_nvme_io_complete(nbdev_io, 0); 2989 return; 2990 2991 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2992 rc = bdev_nvme_zone_appendv(nbdev_io, 2993 bdev_io->u.bdev.iovs, 2994 bdev_io->u.bdev.iovcnt, 2995 bdev_io->u.bdev.md_buf, 2996 bdev_io->u.bdev.num_blocks, 2997 bdev_io->u.bdev.offset_blocks, 2998 bdev->dif_check_flags); 2999 break; 3000 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3001 rc = bdev_nvme_get_zone_info(nbdev_io, 3002 bdev_io->u.zone_mgmt.zone_id, 3003 bdev_io->u.zone_mgmt.num_zones, 3004 bdev_io->u.zone_mgmt.buf); 3005 break; 3006 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3007 rc = bdev_nvme_zone_management(nbdev_io, 3008 bdev_io->u.zone_mgmt.zone_id, 3009 bdev_io->u.zone_mgmt.zone_action); 3010 break; 3011 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3012 nbdev_io->io_path = NULL; 3013 bdev_nvme_admin_passthru(nbdev_ch, 3014 nbdev_io, 3015 &bdev_io->u.nvme_passthru.cmd, 3016 bdev_io->u.nvme_passthru.buf, 3017 bdev_io->u.nvme_passthru.nbytes); 3018 return; 3019 3020 case SPDK_BDEV_IO_TYPE_NVME_IO: 3021 rc = bdev_nvme_io_passthru(nbdev_io, 3022 &bdev_io->u.nvme_passthru.cmd, 3023 bdev_io->u.nvme_passthru.buf, 3024 bdev_io->u.nvme_passthru.nbytes); 3025 break; 3026 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3027 rc = bdev_nvme_io_passthru_md(nbdev_io, 3028 &bdev_io->u.nvme_passthru.cmd, 3029 bdev_io->u.nvme_passthru.buf, 3030 bdev_io->u.nvme_passthru.nbytes, 3031 bdev_io->u.nvme_passthru.md_buf, 3032 bdev_io->u.nvme_passthru.md_len); 3033 break; 3034 case SPDK_BDEV_IO_TYPE_ABORT: 3035 nbdev_io->io_path = NULL; 3036 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3037 bdev_nvme_abort(nbdev_ch, 3038 nbdev_io, 3039 nbdev_io_to_abort); 3040 return; 3041 3042 case SPDK_BDEV_IO_TYPE_COPY: 3043 rc = bdev_nvme_copy(nbdev_io, 3044 bdev_io->u.bdev.offset_blocks, 3045 bdev_io->u.bdev.copy.src_offset_blocks, 3046 bdev_io->u.bdev.num_blocks); 3047 break; 3048 default: 3049 rc = -EINVAL; 3050 break; 3051 } 3052 3053 if (spdk_unlikely(rc != 0)) { 3054 bdev_nvme_io_complete(nbdev_io, rc); 3055 } 3056 } 3057 3058 static void 3059 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3060 { 3061 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3062 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3063 3064 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3065 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3066 } else { 3067 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3068 * We need to update submit_tsc here. 
3069 */ 3070 nbdev_io->submit_tsc = spdk_get_ticks(); 3071 } 3072 3073 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3074 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3075 if (spdk_unlikely(!nbdev_io->io_path)) { 3076 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3077 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3078 return; 3079 } 3080 3081 /* Admin commands do not use the optimal I/O path. 3082 * Simply fall through even if it is not found. 3083 */ 3084 } 3085 3086 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3087 } 3088 3089 static bool 3090 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3091 { 3092 struct nvme_bdev *nbdev = ctx; 3093 struct nvme_ns *nvme_ns; 3094 struct spdk_nvme_ns *ns; 3095 struct spdk_nvme_ctrlr *ctrlr; 3096 const struct spdk_nvme_ctrlr_data *cdata; 3097 3098 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3099 assert(nvme_ns != NULL); 3100 ns = nvme_ns->ns; 3101 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3102 3103 switch (io_type) { 3104 case SPDK_BDEV_IO_TYPE_READ: 3105 case SPDK_BDEV_IO_TYPE_WRITE: 3106 case SPDK_BDEV_IO_TYPE_RESET: 3107 case SPDK_BDEV_IO_TYPE_FLUSH: 3108 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3109 case SPDK_BDEV_IO_TYPE_NVME_IO: 3110 case SPDK_BDEV_IO_TYPE_ABORT: 3111 return true; 3112 3113 case SPDK_BDEV_IO_TYPE_COMPARE: 3114 return spdk_nvme_ns_supports_compare(ns); 3115 3116 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3117 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3118 3119 case SPDK_BDEV_IO_TYPE_UNMAP: 3120 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3121 return cdata->oncs.dsm; 3122 3123 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3124 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3125 return cdata->oncs.write_zeroes; 3126 3127 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3128 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3129 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3130 return true; 3131 } 3132 return false; 3133 3134 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3135 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3136 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3137 3138 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3139 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3140 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3141 3142 case SPDK_BDEV_IO_TYPE_COPY: 3143 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3144 return cdata->oncs.copy; 3145 3146 default: 3147 return false; 3148 } 3149 } 3150 3151 static int 3152 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3153 { 3154 struct nvme_qpair *nvme_qpair; 3155 struct spdk_io_channel *pg_ch; 3156 int rc; 3157 3158 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3159 if (!nvme_qpair) { 3160 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3161 return -1; 3162 } 3163 3164 TAILQ_INIT(&nvme_qpair->io_path_list); 3165 3166 nvme_qpair->ctrlr = nvme_ctrlr; 3167 nvme_qpair->ctrlr_ch = ctrlr_ch; 3168 3169 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3170 if (!pg_ch) { 3171 free(nvme_qpair); 3172 return -1; 3173 } 3174 3175 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3176 3177 #ifdef SPDK_CONFIG_VTUNE 3178 nvme_qpair->group->collect_spin_stat = true; 3179 #else 3180 nvme_qpair->group->collect_spin_stat = false; 3181 #endif 3182 3183 if (!nvme_ctrlr->disabled) { 3184 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3185 * be created when it's enabled. 
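	 * bdev_nvme_enable_ctrlr() reconnects the controller, and the qpairs are then
	 * recreated via bdev_nvme_reset_create_qpair().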
3186 */ 3187 rc = bdev_nvme_create_qpair(nvme_qpair); 3188 if (rc != 0) { 3189 /* nvme_ctrlr can't create IO qpair if connection is down. 3190 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3191 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3192 * submitted IO will be queued until IO qpair is successfully created. 3193 * 3194 * Hence, if both are satisfied, ignore the failure. 3195 */ 3196 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3197 spdk_put_io_channel(pg_ch); 3198 free(nvme_qpair); 3199 return rc; 3200 } 3201 } 3202 } 3203 3204 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3205 3206 ctrlr_ch->qpair = nvme_qpair; 3207 3208 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3209 nvme_qpair->ctrlr->ref++; 3210 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3211 3212 return 0; 3213 } 3214 3215 static int 3216 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3217 { 3218 struct nvme_ctrlr *nvme_ctrlr = io_device; 3219 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3220 3221 TAILQ_INIT(&ctrlr_ch->pending_resets); 3222 3223 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3224 } 3225 3226 static void 3227 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3228 { 3229 struct nvme_io_path *io_path, *next; 3230 3231 assert(nvme_qpair->group != NULL); 3232 3233 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3234 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3235 nvme_io_path_free(io_path); 3236 } 3237 3238 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3239 3240 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3241 3242 nvme_ctrlr_release(nvme_qpair->ctrlr); 3243 3244 free(nvme_qpair); 3245 } 3246 3247 static void 3248 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3249 { 3250 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3251 struct nvme_qpair *nvme_qpair; 3252 3253 nvme_qpair = ctrlr_ch->qpair; 3254 assert(nvme_qpair != NULL); 3255 3256 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3257 3258 if (nvme_qpair->qpair != NULL) { 3259 if (ctrlr_ch->reset_iter == NULL) { 3260 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3261 } else { 3262 /* Skip current ctrlr_channel in a full reset sequence because 3263 * it is being deleted now. The qpair is already being disconnected. 3264 * We do not have to restart disconnecting it. 3265 */ 3266 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3267 } 3268 3269 /* We cannot release a reference to the poll group now. 3270 * The qpair may be disconnected asynchronously later. 3271 * We need to poll it until it is actually disconnected. 3272 * Just detach the qpair from the deleting ctrlr_channel. 
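	 * nvme_qpair_delete() is then expected to run later, once the disconnect
	 * actually completes.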
3273 */ 3274 nvme_qpair->ctrlr_ch = NULL; 3275 } else { 3276 assert(ctrlr_ch->reset_iter == NULL); 3277 3278 nvme_qpair_delete(nvme_qpair); 3279 } 3280 } 3281 3282 static inline struct spdk_io_channel * 3283 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3284 { 3285 if (spdk_unlikely(!group->accel_channel)) { 3286 group->accel_channel = spdk_accel_get_io_channel(); 3287 if (!group->accel_channel) { 3288 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3289 group); 3290 return NULL; 3291 } 3292 } 3293 3294 return group->accel_channel; 3295 } 3296 3297 static void 3298 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3299 uint32_t iov_cnt, uint32_t seed, 3300 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3301 { 3302 struct spdk_io_channel *accel_ch; 3303 struct nvme_poll_group *group = ctx; 3304 int rc; 3305 3306 assert(cb_fn != NULL); 3307 3308 accel_ch = bdev_nvme_get_accel_channel(group); 3309 if (spdk_unlikely(accel_ch == NULL)) { 3310 cb_fn(cb_arg, -ENOMEM); 3311 return; 3312 } 3313 3314 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3315 if (rc) { 3316 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3317 if (rc == -ENOMEM || rc == -EINVAL) { 3318 cb_fn(cb_arg, rc); 3319 } 3320 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3321 } 3322 } 3323 3324 static void 3325 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3326 { 3327 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3328 } 3329 3330 static void 3331 bdev_nvme_abort_sequence(void *seq) 3332 { 3333 spdk_accel_sequence_abort(seq); 3334 } 3335 3336 static void 3337 bdev_nvme_reverse_sequence(void *seq) 3338 { 3339 spdk_accel_sequence_reverse(seq); 3340 } 3341 3342 static int 3343 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3344 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3345 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3346 { 3347 struct spdk_io_channel *ch; 3348 struct nvme_poll_group *group = ctx; 3349 3350 ch = bdev_nvme_get_accel_channel(group); 3351 if (spdk_unlikely(ch == NULL)) { 3352 return -ENOMEM; 3353 } 3354 3355 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3356 domain, domain_ctx, seed, cb_fn, cb_arg); 3357 } 3358 3359 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3360 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3361 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3362 .append_crc32c = bdev_nvme_append_crc32c, 3363 .finish_sequence = bdev_nvme_finish_sequence, 3364 .reverse_sequence = bdev_nvme_reverse_sequence, 3365 .abort_sequence = bdev_nvme_abort_sequence, 3366 }; 3367 3368 static int 3369 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3370 { 3371 struct nvme_poll_group *group = ctx_buf; 3372 3373 TAILQ_INIT(&group->qpair_list); 3374 3375 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3376 if (group->group == NULL) { 3377 return -1; 3378 } 3379 3380 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3381 3382 if (group->poller == NULL) { 3383 spdk_nvme_poll_group_destroy(group->group); 3384 return -1; 3385 } 3386 3387 return 0; 3388 } 3389 3390 static void 3391 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3392 { 3393 struct 
nvme_poll_group *group = ctx_buf; 3394 3395 assert(TAILQ_EMPTY(&group->qpair_list)); 3396 3397 if (group->accel_channel) { 3398 spdk_put_io_channel(group->accel_channel); 3399 } 3400 3401 spdk_poller_unregister(&group->poller); 3402 if (spdk_nvme_poll_group_destroy(group->group)) { 3403 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3404 assert(false); 3405 } 3406 } 3407 3408 static struct spdk_io_channel * 3409 bdev_nvme_get_io_channel(void *ctx) 3410 { 3411 struct nvme_bdev *nvme_bdev = ctx; 3412 3413 return spdk_get_io_channel(nvme_bdev); 3414 } 3415 3416 static void * 3417 bdev_nvme_get_module_ctx(void *ctx) 3418 { 3419 struct nvme_bdev *nvme_bdev = ctx; 3420 struct nvme_ns *nvme_ns; 3421 3422 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3423 return NULL; 3424 } 3425 3426 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3427 if (!nvme_ns) { 3428 return NULL; 3429 } 3430 3431 return nvme_ns->ns; 3432 } 3433 3434 static const char * 3435 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3436 { 3437 switch (ana_state) { 3438 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3439 return "optimized"; 3440 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3441 return "non_optimized"; 3442 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3443 return "inaccessible"; 3444 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3445 return "persistent_loss"; 3446 case SPDK_NVME_ANA_CHANGE_STATE: 3447 return "change"; 3448 default: 3449 return NULL; 3450 } 3451 } 3452 3453 static int 3454 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3455 { 3456 struct spdk_memory_domain **_domains = NULL; 3457 struct nvme_bdev *nbdev = ctx; 3458 struct nvme_ns *nvme_ns; 3459 int i = 0, _array_size = array_size; 3460 int rc = 0; 3461 3462 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3463 if (domains && array_size >= i) { 3464 _domains = &domains[i]; 3465 } else { 3466 _domains = NULL; 3467 } 3468 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3469 if (rc > 0) { 3470 i += rc; 3471 if (_array_size >= rc) { 3472 _array_size -= rc; 3473 } else { 3474 _array_size = 0; 3475 } 3476 } else if (rc < 0) { 3477 return rc; 3478 } 3479 } 3480 3481 return i; 3482 } 3483 3484 static const char * 3485 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3486 { 3487 if (nvme_ctrlr->destruct) { 3488 return "deleting"; 3489 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3490 return "failed"; 3491 } else if (nvme_ctrlr->resetting) { 3492 return "resetting"; 3493 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3494 return "reconnect_is_delayed"; 3495 } else if (nvme_ctrlr->disabled) { 3496 return "disabled"; 3497 } else { 3498 return "enabled"; 3499 } 3500 } 3501 3502 void 3503 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3504 { 3505 struct spdk_nvme_transport_id *trid; 3506 const struct spdk_nvme_ctrlr_opts *opts; 3507 const struct spdk_nvme_ctrlr_data *cdata; 3508 struct nvme_path_id *path_id; 3509 3510 spdk_json_write_object_begin(w); 3511 3512 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3513 3514 #ifdef SPDK_CONFIG_NVME_CUSE 3515 size_t cuse_name_size = 128; 3516 char cuse_name[cuse_name_size]; 3517 3518 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3519 if (rc == 0) { 3520 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3521 } 3522 #endif 3523 trid = &nvme_ctrlr->active_path_id->trid; 3524 
spdk_json_write_named_object_begin(w, "trid"); 3525 nvme_bdev_dump_trid_json(trid, w); 3526 spdk_json_write_object_end(w); 3527 3528 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3529 if (path_id != NULL) { 3530 spdk_json_write_named_array_begin(w, "alternate_trids"); 3531 do { 3532 trid = &path_id->trid; 3533 spdk_json_write_object_begin(w); 3534 nvme_bdev_dump_trid_json(trid, w); 3535 spdk_json_write_object_end(w); 3536 3537 path_id = TAILQ_NEXT(path_id, link); 3538 } while (path_id != NULL); 3539 spdk_json_write_array_end(w); 3540 } 3541 3542 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3543 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3544 3545 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3546 spdk_json_write_named_object_begin(w, "host"); 3547 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3548 spdk_json_write_named_string(w, "addr", opts->src_addr); 3549 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3550 spdk_json_write_object_end(w); 3551 3552 spdk_json_write_object_end(w); 3553 } 3554 3555 static void 3556 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3557 struct nvme_ns *nvme_ns) 3558 { 3559 struct spdk_nvme_ns *ns; 3560 struct spdk_nvme_ctrlr *ctrlr; 3561 const struct spdk_nvme_ctrlr_data *cdata; 3562 const struct spdk_nvme_transport_id *trid; 3563 union spdk_nvme_vs_register vs; 3564 const struct spdk_nvme_ns_data *nsdata; 3565 char buf[128]; 3566 3567 ns = nvme_ns->ns; 3568 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3569 3570 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3571 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3572 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3573 3574 spdk_json_write_object_begin(w); 3575 3576 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3577 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3578 } 3579 3580 spdk_json_write_named_object_begin(w, "trid"); 3581 3582 nvme_bdev_dump_trid_json(trid, w); 3583 3584 spdk_json_write_object_end(w); 3585 3586 #ifdef SPDK_CONFIG_NVME_CUSE 3587 size_t cuse_name_size = 128; 3588 char cuse_name[cuse_name_size]; 3589 3590 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3591 cuse_name, &cuse_name_size); 3592 if (rc == 0) { 3593 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3594 } 3595 #endif 3596 3597 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3598 3599 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3600 3601 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3602 3603 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3604 spdk_str_trim(buf); 3605 spdk_json_write_named_string(w, "model_number", buf); 3606 3607 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3608 spdk_str_trim(buf); 3609 spdk_json_write_named_string(w, "serial_number", buf); 3610 3611 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3612 spdk_str_trim(buf); 3613 spdk_json_write_named_string(w, "firmware_revision", buf); 3614 3615 if (cdata->subnqn[0] != '\0') { 3616 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3617 } 3618 3619 spdk_json_write_named_object_begin(w, "oacs"); 3620 3621 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3622 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3623 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3624 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3625 3626 spdk_json_write_object_end(w); 3627 3628 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
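	/* When the controller supports ANA reporting, the per-namespace ANA state is
	 * written below in the "ns_data" object.
	 */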
3629 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3630 3631 spdk_json_write_object_end(w); 3632 3633 spdk_json_write_named_object_begin(w, "vs"); 3634 3635 spdk_json_write_name(w, "nvme_version"); 3636 if (vs.bits.ter) { 3637 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3638 } else { 3639 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3640 } 3641 3642 spdk_json_write_object_end(w); 3643 3644 nsdata = spdk_nvme_ns_get_data(ns); 3645 3646 spdk_json_write_named_object_begin(w, "ns_data"); 3647 3648 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3649 3650 if (cdata->cmic.ana_reporting) { 3651 spdk_json_write_named_string(w, "ana_state", 3652 _nvme_ana_state_str(nvme_ns->ana_state)); 3653 } 3654 3655 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3656 3657 spdk_json_write_object_end(w); 3658 3659 if (cdata->oacs.security) { 3660 spdk_json_write_named_object_begin(w, "security"); 3661 3662 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3663 3664 spdk_json_write_object_end(w); 3665 } 3666 3667 spdk_json_write_object_end(w); 3668 } 3669 3670 static const char * 3671 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3672 { 3673 switch (nbdev->mp_policy) { 3674 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3675 return "active_passive"; 3676 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3677 return "active_active"; 3678 default: 3679 assert(false); 3680 return "invalid"; 3681 } 3682 } 3683 3684 static int 3685 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3686 { 3687 struct nvme_bdev *nvme_bdev = ctx; 3688 struct nvme_ns *nvme_ns; 3689 3690 pthread_mutex_lock(&nvme_bdev->mutex); 3691 spdk_json_write_named_array_begin(w, "nvme"); 3692 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3693 nvme_namespace_info_json(w, nvme_ns); 3694 } 3695 spdk_json_write_array_end(w); 3696 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3697 pthread_mutex_unlock(&nvme_bdev->mutex); 3698 3699 return 0; 3700 } 3701 3702 static void 3703 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3704 { 3705 /* No config per bdev needed */ 3706 } 3707 3708 static uint64_t 3709 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3710 { 3711 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3712 struct nvme_io_path *io_path; 3713 struct nvme_poll_group *group; 3714 uint64_t spin_time = 0; 3715 3716 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3717 group = io_path->qpair->group; 3718 3719 if (!group || !group->collect_spin_stat) { 3720 continue; 3721 } 3722 3723 if (group->end_ticks != 0) { 3724 group->spin_ticks += (group->end_ticks - group->start_ticks); 3725 group->end_ticks = 0; 3726 } 3727 3728 spin_time += group->spin_ticks; 3729 group->start_ticks = 0; 3730 group->spin_ticks = 0; 3731 } 3732 3733 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3734 } 3735 3736 static void 3737 bdev_nvme_reset_device_stat(void *ctx) 3738 { 3739 struct nvme_bdev *nbdev = ctx; 3740 3741 if (nbdev->err_stat != NULL) { 3742 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3743 } 3744 } 3745 3746 /* JSON string should be lowercases and underscore delimited string. 
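 * e.g. a status string such as "INVALID OPCODE" becomes the JSON key "invalid_opcode".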
*/ 3747 static void 3748 bdev_nvme_format_nvme_status(char *dst, const char *src) 3749 { 3750 char tmp[256]; 3751 3752 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3753 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3754 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3755 spdk_strlwr(dst); 3756 } 3757 3758 static void 3759 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3760 { 3761 struct nvme_bdev *nbdev = ctx; 3762 struct spdk_nvme_status status = {}; 3763 uint16_t sct, sc; 3764 char status_json[256]; 3765 const char *status_str; 3766 3767 if (nbdev->err_stat == NULL) { 3768 return; 3769 } 3770 3771 spdk_json_write_named_object_begin(w, "nvme_error"); 3772 3773 spdk_json_write_named_object_begin(w, "status_type"); 3774 for (sct = 0; sct < 8; sct++) { 3775 if (nbdev->err_stat->status_type[sct] == 0) { 3776 continue; 3777 } 3778 status.sct = sct; 3779 3780 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3781 assert(status_str != NULL); 3782 bdev_nvme_format_nvme_status(status_json, status_str); 3783 3784 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3785 } 3786 spdk_json_write_object_end(w); 3787 3788 spdk_json_write_named_object_begin(w, "status_code"); 3789 for (sct = 0; sct < 4; sct++) { 3790 status.sct = sct; 3791 for (sc = 0; sc < 256; sc++) { 3792 if (nbdev->err_stat->status[sct][sc] == 0) { 3793 continue; 3794 } 3795 status.sc = sc; 3796 3797 status_str = spdk_nvme_cpl_get_status_string(&status); 3798 assert(status_str != NULL); 3799 bdev_nvme_format_nvme_status(status_json, status_str); 3800 3801 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3802 } 3803 } 3804 spdk_json_write_object_end(w); 3805 3806 spdk_json_write_object_end(w); 3807 } 3808 3809 static bool 3810 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3811 { 3812 struct nvme_bdev *nbdev = ctx; 3813 struct spdk_nvme_ctrlr *ctrlr; 3814 3815 if (!g_opts.allow_accel_sequence) { 3816 return false; 3817 } 3818 3819 switch (type) { 3820 case SPDK_BDEV_IO_TYPE_WRITE: 3821 case SPDK_BDEV_IO_TYPE_READ: 3822 break; 3823 default: 3824 return false; 3825 } 3826 3827 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3828 assert(ctrlr != NULL); 3829 3830 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3831 } 3832 3833 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3834 .destruct = bdev_nvme_destruct, 3835 .submit_request = bdev_nvme_submit_request, 3836 .io_type_supported = bdev_nvme_io_type_supported, 3837 .get_io_channel = bdev_nvme_get_io_channel, 3838 .dump_info_json = bdev_nvme_dump_info_json, 3839 .write_config_json = bdev_nvme_write_config_json, 3840 .get_spin_time = bdev_nvme_get_spin_time, 3841 .get_module_ctx = bdev_nvme_get_module_ctx, 3842 .get_memory_domains = bdev_nvme_get_memory_domains, 3843 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3844 .reset_device_stat = bdev_nvme_reset_device_stat, 3845 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3846 }; 3847 3848 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3849 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3850 3851 static int 3852 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3853 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3854 { 3855 struct spdk_nvme_ana_group_descriptor *copied_desc; 3856 uint8_t *orig_desc; 3857 uint32_t i, desc_size, copy_len; 3858 int rc = 0; 3859 3860 if (nvme_ctrlr->ana_log_page == NULL) { 3861 return 
-EINVAL; 3862 } 3863 3864 copied_desc = nvme_ctrlr->copied_ana_desc; 3865 3866 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3867 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3868 3869 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3870 memcpy(copied_desc, orig_desc, copy_len); 3871 3872 rc = cb_fn(copied_desc, cb_arg); 3873 if (rc != 0) { 3874 break; 3875 } 3876 3877 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3878 copied_desc->num_of_nsid * sizeof(uint32_t); 3879 orig_desc += desc_size; 3880 copy_len -= desc_size; 3881 } 3882 3883 return rc; 3884 } 3885 3886 static int 3887 nvme_ns_ana_transition_timedout(void *ctx) 3888 { 3889 struct nvme_ns *nvme_ns = ctx; 3890 3891 spdk_poller_unregister(&nvme_ns->anatt_timer); 3892 nvme_ns->ana_transition_timedout = true; 3893 3894 return SPDK_POLLER_BUSY; 3895 } 3896 3897 static void 3898 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3899 const struct spdk_nvme_ana_group_descriptor *desc) 3900 { 3901 const struct spdk_nvme_ctrlr_data *cdata; 3902 3903 nvme_ns->ana_group_id = desc->ana_group_id; 3904 nvme_ns->ana_state = desc->ana_state; 3905 nvme_ns->ana_state_updating = false; 3906 3907 switch (nvme_ns->ana_state) { 3908 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3909 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3910 nvme_ns->ana_transition_timedout = false; 3911 spdk_poller_unregister(&nvme_ns->anatt_timer); 3912 break; 3913 3914 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3915 case SPDK_NVME_ANA_CHANGE_STATE: 3916 if (nvme_ns->anatt_timer != NULL) { 3917 break; 3918 } 3919 3920 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3921 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3922 nvme_ns, 3923 cdata->anatt * SPDK_SEC_TO_USEC); 3924 break; 3925 default: 3926 break; 3927 } 3928 } 3929 3930 static int 3931 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3932 { 3933 struct nvme_ns *nvme_ns = cb_arg; 3934 uint32_t i; 3935 3936 for (i = 0; i < desc->num_of_nsid; i++) { 3937 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3938 continue; 3939 } 3940 3941 _nvme_ns_set_ana_state(nvme_ns, desc); 3942 return 1; 3943 } 3944 3945 return 0; 3946 } 3947 3948 static struct spdk_uuid 3949 nvme_generate_uuid(const char *sn, uint32_t nsid) 3950 { 3951 struct spdk_uuid new_uuid, namespace_uuid; 3952 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3953 /* This namespace UUID was generated using uuid_generate() method. 
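 * The string below serves as the namespace UUID for spdk_uuid_generate_sha1(), so the bdev
 * UUID produced here is a SHA-1 based (RFC 4122 version 5 style) UUID derived from the
 * controller serial number concatenated with the NSID. Illustrative sketch with hypothetical
 * values: sn "S3W8NX0M500000" and nsid 1 merge into "S3W8NX0M5000001", which is hashed under
 * that namespace to give a stable, reproducible UUID for the namespace.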
*/ 3954 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3955 int size; 3956 3957 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3958 3959 spdk_uuid_set_null(&new_uuid); 3960 spdk_uuid_set_null(&namespace_uuid); 3961 3962 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3963 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3964 3965 spdk_uuid_parse(&namespace_uuid, namespace_str); 3966 3967 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3968 3969 return new_uuid; 3970 } 3971 3972 static int 3973 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3974 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3975 uint32_t prchk_flags, void *ctx) 3976 { 3977 const struct spdk_uuid *uuid; 3978 const uint8_t *nguid; 3979 const struct spdk_nvme_ctrlr_data *cdata; 3980 const struct spdk_nvme_ns_data *nsdata; 3981 const struct spdk_nvme_ctrlr_opts *opts; 3982 enum spdk_nvme_csi csi; 3983 uint32_t atomic_bs, phys_bs, bs; 3984 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3985 3986 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3987 csi = spdk_nvme_ns_get_csi(ns); 3988 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3989 3990 switch (csi) { 3991 case SPDK_NVME_CSI_NVM: 3992 disk->product_name = "NVMe disk"; 3993 break; 3994 case SPDK_NVME_CSI_ZNS: 3995 disk->product_name = "NVMe ZNS disk"; 3996 disk->zoned = true; 3997 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3998 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3999 spdk_nvme_ns_get_extended_sector_size(ns); 4000 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4001 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4002 break; 4003 default: 4004 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4005 return -ENOTSUP; 4006 } 4007 4008 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4009 if (!disk->name) { 4010 return -ENOMEM; 4011 } 4012 4013 disk->write_cache = 0; 4014 if (cdata->vwc.present) { 4015 /* Enable if the Volatile Write Cache exists */ 4016 disk->write_cache = 1; 4017 } 4018 if (cdata->oncs.write_zeroes) { 4019 disk->max_write_zeroes = UINT16_MAX + 1; 4020 } 4021 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4022 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4023 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4024 /* The NVMe driver will split one request into multiple requests 4025 * based on MDTS and the stripe boundary. The bdev layer will use 4026 * max_segment_size and max_num_segments to split one big I/O 4027 * into multiple requests up front, so that a small request cannot 4028 * run out of the NVMe driver's internal request data structures.
4029 */ 4030 if (opts && opts->io_queue_requests) { 4031 disk->max_num_segments = opts->io_queue_requests / 2; 4032 } 4033 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4034 4035 nguid = spdk_nvme_ns_get_nguid(ns); 4036 if (!nguid) { 4037 uuid = spdk_nvme_ns_get_uuid(ns); 4038 if (uuid) { 4039 disk->uuid = *uuid; 4040 } else if (g_opts.generate_uuids) { 4041 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4042 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4043 } 4044 } else { 4045 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4046 } 4047 4048 nsdata = spdk_nvme_ns_get_data(ns); 4049 bs = spdk_nvme_ns_get_sector_size(ns); 4050 atomic_bs = bs; 4051 phys_bs = bs; 4052 if (nsdata->nabo == 0) { 4053 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4054 atomic_bs = bs * (1 + nsdata->nawupf); 4055 } else { 4056 atomic_bs = bs * (1 + cdata->awupf); 4057 } 4058 } 4059 if (nsdata->nsfeat.optperf) { 4060 phys_bs = bs * (1 + nsdata->npwg); 4061 } 4062 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4063 4064 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4065 if (disk->md_len != 0) { 4066 disk->md_interleave = nsdata->flbas.extended; 4067 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4068 if (disk->dif_type != SPDK_DIF_DISABLE) { 4069 disk->dif_is_head_of_md = nsdata->dps.md_start; 4070 disk->dif_check_flags = prchk_flags; 4071 } 4072 } 4073 4074 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4075 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4076 disk->acwu = 0; 4077 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4078 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4079 } else { 4080 disk->acwu = cdata->acwu + 1; /* 0-based */ 4081 } 4082 4083 if (cdata->oncs.copy) { 4084 /* For now bdev interface allows only single segment copy */ 4085 disk->max_copy = nsdata->mssrl; 4086 } 4087 4088 disk->ctxt = ctx; 4089 disk->fn_table = &nvmelib_fn_table; 4090 disk->module = &nvme_if; 4091 4092 return 0; 4093 } 4094 4095 static struct nvme_bdev * 4096 nvme_bdev_alloc(void) 4097 { 4098 struct nvme_bdev *bdev; 4099 int rc; 4100 4101 bdev = calloc(1, sizeof(*bdev)); 4102 if (!bdev) { 4103 SPDK_ERRLOG("bdev calloc() failed\n"); 4104 return NULL; 4105 } 4106 4107 if (g_opts.nvme_error_stat) { 4108 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4109 if (!bdev->err_stat) { 4110 SPDK_ERRLOG("err_stat calloc() failed\n"); 4111 free(bdev); 4112 return NULL; 4113 } 4114 } 4115 4116 rc = pthread_mutex_init(&bdev->mutex, NULL); 4117 if (rc != 0) { 4118 free(bdev->err_stat); 4119 free(bdev); 4120 return NULL; 4121 } 4122 4123 bdev->ref = 1; 4124 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4125 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4126 bdev->rr_min_io = UINT32_MAX; 4127 TAILQ_INIT(&bdev->nvme_ns_list); 4128 4129 return bdev; 4130 } 4131 4132 static int 4133 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4134 { 4135 struct nvme_bdev *bdev; 4136 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4137 int rc; 4138 4139 bdev = nvme_bdev_alloc(); 4140 if (bdev == NULL) { 4141 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4142 return -ENOMEM; 4143 } 4144 4145 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4146 4147 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4148 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4149 if (rc != 0) { 4150 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4151 nvme_bdev_free(bdev); 4152 return rc; 4153 } 4154 
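/* Register the nvme_bdev as an I/O device: each SPDK thread that later calls
 * spdk_get_io_channel() on it gets its own nvme_bdev_channel, built and torn down
 * by the callbacks passed below. */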
4155 spdk_io_device_register(bdev, 4156 bdev_nvme_create_bdev_channel_cb, 4157 bdev_nvme_destroy_bdev_channel_cb, 4158 sizeof(struct nvme_bdev_channel), 4159 bdev->disk.name); 4160 4161 nvme_ns->bdev = bdev; 4162 bdev->nsid = nvme_ns->id; 4163 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4164 4165 bdev->nbdev_ctrlr = nbdev_ctrlr; 4166 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4167 4168 rc = spdk_bdev_register(&bdev->disk); 4169 if (rc != 0) { 4170 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4171 spdk_io_device_unregister(bdev, NULL); 4172 nvme_ns->bdev = NULL; 4173 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4174 nvme_bdev_free(bdev); 4175 return rc; 4176 } 4177 4178 return 0; 4179 } 4180 4181 static bool 4182 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4183 { 4184 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4185 const struct spdk_uuid *uuid1, *uuid2; 4186 4187 nsdata1 = spdk_nvme_ns_get_data(ns1); 4188 nsdata2 = spdk_nvme_ns_get_data(ns2); 4189 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4190 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4191 4192 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4193 nsdata1->eui64 == nsdata2->eui64 && 4194 ((uuid1 == NULL && uuid2 == NULL) || 4195 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4196 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4197 } 4198 4199 static bool 4200 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4201 struct spdk_nvme_ctrlr_opts *opts) 4202 { 4203 struct nvme_probe_skip_entry *entry; 4204 4205 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4206 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4207 return false; 4208 } 4209 } 4210 4211 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4212 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4213 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4214 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4215 opts->disable_read_ana_log_page = true; 4216 4217 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4218 4219 return true; 4220 } 4221 4222 static void 4223 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4224 { 4225 struct nvme_ctrlr *nvme_ctrlr = ctx; 4226 4227 if (spdk_nvme_cpl_is_error(cpl)) { 4228 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4229 cpl->status.sct); 4230 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4231 } else if (cpl->cdw0 & 0x1) { 4232 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4233 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4234 } 4235 } 4236 4237 static void 4238 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4239 struct spdk_nvme_qpair *qpair, uint16_t cid) 4240 { 4241 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4242 union spdk_nvme_csts_register csts; 4243 int rc; 4244 4245 assert(nvme_ctrlr->ctrlr == ctrlr); 4246 4247 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4248 4249 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4250 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4251 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4252 * completion recursively. 
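 * (For fabrics transports, reading CSTS translates into a Fabrics Property Get command
 * on the admin queue, hence the restriction.)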
4253 */ 4254 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4255 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4256 if (csts.bits.cfs) { 4257 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4258 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4259 return; 4260 } 4261 } 4262 4263 switch (g_opts.action_on_timeout) { 4264 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4265 if (qpair) { 4266 /* Don't send abort to ctrlr when ctrlr is not available. */ 4267 pthread_mutex_lock(&nvme_ctrlr->mutex); 4268 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4269 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4270 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4271 return; 4272 } 4273 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4274 4275 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4276 nvme_abort_cpl, nvme_ctrlr); 4277 if (rc == 0) { 4278 return; 4279 } 4280 4281 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4282 } 4283 4284 /* FALLTHROUGH */ 4285 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4286 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4287 break; 4288 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4289 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4290 break; 4291 default: 4292 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4293 break; 4294 } 4295 } 4296 4297 static struct nvme_ns * 4298 nvme_ns_alloc(void) 4299 { 4300 struct nvme_ns *nvme_ns; 4301 4302 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4303 if (nvme_ns == NULL) { 4304 return NULL; 4305 } 4306 4307 if (g_opts.io_path_stat) { 4308 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4309 if (nvme_ns->stat == NULL) { 4310 free(nvme_ns); 4311 return NULL; 4312 } 4313 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4314 } 4315 4316 return nvme_ns; 4317 } 4318 4319 static void 4320 nvme_ns_free(struct nvme_ns *nvme_ns) 4321 { 4322 free(nvme_ns->stat); 4323 free(nvme_ns); 4324 } 4325 4326 static void 4327 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4328 { 4329 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4330 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4331 4332 if (rc == 0) { 4333 nvme_ns->probe_ctx = NULL; 4334 pthread_mutex_lock(&nvme_ctrlr->mutex); 4335 nvme_ctrlr->ref++; 4336 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4337 } else { 4338 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4339 nvme_ns_free(nvme_ns); 4340 } 4341 4342 if (ctx) { 4343 ctx->populates_in_progress--; 4344 if (ctx->populates_in_progress == 0) { 4345 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4346 } 4347 } 4348 } 4349 4350 static void 4351 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4352 { 4353 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4354 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4355 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4356 int rc; 4357 4358 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4359 if (rc != 0) { 4360 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4361 } 4362 4363 spdk_for_each_channel_continue(i, rc); 4364 } 4365 4366 static void 4367 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4368 { 4369 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4370 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4371 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4372 struct nvme_io_path *io_path; 4373 4374 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4375 if (io_path != NULL) { 4376 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4377 } 4378 4379 spdk_for_each_channel_continue(i, 0); 4380 } 4381 4382 static void 4383 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4384 { 4385 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4386 4387 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4388 } 4389 4390 static void 4391 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4392 { 4393 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4394 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4395 4396 if (status == 0) { 4397 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4398 } else { 4399 /* Delete the added io_paths and fail populating the namespace. */ 4400 spdk_for_each_channel(bdev, 4401 bdev_nvme_delete_io_path, 4402 nvme_ns, 4403 bdev_nvme_add_io_path_failed); 4404 } 4405 } 4406 4407 static int 4408 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4409 { 4410 struct nvme_ns *tmp_ns; 4411 const struct spdk_nvme_ns_data *nsdata; 4412 4413 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4414 if (!nsdata->nmic.can_share) { 4415 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4416 return -EINVAL; 4417 } 4418 4419 pthread_mutex_lock(&bdev->mutex); 4420 4421 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4422 assert(tmp_ns != NULL); 4423 4424 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4425 pthread_mutex_unlock(&bdev->mutex); 4426 SPDK_ERRLOG("Namespaces are not identical.\n"); 4427 return -EINVAL; 4428 } 4429 4430 bdev->ref++; 4431 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4432 nvme_ns->bdev = bdev; 4433 4434 pthread_mutex_unlock(&bdev->mutex); 4435 4436 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
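 * spdk_for_each_channel() visits every existing channel of this bdev on its owning thread;
 * bdev_nvme_add_io_path_done() then either finishes populating the namespace or rolls the
 * newly added io_paths back on failure.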
*/ 4437 spdk_for_each_channel(bdev, 4438 bdev_nvme_add_io_path, 4439 nvme_ns, 4440 bdev_nvme_add_io_path_done); 4441 4442 return 0; 4443 } 4444 4445 static void 4446 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4447 { 4448 struct spdk_nvme_ns *ns; 4449 struct nvme_bdev *bdev; 4450 int rc = 0; 4451 4452 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4453 if (!ns) { 4454 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4455 rc = -EINVAL; 4456 goto done; 4457 } 4458 4459 nvme_ns->ns = ns; 4460 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4461 4462 if (nvme_ctrlr->ana_log_page != NULL) { 4463 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4464 } 4465 4466 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4467 if (bdev == NULL) { 4468 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4469 } else { 4470 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4471 if (rc == 0) { 4472 return; 4473 } 4474 } 4475 done: 4476 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4477 } 4478 4479 static void 4480 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4481 { 4482 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4483 4484 assert(nvme_ctrlr != NULL); 4485 4486 pthread_mutex_lock(&nvme_ctrlr->mutex); 4487 4488 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4489 4490 if (nvme_ns->bdev != NULL) { 4491 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4492 return; 4493 } 4494 4495 nvme_ns_free(nvme_ns); 4496 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4497 4498 nvme_ctrlr_release(nvme_ctrlr); 4499 } 4500 4501 static void 4502 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4503 { 4504 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4505 4506 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4507 } 4508 4509 static void 4510 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4511 { 4512 struct nvme_bdev *bdev; 4513 4514 spdk_poller_unregister(&nvme_ns->anatt_timer); 4515 4516 bdev = nvme_ns->bdev; 4517 if (bdev != NULL) { 4518 pthread_mutex_lock(&bdev->mutex); 4519 4520 assert(bdev->ref > 0); 4521 bdev->ref--; 4522 if (bdev->ref == 0) { 4523 pthread_mutex_unlock(&bdev->mutex); 4524 4525 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4526 } else { 4527 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4528 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4529 * and clear nvme_ns->bdev here. 4530 */ 4531 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4532 nvme_ns->bdev = NULL; 4533 4534 pthread_mutex_unlock(&bdev->mutex); 4535 4536 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4537 * we call depopulate_namespace_done() to avoid use-after-free. 4538 */ 4539 spdk_for_each_channel(bdev, 4540 bdev_nvme_delete_io_path, 4541 nvme_ns, 4542 bdev_nvme_delete_io_path_done); 4543 return; 4544 } 4545 } 4546 4547 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4548 } 4549 4550 static void 4551 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4552 struct nvme_async_probe_ctx *ctx) 4553 { 4554 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4555 struct nvme_ns *nvme_ns, *next; 4556 struct spdk_nvme_ns *ns; 4557 struct nvme_bdev *bdev; 4558 uint32_t nsid; 4559 int rc; 4560 uint64_t num_sectors; 4561 4562 if (ctx) { 4563 /* Initialize this count to 1 to handle the populate functions 4564 * calling nvme_ctrlr_populate_namespace_done() immediately. 
4565 */ 4566 ctx->populates_in_progress = 1; 4567 } 4568 4569 /* First loop over our existing namespaces and see if they have been 4570 * removed. */ 4571 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4572 while (nvme_ns != NULL) { 4573 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4574 4575 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4576 /* NS is still there but attributes may have changed */ 4577 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4578 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4579 bdev = nvme_ns->bdev; 4580 assert(bdev != NULL); 4581 if (bdev->disk.blockcnt != num_sectors) { 4582 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4583 nvme_ns->id, 4584 bdev->disk.name, 4585 bdev->disk.blockcnt, 4586 num_sectors); 4587 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4588 if (rc != 0) { 4589 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4590 bdev->disk.name, rc); 4591 } 4592 } 4593 } else { 4594 /* Namespace was removed */ 4595 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4596 } 4597 4598 nvme_ns = next; 4599 } 4600 4601 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4602 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4603 while (nsid != 0) { 4604 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4605 4606 if (nvme_ns == NULL) { 4607 /* Found a new one */ 4608 nvme_ns = nvme_ns_alloc(); 4609 if (nvme_ns == NULL) { 4610 SPDK_ERRLOG("Failed to allocate namespace\n"); 4611 /* This just fails to attach the namespace. It may work on a future attempt. */ 4612 continue; 4613 } 4614 4615 nvme_ns->id = nsid; 4616 nvme_ns->ctrlr = nvme_ctrlr; 4617 4618 nvme_ns->bdev = NULL; 4619 4620 if (ctx) { 4621 ctx->populates_in_progress++; 4622 } 4623 nvme_ns->probe_ctx = ctx; 4624 4625 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4626 4627 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4628 } 4629 4630 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4631 } 4632 4633 if (ctx) { 4634 /* Decrement this count now that the loop is over to account 4635 * for the one we started with. If the count is then 0, we 4636 * know any populate_namespace functions completed immediately, 4637 * so we'll kick the callback here. 
4638 */ 4639 ctx->populates_in_progress--; 4640 if (ctx->populates_in_progress == 0) { 4641 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4642 } 4643 } 4644 4645 } 4646 4647 static void 4648 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4649 { 4650 struct nvme_ns *nvme_ns, *tmp; 4651 4652 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4653 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4654 } 4655 } 4656 4657 static uint32_t 4658 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4659 { 4660 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4661 const struct spdk_nvme_ctrlr_data *cdata; 4662 uint32_t nsid, ns_count = 0; 4663 4664 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4665 4666 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4667 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4668 ns_count++; 4669 } 4670 4671 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4672 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4673 sizeof(uint32_t); 4674 } 4675 4676 static int 4677 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4678 void *cb_arg) 4679 { 4680 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4681 struct nvme_ns *nvme_ns; 4682 uint32_t i, nsid; 4683 4684 for (i = 0; i < desc->num_of_nsid; i++) { 4685 nsid = desc->nsid[i]; 4686 if (nsid == 0) { 4687 continue; 4688 } 4689 4690 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4691 4692 assert(nvme_ns != NULL); 4693 if (nvme_ns == NULL) { 4694 /* Target told us that an inactive namespace had an ANA change */ 4695 continue; 4696 } 4697 4698 _nvme_ns_set_ana_state(nvme_ns, desc); 4699 } 4700 4701 return 0; 4702 } 4703 4704 static void 4705 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4706 { 4707 struct nvme_ns *nvme_ns; 4708 4709 spdk_free(nvme_ctrlr->ana_log_page); 4710 nvme_ctrlr->ana_log_page = NULL; 4711 4712 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4713 nvme_ns != NULL; 4714 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4715 nvme_ns->ana_state_updating = false; 4716 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4717 } 4718 } 4719 4720 static void 4721 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4722 { 4723 struct nvme_ctrlr *nvme_ctrlr = ctx; 4724 4725 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4726 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4727 nvme_ctrlr); 4728 } else { 4729 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4730 } 4731 4732 pthread_mutex_lock(&nvme_ctrlr->mutex); 4733 4734 assert(nvme_ctrlr->ana_log_page_updating == true); 4735 nvme_ctrlr->ana_log_page_updating = false; 4736 4737 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4738 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4739 4740 nvme_ctrlr_unregister(nvme_ctrlr); 4741 } else { 4742 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4743 4744 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4745 } 4746 } 4747 4748 static int 4749 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4750 { 4751 uint32_t ana_log_page_size; 4752 int rc; 4753 4754 if (nvme_ctrlr->ana_log_page == NULL) { 4755 return -EINVAL; 4756 } 4757 4758 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4759 4760 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4761 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4762 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4763 
return -EINVAL; 4764 } 4765 4766 pthread_mutex_lock(&nvme_ctrlr->mutex); 4767 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4768 nvme_ctrlr->ana_log_page_updating) { 4769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4770 return -EBUSY; 4771 } 4772 4773 nvme_ctrlr->ana_log_page_updating = true; 4774 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4775 4776 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4777 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4778 SPDK_NVME_GLOBAL_NS_TAG, 4779 nvme_ctrlr->ana_log_page, 4780 ana_log_page_size, 0, 4781 nvme_ctrlr_read_ana_log_page_done, 4782 nvme_ctrlr); 4783 if (rc != 0) { 4784 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4785 } 4786 4787 return rc; 4788 } 4789 4790 static void 4791 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4792 { 4793 } 4794 4795 struct bdev_nvme_set_preferred_path_ctx { 4796 struct spdk_bdev_desc *desc; 4797 struct nvme_ns *nvme_ns; 4798 bdev_nvme_set_preferred_path_cb cb_fn; 4799 void *cb_arg; 4800 }; 4801 4802 static void 4803 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4804 { 4805 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4806 4807 assert(ctx != NULL); 4808 assert(ctx->desc != NULL); 4809 assert(ctx->cb_fn != NULL); 4810 4811 spdk_bdev_close(ctx->desc); 4812 4813 ctx->cb_fn(ctx->cb_arg, status); 4814 4815 free(ctx); 4816 } 4817 4818 static void 4819 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4820 { 4821 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4822 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4823 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4824 struct nvme_io_path *io_path, *prev; 4825 4826 prev = NULL; 4827 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4828 if (io_path->nvme_ns == ctx->nvme_ns) { 4829 break; 4830 } 4831 prev = io_path; 4832 } 4833 4834 if (io_path != NULL) { 4835 if (prev != NULL) { 4836 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4837 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4838 } 4839 4840 /* We can set io_path to nbdev_ch->current_io_path directly here. 4841 * However, it needs to be conditional. To simplify the code, 4842 * just clear nbdev_ch->current_io_path and let find_io_path() 4843 * fill it. 4844 * 4845 * Automatic failback may be disabled. Hence even if the io_path is 4846 * already at the head, clear nbdev_ch->current_io_path. 4847 */ 4848 bdev_nvme_clear_current_io_path(nbdev_ch); 4849 } 4850 4851 spdk_for_each_channel_continue(i, 0); 4852 } 4853 4854 static struct nvme_ns * 4855 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4856 { 4857 struct nvme_ns *nvme_ns, *prev; 4858 const struct spdk_nvme_ctrlr_data *cdata; 4859 4860 prev = NULL; 4861 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4862 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4863 4864 if (cdata->cntlid == cntlid) { 4865 break; 4866 } 4867 prev = nvme_ns; 4868 } 4869 4870 if (nvme_ns != NULL && prev != NULL) { 4871 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4872 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4873 } 4874 4875 return nvme_ns; 4876 } 4877 4878 /* This function supports only multipath mode. There is only a single I/O path 4879 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4880 * head of the I/O path list for each NVMe bdev channel. 
4881 * 4882 * NVMe bdev channel may be acquired after completing this function. move the 4883 * matched namespace to the head of the namespace list for the NVMe bdev too. 4884 */ 4885 void 4886 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4887 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4888 { 4889 struct bdev_nvme_set_preferred_path_ctx *ctx; 4890 struct spdk_bdev *bdev; 4891 struct nvme_bdev *nbdev; 4892 int rc = 0; 4893 4894 assert(cb_fn != NULL); 4895 4896 ctx = calloc(1, sizeof(*ctx)); 4897 if (ctx == NULL) { 4898 SPDK_ERRLOG("Failed to alloc context.\n"); 4899 rc = -ENOMEM; 4900 goto err_alloc; 4901 } 4902 4903 ctx->cb_fn = cb_fn; 4904 ctx->cb_arg = cb_arg; 4905 4906 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4907 if (rc != 0) { 4908 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4909 goto err_open; 4910 } 4911 4912 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4913 4914 if (bdev->module != &nvme_if) { 4915 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4916 rc = -ENODEV; 4917 goto err_bdev; 4918 } 4919 4920 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4921 4922 pthread_mutex_lock(&nbdev->mutex); 4923 4924 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4925 if (ctx->nvme_ns == NULL) { 4926 pthread_mutex_unlock(&nbdev->mutex); 4927 4928 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4929 rc = -ENODEV; 4930 goto err_bdev; 4931 } 4932 4933 pthread_mutex_unlock(&nbdev->mutex); 4934 4935 spdk_for_each_channel(nbdev, 4936 _bdev_nvme_set_preferred_path, 4937 ctx, 4938 bdev_nvme_set_preferred_path_done); 4939 return; 4940 4941 err_bdev: 4942 spdk_bdev_close(ctx->desc); 4943 err_open: 4944 free(ctx); 4945 err_alloc: 4946 cb_fn(cb_arg, rc); 4947 } 4948 4949 struct bdev_nvme_set_multipath_policy_ctx { 4950 struct spdk_bdev_desc *desc; 4951 bdev_nvme_set_multipath_policy_cb cb_fn; 4952 void *cb_arg; 4953 }; 4954 4955 static void 4956 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4957 { 4958 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4959 4960 assert(ctx != NULL); 4961 assert(ctx->desc != NULL); 4962 assert(ctx->cb_fn != NULL); 4963 4964 spdk_bdev_close(ctx->desc); 4965 4966 ctx->cb_fn(ctx->cb_arg, status); 4967 4968 free(ctx); 4969 } 4970 4971 static void 4972 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4973 { 4974 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4975 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4976 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4977 4978 nbdev_ch->mp_policy = nbdev->mp_policy; 4979 nbdev_ch->mp_selector = nbdev->mp_selector; 4980 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4981 bdev_nvme_clear_current_io_path(nbdev_ch); 4982 4983 spdk_for_each_channel_continue(i, 0); 4984 } 4985 4986 void 4987 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4988 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4989 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4990 { 4991 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4992 struct spdk_bdev *bdev; 4993 struct nvme_bdev *nbdev; 4994 int rc; 4995 4996 assert(cb_fn != NULL); 4997 4998 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4999 if (rr_min_io == UINT32_MAX) { 5000 rr_min_io = 1; 5001 } else if (rr_min_io == 0) { 5002 rc = -EINVAL; 
5003 goto exit; 5004 } 5005 } else if (rr_min_io != UINT32_MAX) { 5006 rc = -EINVAL; 5007 goto exit; 5008 } 5009 5010 ctx = calloc(1, sizeof(*ctx)); 5011 if (ctx == NULL) { 5012 SPDK_ERRLOG("Failed to alloc context.\n"); 5013 rc = -ENOMEM; 5014 goto exit; 5015 } 5016 5017 ctx->cb_fn = cb_fn; 5018 ctx->cb_arg = cb_arg; 5019 5020 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5021 if (rc != 0) { 5022 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5023 rc = -ENODEV; 5024 goto err_open; 5025 } 5026 5027 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5028 if (bdev->module != &nvme_if) { 5029 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5030 rc = -ENODEV; 5031 goto err_module; 5032 } 5033 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5034 5035 pthread_mutex_lock(&nbdev->mutex); 5036 nbdev->mp_policy = policy; 5037 nbdev->mp_selector = selector; 5038 nbdev->rr_min_io = rr_min_io; 5039 pthread_mutex_unlock(&nbdev->mutex); 5040 5041 spdk_for_each_channel(nbdev, 5042 _bdev_nvme_set_multipath_policy, 5043 ctx, 5044 bdev_nvme_set_multipath_policy_done); 5045 return; 5046 5047 err_module: 5048 spdk_bdev_close(ctx->desc); 5049 err_open: 5050 free(ctx); 5051 exit: 5052 cb_fn(cb_arg, rc); 5053 } 5054 5055 static void 5056 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5057 { 5058 struct nvme_ctrlr *nvme_ctrlr = arg; 5059 union spdk_nvme_async_event_completion event; 5060 5061 if (spdk_nvme_cpl_is_error(cpl)) { 5062 SPDK_WARNLOG("AER request execute failed\n"); 5063 return; 5064 } 5065 5066 event.raw = cpl->cdw0; 5067 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5068 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5069 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5070 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5071 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5072 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5073 } 5074 } 5075 5076 static void 5077 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5078 { 5079 if (ctx->cb_fn) { 5080 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5081 } 5082 5083 ctx->namespaces_populated = true; 5084 if (ctx->probe_done) { 5085 /* The probe was already completed, so we need to free the context 5086 * here. This can happen for cases like OCSSD, where we need to 5087 * send additional commands to the SSD after attach. 
5088 */ 5089 free(ctx); 5090 } 5091 } 5092 5093 static void 5094 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5095 struct nvme_async_probe_ctx *ctx) 5096 { 5097 spdk_io_device_register(nvme_ctrlr, 5098 bdev_nvme_create_ctrlr_channel_cb, 5099 bdev_nvme_destroy_ctrlr_channel_cb, 5100 sizeof(struct nvme_ctrlr_channel), 5101 nvme_ctrlr->nbdev_ctrlr->name); 5102 5103 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5104 } 5105 5106 static void 5107 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5108 { 5109 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5110 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5111 5112 nvme_ctrlr->probe_ctx = NULL; 5113 5114 if (spdk_nvme_cpl_is_error(cpl)) { 5115 nvme_ctrlr_delete(nvme_ctrlr); 5116 5117 if (ctx != NULL) { 5118 ctx->reported_bdevs = 0; 5119 populate_namespaces_cb(ctx, -1); 5120 } 5121 return; 5122 } 5123 5124 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5125 } 5126 5127 static int 5128 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5129 struct nvme_async_probe_ctx *ctx) 5130 { 5131 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5132 const struct spdk_nvme_ctrlr_data *cdata; 5133 uint32_t ana_log_page_size; 5134 5135 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5136 5137 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5138 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5139 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5140 sizeof(uint32_t); 5141 5142 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5143 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5144 if (nvme_ctrlr->ana_log_page == NULL) { 5145 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5146 return -ENXIO; 5147 } 5148 5149 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5150 * Hence copy each descriptor to a temporary area when parsing it. 5151 * 5152 * Allocate a buffer whose size is as large as ANA log page buffer because 5153 * we do not know the size of a descriptor until actually reading it. 5154 */ 5155 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5156 if (nvme_ctrlr->copied_ana_desc == NULL) { 5157 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5158 return -ENOMEM; 5159 } 5160 5161 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5162 5163 nvme_ctrlr->probe_ctx = ctx; 5164 5165 /* Then, set the read size only to include the current active namespaces. */ 5166 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5167 5168 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5169 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5170 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5171 return -EINVAL; 5172 } 5173 5174 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5175 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5176 SPDK_NVME_GLOBAL_NS_TAG, 5177 nvme_ctrlr->ana_log_page, 5178 ana_log_page_size, 0, 5179 nvme_ctrlr_init_ana_log_page_done, 5180 nvme_ctrlr); 5181 } 5182 5183 /* hostnqn and subnqn were already verified before attaching a controller. 5184 * Hence check only the multipath capability and cntlid here. 
*/ 5186 static bool 5187 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5188 { 5189 struct nvme_ctrlr *tmp; 5190 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5191 5192 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5193 5194 if (!cdata->cmic.multi_ctrlr) { 5195 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5196 return false; 5197 } 5198 5199 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5200 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5201 5202 if (!tmp_cdata->cmic.multi_ctrlr) { 5203 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5204 return false; 5205 } 5206 if (cdata->cntlid == tmp_cdata->cntlid) { 5207 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 5208 return false; 5209 } 5210 } 5211 5212 return true; 5213 } 5214 5215 static int 5216 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5217 { 5218 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5219 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5220 int rc = 0; 5221 5222 pthread_mutex_lock(&g_bdev_nvme_mutex); 5223 5224 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5225 if (nbdev_ctrlr != NULL) { 5226 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5227 rc = -EINVAL; 5228 goto exit; 5229 } 5230 } else { 5231 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5232 if (nbdev_ctrlr == NULL) { 5233 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5234 rc = -ENOMEM; 5235 goto exit; 5236 } 5237 nbdev_ctrlr->name = strdup(name); 5238 if (nbdev_ctrlr->name == NULL) { 5239 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5240 free(nbdev_ctrlr); rc = -ENOMEM; 5241 goto exit; 5242 } 5243 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5244 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5245 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5246 } 5247 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5248 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5249 exit: 5250 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5251 return rc; 5252 } 5253 5254 static int 5255 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5256 const char *name, 5257 const struct spdk_nvme_transport_id *trid, 5258 struct nvme_async_probe_ctx *ctx) 5259 { 5260 struct nvme_ctrlr *nvme_ctrlr; 5261 struct nvme_path_id *path_id; 5262 const struct spdk_nvme_ctrlr_data *cdata; 5263 int rc; 5264 5265 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5266 if (nvme_ctrlr == NULL) { 5267 SPDK_ERRLOG("Failed to allocate device struct\n"); 5268 return -ENOMEM; 5269 } 5270 5271 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5272 if (rc != 0) { 5273 free(nvme_ctrlr); 5274 return rc; 5275 } 5276 5277 TAILQ_INIT(&nvme_ctrlr->trids); 5278 5279 RB_INIT(&nvme_ctrlr->namespaces); 5280 5281 path_id = calloc(1, sizeof(*path_id)); 5282 if (path_id == NULL) { 5283 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5284 rc = -ENOMEM; 5285 goto err; 5286 } 5287 5288 path_id->trid = *trid; 5289 if (ctx != NULL) { 5290 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5291 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5292 } 5293 nvme_ctrlr->active_path_id = path_id; 5294 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5295 5296 nvme_ctrlr->thread = spdk_get_thread(); 5297 nvme_ctrlr->ctrlr = ctrlr; 5298 nvme_ctrlr->ref = 1; 5299 5300 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5301 SPDK_ERRLOG("OCSSDs are not supported\n"); 5302 rc = -ENOTSUP; 5303 goto err; 5304 } 5305 5306
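/* Use the bdev-level options carried by the async probe context when one is supplied;
 * otherwise fall back to the module defaults from bdev_nvme_get_default_ctrlr_opts(). */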
if (ctx != NULL) { 5307 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5308 } else { 5309 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5310 } 5311 5312 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5313 g_opts.nvme_adminq_poll_period_us); 5314 5315 if (g_opts.timeout_us > 0) { 5316 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5317 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5318 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5319 g_opts.timeout_us : g_opts.timeout_admin_us; 5320 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5321 adm_timeout_us, timeout_cb, nvme_ctrlr); 5322 } 5323 5324 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5325 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5326 5327 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5328 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5329 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5330 } 5331 5332 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5333 if (rc != 0) { 5334 goto err; 5335 } 5336 5337 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5338 5339 if (cdata->cmic.ana_reporting) { 5340 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5341 if (rc == 0) { 5342 return 0; 5343 } 5344 } else { 5345 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5346 return 0; 5347 } 5348 5349 err: 5350 nvme_ctrlr_delete(nvme_ctrlr); 5351 return rc; 5352 } 5353 5354 void 5355 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5356 { 5357 opts->prchk_flags = 0; 5358 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5359 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5360 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5361 } 5362 5363 static void 5364 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5365 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5366 { 5367 char *name; 5368 5369 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5370 if (!name) { 5371 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5372 return; 5373 } 5374 5375 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5376 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5377 } else { 5378 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5379 } 5380 5381 free(name); 5382 } 5383 5384 static void 5385 _nvme_ctrlr_destruct(void *ctx) 5386 { 5387 struct nvme_ctrlr *nvme_ctrlr = ctx; 5388 5389 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5390 nvme_ctrlr_release(nvme_ctrlr); 5391 } 5392 5393 static int 5394 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5395 { 5396 struct nvme_probe_skip_entry *entry; 5397 5398 /* The controller's destruction was already started */ 5399 if (nvme_ctrlr->destruct) { 5400 return -EALREADY; 5401 } 5402 5403 if (!hotplug && 5404 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5405 entry = calloc(1, sizeof(*entry)); 5406 if (!entry) { 5407 return -ENOMEM; 5408 } 5409 entry->trid = nvme_ctrlr->active_path_id->trid; 5410 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5411 } 5412 5413 nvme_ctrlr->destruct = true; 5414 return 0; 5415 } 5416 5417 static int 5418 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5419 { 5420 int rc; 5421 5422 pthread_mutex_lock(&nvme_ctrlr->mutex); 5423 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5424 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5425 5426 if (rc == 0) { 5427 _nvme_ctrlr_destruct(nvme_ctrlr); 5428 } else if (rc == -EALREADY) { 5429 rc = 0; 5430 } 5431 5432 return rc; 5433 } 5434 5435 static void 5436 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5437 { 5438 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5439 5440 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5441 } 5442 5443 static int 5444 bdev_nvme_hotplug_probe(void *arg) 5445 { 5446 if (g_hotplug_probe_ctx == NULL) { 5447 spdk_poller_unregister(&g_hotplug_probe_poller); 5448 return SPDK_POLLER_IDLE; 5449 } 5450 5451 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5452 g_hotplug_probe_ctx = NULL; 5453 spdk_poller_unregister(&g_hotplug_probe_poller); 5454 } 5455 5456 return SPDK_POLLER_BUSY; 5457 } 5458 5459 static int 5460 bdev_nvme_hotplug(void *arg) 5461 { 5462 struct spdk_nvme_transport_id trid_pcie; 5463 5464 if (g_hotplug_probe_ctx) { 5465 return SPDK_POLLER_BUSY; 5466 } 5467 5468 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5469 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5470 5471 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5472 hotplug_probe_cb, attach_cb, NULL); 5473 5474 if (g_hotplug_probe_ctx) { 5475 assert(g_hotplug_probe_poller == NULL); 5476 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5477 } 5478 5479 return SPDK_POLLER_BUSY; 5480 } 5481 5482 void 5483 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5484 { 5485 *opts = g_opts; 5486 } 5487 5488 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5489 uint32_t reconnect_delay_sec, 5490 uint32_t fast_io_fail_timeout_sec); 5491 5492 static int 5493 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5494 { 5495 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5496 /* Can't set timeout_admin_us without also setting timeout_us */ 5497 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5498 return -EINVAL; 5499 } 5500 5501 if (opts->bdev_retry_count < -1) { 5502 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5503 return -EINVAL; 5504 } 5505 5506 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5507 opts->reconnect_delay_sec, 5508 opts->fast_io_fail_timeout_sec)) { 5509 return -EINVAL; 5510 } 5511 5512 return 0; 5513 } 5514 5515 int 5516 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5517 { 5518 int ret; 5519 5520 ret = bdev_nvme_validate_opts(opts); 5521 if (ret) { 5522 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5523 return ret; 5524 } 5525 5526 if (g_bdev_nvme_init_thread != NULL) { 5527 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5528 return -EPERM; 5529 } 5530 } 5531 5532 if (opts->rdma_srq_size != 0) { 5533 struct spdk_nvme_transport_opts drv_opts; 5534 5535 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5536 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5537 5538 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5539 if (ret) { 5540 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5541 return ret; 5542 } 5543 } 5544 5545 g_opts = *opts; 5546 5547 return 0; 5548 } 5549 5550 struct set_nvme_hotplug_ctx { 5551 uint64_t period_us; 5552 bool enabled; 5553 spdk_msg_fn fn; 5554 void *fn_ctx; 5555 }; 5556 5557 static void 5558 set_nvme_hotplug_period_cb(void *_ctx) 5559 { 5560 struct set_nvme_hotplug_ctx *ctx 
= _ctx; 5561 5562 spdk_poller_unregister(&g_hotplug_poller); 5563 if (ctx->enabled) { 5564 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5565 } 5566 5567 g_nvme_hotplug_poll_period_us = ctx->period_us; 5568 g_nvme_hotplug_enabled = ctx->enabled; 5569 if (ctx->fn) { 5570 ctx->fn(ctx->fn_ctx); 5571 } 5572 5573 free(ctx); 5574 } 5575 5576 int 5577 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5578 { 5579 struct set_nvme_hotplug_ctx *ctx; 5580 5581 if (enabled == true && !spdk_process_is_primary()) { 5582 return -EPERM; 5583 } 5584 5585 ctx = calloc(1, sizeof(*ctx)); 5586 if (ctx == NULL) { 5587 return -ENOMEM; 5588 } 5589 5590 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5591 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5592 ctx->enabled = enabled; 5593 ctx->fn = cb; 5594 ctx->fn_ctx = cb_ctx; 5595 5596 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5597 return 0; 5598 } 5599 5600 static void 5601 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5602 struct nvme_async_probe_ctx *ctx) 5603 { 5604 struct nvme_ns *nvme_ns; 5605 struct nvme_bdev *nvme_bdev; 5606 size_t j; 5607 5608 assert(nvme_ctrlr != NULL); 5609 5610 if (ctx->names == NULL) { 5611 ctx->reported_bdevs = 0; 5612 populate_namespaces_cb(ctx, 0); 5613 return; 5614 } 5615 5616 /* 5617 * Report the new bdevs that were created in this call. 5618 * There can be more than one bdev per NVMe controller. 5619 */ 5620 j = 0; 5621 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5622 while (nvme_ns != NULL) { 5623 nvme_bdev = nvme_ns->bdev; 5624 if (j < ctx->max_bdevs) { 5625 ctx->names[j] = nvme_bdev->disk.name; 5626 j++; 5627 } else { 5628 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5629 ctx->max_bdevs); 5630 ctx->reported_bdevs = 0; 5631 populate_namespaces_cb(ctx, -ERANGE); 5632 return; 5633 } 5634 5635 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5636 } 5637 5638 ctx->reported_bdevs = j; 5639 populate_namespaces_cb(ctx, 0); 5640 } 5641 5642 static int 5643 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5644 struct spdk_nvme_ctrlr *new_ctrlr, 5645 struct spdk_nvme_transport_id *trid) 5646 { 5647 struct nvme_path_id *tmp_trid; 5648 5649 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5650 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5651 return -ENOTSUP; 5652 } 5653 5654 /* Currently we only support failover to the same transport type. */ 5655 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5656 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5657 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5658 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5659 return -EINVAL; 5660 } 5661 5662 5663 /* Currently we only support failover to the same NQN. */ 5664 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5665 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5666 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5667 return -EINVAL; 5668 } 5669 5670 /* Skip all the other checks if we've already registered this path. 
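 * Note: spdk_nvme_transport_id_compare() matches on transport ID fields only (e.g. trtype,
 * traddr, trsvcid, subnqn); the host ID stored in nvme_path_id is not part of this check.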
*/ 5671 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5672 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5673 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5674 trid->subnqn); 5675 return -EEXIST; 5676 } 5677 } 5678 5679 return 0; 5680 } 5681 5682 static int 5683 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5684 struct spdk_nvme_ctrlr *new_ctrlr) 5685 { 5686 struct nvme_ns *nvme_ns; 5687 struct spdk_nvme_ns *new_ns; 5688 5689 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5690 while (nvme_ns != NULL) { 5691 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5692 assert(new_ns != NULL); 5693 5694 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5695 return -EINVAL; 5696 } 5697 5698 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5699 } 5700 5701 return 0; 5702 } 5703 5704 static int 5705 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5706 struct spdk_nvme_transport_id *trid) 5707 { 5708 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5709 5710 new_trid = calloc(1, sizeof(*new_trid)); 5711 if (new_trid == NULL) { 5712 return -ENOMEM; 5713 } 5714 new_trid->trid = *trid; 5715 5716 active_id = nvme_ctrlr->active_path_id; 5717 assert(active_id != NULL); 5718 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5719 5720 /* Skip the active trid so that it is not replaced until it has failed. */ 5721 tmp_trid = TAILQ_NEXT(active_id, link); 5722 if (tmp_trid == NULL) { 5723 goto add_tail; 5724 } 5725 5726 /* A trid has failed if its last failed time (last_failed_tsc) is non-zero. 5727 * Insert the new alternate trid before any failed trid. 5728 */ 5729 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5730 if (tmp_trid->last_failed_tsc != 0) { 5731 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5732 return 0; 5733 } 5734 } 5735 5736 add_tail: 5737 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5738 return 0; 5739 } 5740 5741 /* This is the case where a secondary path is added to an existing 5742 * nvme_ctrlr for failover. After checking that it can access the same 5743 * namespaces as the primary path, it is disconnected until failover occurs.
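 * (spdk_nvme_detach() is called on the new controller unconditionally once the checks below
 * have run, whether or not the path was accepted.)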
5744 */ 5745 static int 5746 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5747 struct spdk_nvme_ctrlr *new_ctrlr, 5748 struct spdk_nvme_transport_id *trid) 5749 { 5750 int rc; 5751 5752 assert(nvme_ctrlr != NULL); 5753 5754 pthread_mutex_lock(&nvme_ctrlr->mutex); 5755 5756 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5757 if (rc != 0) { 5758 goto exit; 5759 } 5760 5761 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5762 if (rc != 0) { 5763 goto exit; 5764 } 5765 5766 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5767 5768 exit: 5769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5770 5771 spdk_nvme_detach(new_ctrlr); 5772 5773 return rc; 5774 } 5775 5776 static void 5777 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5778 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5779 { 5780 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5781 struct nvme_async_probe_ctx *ctx; 5782 int rc; 5783 5784 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5785 ctx->ctrlr_attached = true; 5786 5787 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5788 if (rc != 0) { 5789 ctx->reported_bdevs = 0; 5790 populate_namespaces_cb(ctx, rc); 5791 } 5792 } 5793 5794 static void 5795 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5796 struct spdk_nvme_ctrlr *ctrlr, 5797 const struct spdk_nvme_ctrlr_opts *opts) 5798 { 5799 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5800 struct nvme_ctrlr *nvme_ctrlr; 5801 struct nvme_async_probe_ctx *ctx; 5802 int rc; 5803 5804 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5805 ctx->ctrlr_attached = true; 5806 5807 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5808 if (nvme_ctrlr) { 5809 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5810 } else { 5811 rc = -ENODEV; 5812 } 5813 5814 ctx->reported_bdevs = 0; 5815 populate_namespaces_cb(ctx, rc); 5816 } 5817 5818 static int 5819 bdev_nvme_async_poll(void *arg) 5820 { 5821 struct nvme_async_probe_ctx *ctx = arg; 5822 int rc; 5823 5824 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5825 if (spdk_unlikely(rc != -EAGAIN)) { 5826 ctx->probe_done = true; 5827 spdk_poller_unregister(&ctx->poller); 5828 if (!ctx->ctrlr_attached) { 5829 /* The probe is done, but no controller was attached. 5830 * That means we had a failure, so report -EIO back to 5831 * the caller (usually the RPC). populate_namespaces_cb() 5832 * will take care of freeing the nvme_async_probe_ctx. 5833 */ 5834 ctx->reported_bdevs = 0; 5835 populate_namespaces_cb(ctx, -EIO); 5836 } else if (ctx->namespaces_populated) { 5837 /* The namespaces for the attached controller were all 5838 * populated and the response was already sent to the 5839 * caller (usually the RPC). So free the context here. 
*/ 5841 free(ctx); 5842 } 5843 } 5844 5845 return SPDK_POLLER_BUSY; 5846 } 5847 5848 static bool 5849 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5850 uint32_t reconnect_delay_sec, 5851 uint32_t fast_io_fail_timeout_sec) 5852 { 5853 if (ctrlr_loss_timeout_sec < -1) { 5854 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5855 return false; 5856 } else if (ctrlr_loss_timeout_sec == -1) { 5857 if (reconnect_delay_sec == 0) { 5858 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5859 return false; 5860 } else if (fast_io_fail_timeout_sec != 0 && 5861 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5862 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5863 return false; 5864 } 5865 } else if (ctrlr_loss_timeout_sec != 0) { 5866 if (reconnect_delay_sec == 0) { 5867 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5868 return false; 5869 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5870 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5871 return false; 5872 } else if (fast_io_fail_timeout_sec != 0) { 5873 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5874 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5875 return false; 5876 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5877 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5878 return false; 5879 } 5880 } 5881 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5882 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5883 return false; 5884 } 5885 5886 return true; 5887 } 5888 5889 int 5890 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5891 const char *base_name, 5892 const char **names, 5893 uint32_t count, 5894 spdk_bdev_create_nvme_fn cb_fn, 5895 void *cb_ctx, 5896 struct spdk_nvme_ctrlr_opts *drv_opts, 5897 struct nvme_ctrlr_opts *bdev_opts, 5898 bool multipath) 5899 { 5900 struct nvme_probe_skip_entry *entry, *tmp; 5901 struct nvme_async_probe_ctx *ctx; 5902 spdk_nvme_attach_cb attach_cb; 5903 5904 /* TODO expand this check to include both the host and target TRIDs. 5905 * Only if both are the same should we fail.
5906 */ 5907 if (nvme_ctrlr_get(trid) != NULL) { 5908 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5909 return -EEXIST; 5910 } 5911 5912 if (bdev_opts != NULL && 5913 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5914 bdev_opts->reconnect_delay_sec, 5915 bdev_opts->fast_io_fail_timeout_sec)) { 5916 return -EINVAL; 5917 } 5918 5919 ctx = calloc(1, sizeof(*ctx)); 5920 if (!ctx) { 5921 return -ENOMEM; 5922 } 5923 ctx->base_name = base_name; 5924 ctx->names = names; 5925 ctx->max_bdevs = count; 5926 ctx->cb_fn = cb_fn; 5927 ctx->cb_ctx = cb_ctx; 5928 ctx->trid = *trid; 5929 5930 if (bdev_opts) { 5931 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5932 } else { 5933 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5934 } 5935 5936 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5937 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5938 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5939 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5940 free(entry); 5941 break; 5942 } 5943 } 5944 } 5945 5946 if (drv_opts) { 5947 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5948 } else { 5949 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5950 } 5951 5952 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5953 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5954 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5955 ctx->drv_opts.disable_read_ana_log_page = true; 5956 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5957 5958 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5959 attach_cb = connect_attach_cb; 5960 } else { 5961 attach_cb = connect_set_failover_cb; 5962 } 5963 5964 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5965 if (ctx->probe_ctx == NULL) { 5966 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5967 free(ctx); 5968 return -ENODEV; 5969 } 5970 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5971 5972 return 0; 5973 } 5974 5975 static bool 5976 nvme_path_should_delete(struct nvme_path_id *p, const struct nvme_path_id *path_id) 5977 { 5978 if (path_id->trid.trtype != 0) { 5979 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5980 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5981 return false; 5982 } 5983 } else { 5984 if (path_id->trid.trtype != p->trid.trtype) { 5985 return false; 5986 } 5987 } 5988 } 5989 5990 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5991 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5992 return false; 5993 } 5994 } 5995 5996 if (path_id->trid.adrfam != 0) { 5997 if (path_id->trid.adrfam != p->trid.adrfam) { 5998 return false; 5999 } 6000 } 6001 6002 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6003 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6004 return false; 6005 } 6006 } 6007 6008 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6009 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6010 return false; 6011 } 6012 } 6013 6014 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6015 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6016 return false; 6017 } 6018 } 6019 6020 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, 
sizeof(path_id->hostid.hostsvcid))) { 6021 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6022 return false; 6023 } 6024 } 6025 6026 return true; 6027 } 6028 6029 static int 6030 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6031 { 6032 struct nvme_path_id *p, *t; 6033 spdk_msg_fn msg_fn; 6034 int rc = -ENXIO; 6035 6036 pthread_mutex_lock(&nvme_ctrlr->mutex); 6037 6038 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6039 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6040 break; 6041 } 6042 6043 if (!nvme_path_should_delete(p, path_id)) { 6044 continue; 6045 } 6046 6047 /* This path matches but is not the active path, so it is not in use and can be removed directly. */ 6048 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6049 free(p); 6050 rc = 0; 6051 } 6052 6053 if (p == NULL || !nvme_path_should_delete(p, path_id)) { 6054 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6055 return rc; 6056 } 6057 6058 /* If we made it here, then this path is a match! Now we need to remove it. */ 6059 6060 /* This is the active path in use right now. The active path is always the first in the list. */ 6061 assert(p == nvme_ctrlr->active_path_id); 6062 6063 if (!TAILQ_NEXT(p, link)) { 6064 /* The current path is the only path. */ 6065 msg_fn = _nvme_ctrlr_destruct; 6066 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6067 } else { 6068 /* There is an alternative path. */ 6069 msg_fn = _bdev_nvme_reset_ctrlr; 6070 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6071 } 6072 6073 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6074 6075 if (rc == 0) { 6076 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6077 } else if (rc == -EALREADY) { 6078 rc = 0; 6079 } 6080 6081 return rc; 6082 } 6083 6084 int 6085 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 6086 { 6087 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6088 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6089 int rc = -ENXIO, _rc; 6090 6091 if (name == NULL || path_id == NULL) { 6092 return -EINVAL; 6093 } 6094 6095 pthread_mutex_lock(&g_bdev_nvme_mutex); 6096 6097 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6098 if (nbdev_ctrlr == NULL) { 6099 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6100 6101 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6102 return -ENODEV; 6103 } 6104 6105 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6106 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6107 if (_rc < 0 && _rc != -ENXIO) { 6108 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6109 6110 return _rc; 6111 } else if (_rc == 0) { 6112 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6113 * was deleted successfully. To remember the successful deletion, 6114 * overwrite rc only if _rc is zero. 6115 */ 6116 rc = 0; 6117 } 6118 } 6119 6120 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6121 6122 /* Either all nvme_ctrlrs were deleted, or no nvme_ctrlr with a matching trid was found. */ 6123 return rc; 6124 } 6125 6126 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6127 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6128 6129 #define DISCOVERY_ERRLOG(ctx, format, ...)
\ 6130 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6131 6132 struct discovery_entry_ctx { 6133 char name[128]; 6134 struct spdk_nvme_transport_id trid; 6135 struct spdk_nvme_ctrlr_opts drv_opts; 6136 struct spdk_nvmf_discovery_log_page_entry entry; 6137 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6138 struct discovery_ctx *ctx; 6139 }; 6140 6141 struct discovery_ctx { 6142 char *name; 6143 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6144 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6145 void *cb_ctx; 6146 struct spdk_nvme_probe_ctx *probe_ctx; 6147 struct spdk_nvme_detach_ctx *detach_ctx; 6148 struct spdk_nvme_ctrlr *ctrlr; 6149 struct spdk_nvme_transport_id trid; 6150 struct discovery_entry_ctx *entry_ctx_in_use; 6151 struct spdk_poller *poller; 6152 struct spdk_nvme_ctrlr_opts drv_opts; 6153 struct nvme_ctrlr_opts bdev_opts; 6154 struct spdk_nvmf_discovery_log_page *log_page; 6155 TAILQ_ENTRY(discovery_ctx) tailq; 6156 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6157 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6158 int rc; 6159 bool wait_for_attach; 6160 uint64_t timeout_ticks; 6161 /* Denotes that the discovery service is being started. We're waiting 6162 * for the initial connection to the discovery controller to be 6163 * established and attach discovered NVM ctrlrs. 6164 */ 6165 bool initializing; 6166 /* Denotes if a discovery is currently in progress for this context. 6167 * That includes connecting to newly discovered subsystems. Used to 6168 * ensure we do not start a new discovery until an existing one is 6169 * complete. 6170 */ 6171 bool in_progress; 6172 6173 /* Denotes if another discovery is needed after the one in progress 6174 * completes. Set when we receive an AER completion while a discovery 6175 * is already in progress. 6176 */ 6177 bool pending; 6178 6179 /* Signal to the discovery context poller that it should stop the 6180 * discovery service, including detaching from the current discovery 6181 * controller. 6182 */ 6183 bool stop; 6184 6185 struct spdk_thread *calling_thread; 6186 uint32_t index; 6187 uint32_t attach_in_progress; 6188 char *hostnqn; 6189 6190 /* Denotes if the discovery service was started by the mdns discovery. 
6191 */ 6192 bool from_mdns_discovery_service; 6193 }; 6194 6195 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6196 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6197 6198 static void get_discovery_log_page(struct discovery_ctx *ctx); 6199 6200 static void 6201 free_discovery_ctx(struct discovery_ctx *ctx) 6202 { 6203 free(ctx->log_page); 6204 free(ctx->hostnqn); 6205 free(ctx->name); 6206 free(ctx); 6207 } 6208 6209 static void 6210 discovery_complete(struct discovery_ctx *ctx) 6211 { 6212 ctx->initializing = false; 6213 ctx->in_progress = false; 6214 if (ctx->pending) { 6215 ctx->pending = false; 6216 get_discovery_log_page(ctx); 6217 } 6218 } 6219 6220 static void 6221 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6222 struct spdk_nvmf_discovery_log_page_entry *entry) 6223 { 6224 char *space; 6225 6226 trid->trtype = entry->trtype; 6227 trid->adrfam = entry->adrfam; 6228 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6229 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6230 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6231 * before call to this function trid->subnqn is zeroed out, we need 6232 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6233 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6234 */ 6235 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6236 6237 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6238 * But the log page entries typically pad them with spaces, not zeroes. 6239 * So add a NULL terminator to each of these fields at the appropriate 6240 * location. 6241 */ 6242 space = strchr(trid->traddr, ' '); 6243 if (space) { 6244 *space = 0; 6245 } 6246 space = strchr(trid->trsvcid, ' '); 6247 if (space) { 6248 *space = 0; 6249 } 6250 space = strchr(trid->subnqn, ' '); 6251 if (space) { 6252 *space = 0; 6253 } 6254 } 6255 6256 static void 6257 _stop_discovery(void *_ctx) 6258 { 6259 struct discovery_ctx *ctx = _ctx; 6260 6261 if (ctx->attach_in_progress > 0) { 6262 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6263 return; 6264 } 6265 6266 ctx->stop = true; 6267 6268 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6269 struct discovery_entry_ctx *entry_ctx; 6270 struct nvme_path_id path = {}; 6271 6272 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6273 path.trid = entry_ctx->trid; 6274 bdev_nvme_delete(entry_ctx->name, &path); 6275 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6276 free(entry_ctx); 6277 } 6278 6279 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6280 struct discovery_entry_ctx *entry_ctx; 6281 6282 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6283 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6284 free(entry_ctx); 6285 } 6286 6287 free(ctx->entry_ctx_in_use); 6288 ctx->entry_ctx_in_use = NULL; 6289 } 6290 6291 static void 6292 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6293 { 6294 ctx->stop_cb_fn = cb_fn; 6295 ctx->cb_ctx = cb_ctx; 6296 6297 if (ctx->attach_in_progress > 0) { 6298 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6299 ctx->attach_in_progress); 6300 } 6301 6302 _stop_discovery(ctx); 6303 } 6304 6305 static void 6306 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6307 { 6308 struct discovery_ctx *d_ctx; 6309 struct nvme_path_id *path_id; 6310 struct spdk_nvme_transport_id trid = {}; 
6311 struct discovery_entry_ctx *entry_ctx, *tmp; 6312 6313 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6314 6315 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6316 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6317 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6318 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6319 continue; 6320 } 6321 6322 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6323 free(entry_ctx); 6324 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6325 trid.subnqn, trid.traddr, trid.trsvcid); 6326 6327 /* Fail discovery ctrlr to force reattach attempt */ 6328 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6329 } 6330 } 6331 } 6332 6333 static void 6334 discovery_remove_controllers(struct discovery_ctx *ctx) 6335 { 6336 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6337 struct discovery_entry_ctx *entry_ctx, *tmp; 6338 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6339 struct spdk_nvme_transport_id old_trid = {}; 6340 uint64_t numrec, i; 6341 bool found; 6342 6343 numrec = from_le64(&log_page->numrec); 6344 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6345 found = false; 6346 old_entry = &entry_ctx->entry; 6347 build_trid_from_log_page_entry(&old_trid, old_entry); 6348 for (i = 0; i < numrec; i++) { 6349 new_entry = &log_page->entries[i]; 6350 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6351 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6352 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6353 found = true; 6354 break; 6355 } 6356 } 6357 if (!found) { 6358 struct nvme_path_id path = {}; 6359 6360 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6361 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6362 6363 path.trid = entry_ctx->trid; 6364 bdev_nvme_delete(entry_ctx->name, &path); 6365 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6366 free(entry_ctx); 6367 } 6368 } 6369 free(log_page); 6370 ctx->log_page = NULL; 6371 discovery_complete(ctx); 6372 } 6373 6374 static void 6375 complete_discovery_start(struct discovery_ctx *ctx, int status) 6376 { 6377 ctx->timeout_ticks = 0; 6378 ctx->rc = status; 6379 if (ctx->start_cb_fn) { 6380 ctx->start_cb_fn(ctx->cb_ctx, status); 6381 ctx->start_cb_fn = NULL; 6382 ctx->cb_ctx = NULL; 6383 } 6384 } 6385 6386 static void 6387 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6388 { 6389 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6390 struct discovery_ctx *ctx = entry_ctx->ctx; 6391 6392 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6393 ctx->attach_in_progress--; 6394 if (ctx->attach_in_progress == 0) { 6395 complete_discovery_start(ctx, ctx->rc); 6396 if (ctx->initializing && ctx->rc != 0) { 6397 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6398 stop_discovery(ctx, NULL, ctx->cb_ctx); 6399 } else { 6400 discovery_remove_controllers(ctx); 6401 } 6402 } 6403 } 6404 6405 static struct discovery_entry_ctx * 6406 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6407 { 6408 struct discovery_entry_ctx *new_ctx; 6409 6410 new_ctx = calloc(1, sizeof(*new_ctx)); 6411 if (new_ctx == NULL) { 6412 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6413 return NULL; 6414 } 6415 6416 new_ctx->ctx = ctx; 6417 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6418 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6419 
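/* Editor's note (descriptive comment, not in the original source): the snprintf below carries the hostnqn from the discovery context into this entry's driver options, so connections made from this entry use the same host identity as the discovery service. */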
snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6420 return new_ctx; 6421 } 6422 6423 static void 6424 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6425 struct spdk_nvmf_discovery_log_page *log_page) 6426 { 6427 struct discovery_ctx *ctx = cb_arg; 6428 struct discovery_entry_ctx *entry_ctx, *tmp; 6429 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6430 uint64_t numrec, i; 6431 bool found; 6432 6433 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6434 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6435 return; 6436 } 6437 6438 ctx->log_page = log_page; 6439 assert(ctx->attach_in_progress == 0); 6440 numrec = from_le64(&log_page->numrec); 6441 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6442 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6443 free(entry_ctx); 6444 } 6445 for (i = 0; i < numrec; i++) { 6446 found = false; 6447 new_entry = &log_page->entries[i]; 6448 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6449 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6450 struct discovery_entry_ctx *new_ctx; 6451 struct spdk_nvme_transport_id trid = {}; 6452 6453 build_trid_from_log_page_entry(&trid, new_entry); 6454 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6455 if (new_ctx == NULL) { 6456 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6457 break; 6458 } 6459 6460 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6461 continue; 6462 } 6463 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6464 old_entry = &entry_ctx->entry; 6465 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6466 found = true; 6467 break; 6468 } 6469 } 6470 if (!found) { 6471 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6472 struct discovery_ctx *d_ctx; 6473 6474 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6475 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6476 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6477 sizeof(new_entry->subnqn))) { 6478 break; 6479 } 6480 } 6481 if (subnqn_ctx) { 6482 break; 6483 } 6484 } 6485 6486 new_ctx = calloc(1, sizeof(*new_ctx)); 6487 if (new_ctx == NULL) { 6488 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6489 break; 6490 } 6491 6492 new_ctx->ctx = ctx; 6493 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6494 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6495 if (subnqn_ctx) { 6496 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6497 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6498 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6499 new_ctx->name); 6500 } else { 6501 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6502 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6503 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6504 new_ctx->name); 6505 } 6506 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6507 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6508 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6509 discovery_attach_controller_done, new_ctx, 6510 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6511 if (rc == 0) { 6512 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6513 ctx->attach_in_progress++; 6514 } else { 6515 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6516 } 6517 } 6518 } 6519 6520 if (ctx->attach_in_progress == 0) { 6521 discovery_remove_controllers(ctx); 6522 } 6523 } 6524 6525 static void 6526 get_discovery_log_page(struct discovery_ctx *ctx) 6527 { 6528 int rc; 6529 6530 assert(ctx->in_progress == false); 6531 ctx->in_progress = true; 6532 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6533 if (rc != 0) { 6534 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6535 } 6536 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6537 } 6538 6539 static void 6540 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6541 { 6542 struct discovery_ctx *ctx = arg; 6543 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6544 6545 if (spdk_nvme_cpl_is_error(cpl)) { 6546 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6547 return; 6548 } 6549 6550 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6551 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6552 return; 6553 } 6554 6555 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6556 if (ctx->in_progress) { 6557 ctx->pending = true; 6558 return; 6559 } 6560 6561 get_discovery_log_page(ctx); 6562 } 6563 6564 static void 6565 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6566 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6567 { 6568 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6569 struct discovery_ctx *ctx; 6570 6571 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6572 6573 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6574 ctx->probe_ctx = NULL; 6575 ctx->ctrlr = ctrlr; 6576 6577 if (ctx->rc != 0) { 6578 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6579 ctx->rc); 6580 return; 6581 } 6582 6583 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6584 } 6585 6586 static int 6587 discovery_poller(void *arg) 6588 { 6589 struct discovery_ctx *ctx = arg; 6590 struct spdk_nvme_transport_id *trid; 6591 int rc; 6592 6593 if (ctx->detach_ctx) { 6594 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6595 if (rc != -EAGAIN) { 6596 ctx->detach_ctx = NULL; 6597 ctx->ctrlr = NULL; 6598 } 6599 } else if (ctx->stop) { 6600 if (ctx->ctrlr != NULL) { 6601 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6602 if (rc == 0) { 6603 return SPDK_POLLER_BUSY; 6604 } 6605 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6606 } 6607 spdk_poller_unregister(&ctx->poller); 6608 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6609 assert(ctx->start_cb_fn == NULL); 6610 if (ctx->stop_cb_fn != NULL) { 6611 ctx->stop_cb_fn(ctx->cb_ctx); 6612 } 6613 free_discovery_ctx(ctx); 6614 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6615 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6616 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6617 assert(ctx->initializing); 6618 spdk_poller_unregister(&ctx->poller); 6619 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6620 complete_discovery_start(ctx, -ETIMEDOUT); 6621 stop_discovery(ctx, NULL, NULL); 6622 free_discovery_ctx(ctx); 6623 return SPDK_POLLER_BUSY; 6624 } 6625 6626 assert(ctx->entry_ctx_in_use == NULL); 6627 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6628 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6629 trid = &ctx->entry_ctx_in_use->trid; 6630 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6631 if 
(ctx->probe_ctx) { 6632 spdk_poller_unregister(&ctx->poller); 6633 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6634 } else { 6635 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6636 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6637 ctx->entry_ctx_in_use = NULL; 6638 } 6639 } else if (ctx->probe_ctx) { 6640 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6641 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6642 complete_discovery_start(ctx, -ETIMEDOUT); 6643 return SPDK_POLLER_BUSY; 6644 } 6645 6646 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6647 if (rc != -EAGAIN) { 6648 if (ctx->rc != 0) { 6649 assert(ctx->initializing); 6650 stop_discovery(ctx, NULL, ctx->cb_ctx); 6651 } else { 6652 assert(rc == 0); 6653 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6654 ctx->rc = rc; 6655 get_discovery_log_page(ctx); 6656 } 6657 } 6658 } else { 6659 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6660 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6661 complete_discovery_start(ctx, -ETIMEDOUT); 6662 /* We need to wait until all NVM ctrlrs are attached before we stop the 6663 * discovery service to make sure we don't detach a ctrlr that is still 6664 * being attached. 6665 */ 6666 if (ctx->attach_in_progress == 0) { 6667 stop_discovery(ctx, NULL, ctx->cb_ctx); 6668 return SPDK_POLLER_BUSY; 6669 } 6670 } 6671 6672 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6673 if (rc < 0) { 6674 spdk_poller_unregister(&ctx->poller); 6675 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6676 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6677 ctx->entry_ctx_in_use = NULL; 6678 6679 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6680 if (rc != 0) { 6681 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6682 ctx->ctrlr = NULL; 6683 } 6684 } 6685 } 6686 6687 return SPDK_POLLER_BUSY; 6688 } 6689 6690 static void 6691 start_discovery_poller(void *arg) 6692 { 6693 struct discovery_ctx *ctx = arg; 6694 6695 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6696 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6697 } 6698 6699 int 6700 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6701 const char *base_name, 6702 struct spdk_nvme_ctrlr_opts *drv_opts, 6703 struct nvme_ctrlr_opts *bdev_opts, 6704 uint64_t attach_timeout, 6705 bool from_mdns, 6706 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6707 { 6708 struct discovery_ctx *ctx; 6709 struct discovery_entry_ctx *discovery_entry_ctx; 6710 6711 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6712 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6713 if (strcmp(ctx->name, base_name) == 0) { 6714 return -EEXIST; 6715 } 6716 6717 if (ctx->entry_ctx_in_use != NULL) { 6718 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6719 return -EEXIST; 6720 } 6721 } 6722 6723 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6724 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6725 return -EEXIST; 6726 } 6727 } 6728 } 6729 6730 ctx = calloc(1, sizeof(*ctx)); 6731 if (ctx == NULL) { 6732 return -ENOMEM; 6733 } 6734 6735 ctx->name = strdup(base_name); 6736 if (ctx->name == NULL) { 6737 free_discovery_ctx(ctx); 6738 return -ENOMEM; 6739 } 6740 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
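/* Editor's note (descriptive comment, not in the original source): both the driver-level and the bdev-level options are copied into the discovery context here, so the caller's option structures do not need to outlive this call. */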
6741 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6742 ctx->from_mdns_discovery_service = from_mdns; 6743 ctx->bdev_opts.from_discovery_service = true; 6744 ctx->calling_thread = spdk_get_thread(); 6745 ctx->start_cb_fn = cb_fn; 6746 ctx->cb_ctx = cb_ctx; 6747 ctx->initializing = true; 6748 if (ctx->start_cb_fn) { 6749 /* We can use this when dumping json to denote if this RPC parameter 6750 * was specified or not. 6751 */ 6752 ctx->wait_for_attach = true; 6753 } 6754 if (attach_timeout != 0) { 6755 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6756 spdk_get_ticks_hz() / 1000ull; 6757 } 6758 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6759 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6760 memcpy(&ctx->trid, trid, sizeof(*trid)); 6761 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6762 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6763 if (ctx->hostnqn == NULL) { 6764 free_discovery_ctx(ctx); 6765 return -ENOMEM; 6766 } 6767 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6768 if (discovery_entry_ctx == NULL) { 6769 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6770 free_discovery_ctx(ctx); 6771 return -ENOMEM; 6772 } 6773 6774 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6775 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6776 return 0; 6777 } 6778 6779 int 6780 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6781 { 6782 struct discovery_ctx *ctx; 6783 6784 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6785 if (strcmp(name, ctx->name) == 0) { 6786 if (ctx->stop) { 6787 return -EALREADY; 6788 } 6789 /* If we're still starting the discovery service and ->rc is non-zero, we're 6790 * going to stop it as soon as we can 6791 */ 6792 if (ctx->initializing && ctx->rc != 0) { 6793 return -EALREADY; 6794 } 6795 stop_discovery(ctx, cb_fn, cb_ctx); 6796 return 0; 6797 } 6798 } 6799 6800 return -ENOENT; 6801 } 6802 6803 static int 6804 bdev_nvme_library_init(void) 6805 { 6806 g_bdev_nvme_init_thread = spdk_get_thread(); 6807 6808 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6809 bdev_nvme_destroy_poll_group_cb, 6810 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6811 6812 return 0; 6813 } 6814 6815 static void 6816 bdev_nvme_fini_destruct_ctrlrs(void) 6817 { 6818 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6819 struct nvme_ctrlr *nvme_ctrlr; 6820 6821 pthread_mutex_lock(&g_bdev_nvme_mutex); 6822 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6823 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6824 pthread_mutex_lock(&nvme_ctrlr->mutex); 6825 if (nvme_ctrlr->destruct) { 6826 /* This controller's destruction was already started 6827 * before the application started shutting down 6828 */ 6829 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6830 continue; 6831 } 6832 nvme_ctrlr->destruct = true; 6833 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6834 6835 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6836 nvme_ctrlr); 6837 } 6838 } 6839 6840 g_bdev_nvme_module_finish = true; 6841 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6842 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6843 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6844 spdk_bdev_module_fini_done(); 6845 return; 6846 } 6847 6848 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6849 } 6850 6851 static void 6852 check_discovery_fini(void *arg) 6853 { 6854 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6855 bdev_nvme_fini_destruct_ctrlrs(); 
6856 } 6857 } 6858 6859 static void 6860 bdev_nvme_library_fini(void) 6861 { 6862 struct nvme_probe_skip_entry *entry, *entry_tmp; 6863 struct discovery_ctx *ctx; 6864 6865 spdk_poller_unregister(&g_hotplug_poller); 6866 free(g_hotplug_probe_ctx); 6867 g_hotplug_probe_ctx = NULL; 6868 6869 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6870 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6871 free(entry); 6872 } 6873 6874 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6875 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6876 bdev_nvme_fini_destruct_ctrlrs(); 6877 } else { 6878 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6879 stop_discovery(ctx, check_discovery_fini, NULL); 6880 } 6881 } 6882 } 6883 6884 static void 6885 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6886 { 6887 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6888 struct spdk_bdev *bdev = bdev_io->bdev; 6889 struct spdk_dif_ctx dif_ctx; 6890 struct spdk_dif_error err_blk = {}; 6891 int rc; 6892 struct spdk_dif_ctx_init_ext_opts dif_opts; 6893 6894 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 6895 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 6896 rc = spdk_dif_ctx_init(&dif_ctx, 6897 bdev->blocklen, bdev->md_len, bdev->md_interleave, 6898 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 6899 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 6900 if (rc != 0) { 6901 SPDK_ERRLOG("Initialization of DIF context failed\n"); 6902 return; 6903 } 6904 6905 if (bdev->md_interleave) { 6906 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6907 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6908 } else { 6909 struct iovec md_iov = { 6910 .iov_base = bdev_io->u.bdev.md_buf, 6911 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 6912 }; 6913 6914 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6915 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6916 } 6917 6918 if (rc != 0) { 6919 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 6920 err_blk.err_type, err_blk.err_offset); 6921 } else { 6922 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 6923 } 6924 } 6925 6926 static void 6927 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6928 { 6929 struct nvme_bdev_io *bio = ref; 6930 6931 if (spdk_nvme_cpl_is_success(cpl)) { 6932 /* Run PI verification for read data buffer. */ 6933 bdev_nvme_verify_pi_error(bio); 6934 } 6935 6936 /* Return original completion status */ 6937 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6938 } 6939 6940 static void 6941 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6942 { 6943 struct nvme_bdev_io *bio = ref; 6944 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6945 int ret; 6946 6947 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 6948 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 6949 cpl->status.sct, cpl->status.sc); 6950 6951 /* Save completion status to use after verifying PI error. */ 6952 bio->cpl = *cpl; 6953 6954 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 6955 /* Read without PI checking to verify PI error. 
*/ 6956 ret = bdev_nvme_no_pi_readv(bio, 6957 bdev_io->u.bdev.iovs, 6958 bdev_io->u.bdev.iovcnt, 6959 bdev_io->u.bdev.md_buf, 6960 bdev_io->u.bdev.num_blocks, 6961 bdev_io->u.bdev.offset_blocks); 6962 if (ret == 0) { 6963 return; 6964 } 6965 } 6966 } 6967 6968 bdev_nvme_io_complete_nvme_status(bio, cpl); 6969 } 6970 6971 static void 6972 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6973 { 6974 struct nvme_bdev_io *bio = ref; 6975 6976 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6977 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 6978 cpl->status.sct, cpl->status.sc); 6979 /* Run PI verification for write data buffer if PI error is detected. */ 6980 bdev_nvme_verify_pi_error(bio); 6981 } 6982 6983 bdev_nvme_io_complete_nvme_status(bio, cpl); 6984 } 6985 6986 static void 6987 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6988 { 6989 struct nvme_bdev_io *bio = ref; 6990 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6991 6992 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 6993 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 6994 */ 6995 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 6996 6997 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6998 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 6999 cpl->status.sct, cpl->status.sc); 7000 /* Run PI verification for zone append data buffer if PI error is detected. */ 7001 bdev_nvme_verify_pi_error(bio); 7002 } 7003 7004 bdev_nvme_io_complete_nvme_status(bio, cpl); 7005 } 7006 7007 static void 7008 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7009 { 7010 struct nvme_bdev_io *bio = ref; 7011 7012 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7013 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7014 cpl->status.sct, cpl->status.sc); 7015 /* Run PI verification for compare data buffer if PI error is detected. */ 7016 bdev_nvme_verify_pi_error(bio); 7017 } 7018 7019 bdev_nvme_io_complete_nvme_status(bio, cpl); 7020 } 7021 7022 static void 7023 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7024 { 7025 struct nvme_bdev_io *bio = ref; 7026 7027 /* Compare operation completion */ 7028 if (!bio->first_fused_completed) { 7029 /* Save compare result for write callback */ 7030 bio->cpl = *cpl; 7031 bio->first_fused_completed = true; 7032 return; 7033 } 7034 7035 /* Write operation completion */ 7036 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7037 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7038 * complete the IO with the compare operation's status. 
7039 */ 7040 if (!spdk_nvme_cpl_is_error(cpl)) { 7041 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7042 } 7043 7044 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7045 } else { 7046 bdev_nvme_io_complete_nvme_status(bio, cpl); 7047 } 7048 } 7049 7050 static void 7051 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7052 { 7053 struct nvme_bdev_io *bio = ref; 7054 7055 bdev_nvme_io_complete_nvme_status(bio, cpl); 7056 } 7057 7058 static int 7059 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7060 { 7061 switch (desc->zt) { 7062 case SPDK_NVME_ZONE_TYPE_SEQWR: 7063 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7064 break; 7065 default: 7066 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7067 return -EIO; 7068 } 7069 7070 switch (desc->zs) { 7071 case SPDK_NVME_ZONE_STATE_EMPTY: 7072 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7073 break; 7074 case SPDK_NVME_ZONE_STATE_IOPEN: 7075 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7076 break; 7077 case SPDK_NVME_ZONE_STATE_EOPEN: 7078 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7079 break; 7080 case SPDK_NVME_ZONE_STATE_CLOSED: 7081 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7082 break; 7083 case SPDK_NVME_ZONE_STATE_RONLY: 7084 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7085 break; 7086 case SPDK_NVME_ZONE_STATE_FULL: 7087 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7088 break; 7089 case SPDK_NVME_ZONE_STATE_OFFLINE: 7090 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7091 break; 7092 default: 7093 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7094 return -EIO; 7095 } 7096 7097 info->zone_id = desc->zslba; 7098 info->write_pointer = desc->wp; 7099 info->capacity = desc->zcap; 7100 7101 return 0; 7102 } 7103 7104 static void 7105 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7106 { 7107 struct nvme_bdev_io *bio = ref; 7108 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7109 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7110 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7111 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7112 uint64_t max_zones_per_buf, i; 7113 uint32_t zone_report_bufsize; 7114 struct spdk_nvme_ns *ns; 7115 struct spdk_nvme_qpair *qpair; 7116 int ret; 7117 7118 if (spdk_nvme_cpl_is_error(cpl)) { 7119 goto out_complete_io_nvme_cpl; 7120 } 7121 7122 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7123 ret = -ENXIO; 7124 goto out_complete_io_ret; 7125 } 7126 7127 ns = bio->io_path->nvme_ns->ns; 7128 qpair = bio->io_path->qpair->qpair; 7129 7130 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7131 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7132 sizeof(bio->zone_report_buf->descs[0]); 7133 7134 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7135 ret = -EINVAL; 7136 goto out_complete_io_ret; 7137 } 7138 7139 if (!bio->zone_report_buf->nr_zones) { 7140 ret = -EINVAL; 7141 goto out_complete_io_ret; 7142 } 7143 7144 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7145 ret = fill_zone_from_report(&info[bio->handled_zones], 7146 &bio->zone_report_buf->descs[i]); 7147 if (ret) { 7148 goto out_complete_io_ret; 7149 } 7150 bio->handled_zones++; 7151 } 7152 7153 if (bio->handled_zones < zones_to_copy) { 7154 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7155 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7156 
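/* Editor's note (descriptive comment, not in the original source): more zones remain than fit in a single report buffer, so the buffer is cleared and another Report Zones command is issued starting at the first zone that has not been copied yet. */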
7157 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7158 ret = spdk_nvme_zns_report_zones(ns, qpair, 7159 bio->zone_report_buf, zone_report_bufsize, 7160 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7161 bdev_nvme_get_zone_info_done, bio); 7162 if (!ret) { 7163 return; 7164 } else { 7165 goto out_complete_io_ret; 7166 } 7167 } 7168 7169 out_complete_io_nvme_cpl: 7170 free(bio->zone_report_buf); 7171 bio->zone_report_buf = NULL; 7172 bdev_nvme_io_complete_nvme_status(bio, cpl); 7173 return; 7174 7175 out_complete_io_ret: 7176 free(bio->zone_report_buf); 7177 bio->zone_report_buf = NULL; 7178 bdev_nvme_io_complete(bio, ret); 7179 } 7180 7181 static void 7182 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7183 { 7184 struct nvme_bdev_io *bio = ref; 7185 7186 bdev_nvme_io_complete_nvme_status(bio, cpl); 7187 } 7188 7189 static void 7190 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7191 { 7192 struct nvme_bdev_io *bio = ctx; 7193 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7194 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7195 7196 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7197 7198 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7199 } 7200 7201 static void 7202 bdev_nvme_abort_complete(void *ctx) 7203 { 7204 struct nvme_bdev_io *bio = ctx; 7205 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7206 7207 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7208 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7209 } else { 7210 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7211 } 7212 } 7213 7214 static void 7215 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7216 { 7217 struct nvme_bdev_io *bio = ref; 7218 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7219 7220 bio->cpl = *cpl; 7221 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7222 } 7223 7224 static void 7225 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7226 { 7227 struct nvme_bdev_io *bio = ref; 7228 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7229 7230 bio->cpl = *cpl; 7231 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7232 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7233 } 7234 7235 static void 7236 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7237 { 7238 struct nvme_bdev_io *bio = ref; 7239 struct iovec *iov; 7240 7241 bio->iov_offset = sgl_offset; 7242 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7243 iov = &bio->iovs[bio->iovpos]; 7244 if (bio->iov_offset < iov->iov_len) { 7245 break; 7246 } 7247 7248 bio->iov_offset -= iov->iov_len; 7249 } 7250 } 7251 7252 static int 7253 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7254 { 7255 struct nvme_bdev_io *bio = ref; 7256 struct iovec *iov; 7257 7258 assert(bio->iovpos < bio->iovcnt); 7259 7260 iov = &bio->iovs[bio->iovpos]; 7261 7262 *address = iov->iov_base; 7263 *length = iov->iov_len; 7264 7265 if (bio->iov_offset) { 7266 assert(bio->iov_offset <= iov->iov_len); 7267 *address += bio->iov_offset; 7268 *length -= bio->iov_offset; 7269 } 7270 7271 bio->iov_offset += *length; 7272 if (bio->iov_offset == iov->iov_len) { 7273 bio->iovpos++; 7274 bio->iov_offset = 0; 7275 } 7276 7277 return 0; 7278 } 7279 7280 static void 7281 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7282 { 7283 struct nvme_bdev_io *bio = ref; 7284 struct iovec *iov; 7285 7286 bio->fused_iov_offset = sgl_offset; 
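/* Editor's note (descriptive comment, not in the original source): the loop below advances through the fused (write-side) iovec array until the requested SGL offset falls inside the current element, mirroring what bdev_nvme_queued_reset_sgl() does for the primary iovec array. */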
7287 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7288 iov = &bio->fused_iovs[bio->fused_iovpos]; 7289 if (bio->fused_iov_offset < iov->iov_len) { 7290 break; 7291 } 7292 7293 bio->fused_iov_offset -= iov->iov_len; 7294 } 7295 } 7296 7297 static int 7298 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7299 { 7300 struct nvme_bdev_io *bio = ref; 7301 struct iovec *iov; 7302 7303 assert(bio->fused_iovpos < bio->fused_iovcnt); 7304 7305 iov = &bio->fused_iovs[bio->fused_iovpos]; 7306 7307 *address = iov->iov_base; 7308 *length = iov->iov_len; 7309 7310 if (bio->fused_iov_offset) { 7311 assert(bio->fused_iov_offset <= iov->iov_len); 7312 *address += bio->fused_iov_offset; 7313 *length -= bio->fused_iov_offset; 7314 } 7315 7316 bio->fused_iov_offset += *length; 7317 if (bio->fused_iov_offset == iov->iov_len) { 7318 bio->fused_iovpos++; 7319 bio->fused_iov_offset = 0; 7320 } 7321 7322 return 0; 7323 } 7324 7325 static int 7326 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7327 void *md, uint64_t lba_count, uint64_t lba) 7328 { 7329 int rc; 7330 7331 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7332 lba_count, lba); 7333 7334 bio->iovs = iov; 7335 bio->iovcnt = iovcnt; 7336 bio->iovpos = 0; 7337 bio->iov_offset = 0; 7338 7339 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7340 bio->io_path->qpair->qpair, 7341 lba, lba_count, 7342 bdev_nvme_no_pi_readv_done, bio, 0, 7343 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7344 md, 0, 0); 7345 7346 if (rc != 0 && rc != -ENOMEM) { 7347 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7348 } 7349 return rc; 7350 } 7351 7352 static int 7353 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7354 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7355 struct spdk_memory_domain *domain, void *domain_ctx, 7356 struct spdk_accel_sequence *seq) 7357 { 7358 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7359 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7360 int rc; 7361 7362 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7363 lba_count, lba); 7364 7365 bio->iovs = iov; 7366 bio->iovcnt = iovcnt; 7367 bio->iovpos = 0; 7368 bio->iov_offset = 0; 7369 7370 if (domain != NULL || seq != NULL) { 7371 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7372 bio->ext_opts.memory_domain = domain; 7373 bio->ext_opts.memory_domain_ctx = domain_ctx; 7374 bio->ext_opts.io_flags = flags; 7375 bio->ext_opts.metadata = md; 7376 bio->ext_opts.accel_sequence = seq; 7377 7378 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7379 bdev_nvme_readv_done, bio, 7380 bdev_nvme_queued_reset_sgl, 7381 bdev_nvme_queued_next_sge, 7382 &bio->ext_opts); 7383 } else if (iovcnt == 1) { 7384 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7385 md, lba, lba_count, bdev_nvme_readv_done, 7386 bio, flags, 0, 0); 7387 } else { 7388 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7389 bdev_nvme_readv_done, bio, flags, 7390 bdev_nvme_queued_reset_sgl, 7391 bdev_nvme_queued_next_sge, md, 0, 0); 7392 } 7393 7394 if (rc != 0 && rc != -ENOMEM) { 7395 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7396 } 7397 return rc; 7398 } 7399 7400 static int 7401 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7402 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7403 
struct spdk_memory_domain *domain, void *domain_ctx, 7404 struct spdk_accel_sequence *seq) 7405 { 7406 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7407 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7408 int rc; 7409 7410 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7411 lba_count, lba); 7412 7413 bio->iovs = iov; 7414 bio->iovcnt = iovcnt; 7415 bio->iovpos = 0; 7416 bio->iov_offset = 0; 7417 7418 if (domain != NULL || seq != NULL) { 7419 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7420 bio->ext_opts.memory_domain = domain; 7421 bio->ext_opts.memory_domain_ctx = domain_ctx; 7422 bio->ext_opts.io_flags = flags; 7423 bio->ext_opts.metadata = md; 7424 bio->ext_opts.accel_sequence = seq; 7425 7426 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7427 bdev_nvme_writev_done, bio, 7428 bdev_nvme_queued_reset_sgl, 7429 bdev_nvme_queued_next_sge, 7430 &bio->ext_opts); 7431 } else if (iovcnt == 1) { 7432 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7433 md, lba, lba_count, bdev_nvme_writev_done, 7434 bio, flags, 0, 0); 7435 } else { 7436 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7437 bdev_nvme_writev_done, bio, flags, 7438 bdev_nvme_queued_reset_sgl, 7439 bdev_nvme_queued_next_sge, md, 0, 0); 7440 } 7441 7442 if (rc != 0 && rc != -ENOMEM) { 7443 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7444 } 7445 return rc; 7446 } 7447 7448 static int 7449 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7450 void *md, uint64_t lba_count, uint64_t zslba, 7451 uint32_t flags) 7452 { 7453 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7454 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7455 int rc; 7456 7457 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7458 lba_count, zslba); 7459 7460 bio->iovs = iov; 7461 bio->iovcnt = iovcnt; 7462 bio->iovpos = 0; 7463 bio->iov_offset = 0; 7464 7465 if (iovcnt == 1) { 7466 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7467 lba_count, 7468 bdev_nvme_zone_appendv_done, bio, 7469 flags, 7470 0, 0); 7471 } else { 7472 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7473 bdev_nvme_zone_appendv_done, bio, flags, 7474 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7475 md, 0, 0); 7476 } 7477 7478 if (rc != 0 && rc != -ENOMEM) { 7479 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7480 } 7481 return rc; 7482 } 7483 7484 static int 7485 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7486 void *md, uint64_t lba_count, uint64_t lba, 7487 uint32_t flags) 7488 { 7489 int rc; 7490 7491 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7492 lba_count, lba); 7493 7494 bio->iovs = iov; 7495 bio->iovcnt = iovcnt; 7496 bio->iovpos = 0; 7497 bio->iov_offset = 0; 7498 7499 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7500 bio->io_path->qpair->qpair, 7501 lba, lba_count, 7502 bdev_nvme_comparev_done, bio, flags, 7503 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7504 md, 0, 0); 7505 7506 if (rc != 0 && rc != -ENOMEM) { 7507 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7508 } 7509 return rc; 7510 } 7511 7512 static int 7513 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7514 struct iovec *write_iov, int write_iovcnt, 7515 void *md, uint64_t lba_count, uint64_t lba, uint32_t 
flags) 7516 { 7517 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7518 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7519 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7520 int rc; 7521 7522 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7523 lba_count, lba); 7524 7525 bio->iovs = cmp_iov; 7526 bio->iovcnt = cmp_iovcnt; 7527 bio->iovpos = 0; 7528 bio->iov_offset = 0; 7529 bio->fused_iovs = write_iov; 7530 bio->fused_iovcnt = write_iovcnt; 7531 bio->fused_iovpos = 0; 7532 bio->fused_iov_offset = 0; 7533 7534 if (bdev_io->num_retries == 0) { 7535 bio->first_fused_submitted = false; 7536 bio->first_fused_completed = false; 7537 } 7538 7539 if (!bio->first_fused_submitted) { 7540 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7541 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7542 7543 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7544 bdev_nvme_comparev_and_writev_done, bio, flags, 7545 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7546 if (rc == 0) { 7547 bio->first_fused_submitted = true; 7548 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7549 } else { 7550 if (rc != -ENOMEM) { 7551 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7552 } 7553 return rc; 7554 } 7555 } 7556 7557 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7558 7559 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7560 bdev_nvme_comparev_and_writev_done, bio, flags, 7561 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7562 if (rc != 0 && rc != -ENOMEM) { 7563 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7564 rc = 0; 7565 } 7566 7567 return rc; 7568 } 7569 7570 static int 7571 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7572 { 7573 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7574 struct spdk_nvme_dsm_range *range; 7575 uint64_t offset, remaining; 7576 uint64_t num_ranges_u64; 7577 uint16_t num_ranges; 7578 int rc; 7579 7580 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7581 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7582 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7583 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7584 return -EINVAL; 7585 } 7586 num_ranges = (uint16_t)num_ranges_u64; 7587 7588 offset = offset_blocks; 7589 remaining = num_blocks; 7590 range = &dsm_ranges[0]; 7591 7592 /* Fill max-size ranges until the remaining blocks fit into one range */ 7593 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7594 range->attributes.raw = 0; 7595 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7596 range->starting_lba = offset; 7597 7598 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7599 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7600 range++; 7601 } 7602 7603 /* Final range describes the remaining blocks */ 7604 range->attributes.raw = 0; 7605 range->length = remaining; 7606 range->starting_lba = offset; 7607 7608 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7609 bio->io_path->qpair->qpair, 7610 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7611 dsm_ranges, num_ranges, 7612 bdev_nvme_queued_done, bio); 7613 7614 return rc; 7615 } 7616 7617 static int 7618 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7619 { 7620 if (num_blocks > UINT16_MAX + 1) { 7621 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 
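/* Editor's note (descriptive comment, not in the original source): the Write Zeroes command encodes the block count in a 16-bit, 0's-based NLB field, so a single command can clear at most UINT16_MAX + 1 (65536) blocks; larger requests are rejected here. */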
7622 return -EINVAL; 7623 } 7624 7625 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7626 bio->io_path->qpair->qpair, 7627 offset_blocks, num_blocks, 7628 bdev_nvme_queued_done, bio, 7629 0); 7630 } 7631 7632 static int 7633 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7634 struct spdk_bdev_zone_info *info) 7635 { 7636 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7637 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7638 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7639 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7640 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7641 7642 if (zone_id % zone_size != 0) { 7643 return -EINVAL; 7644 } 7645 7646 if (num_zones > total_zones || !num_zones) { 7647 return -EINVAL; 7648 } 7649 7650 assert(!bio->zone_report_buf); 7651 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7652 if (!bio->zone_report_buf) { 7653 return -ENOMEM; 7654 } 7655 7656 bio->handled_zones = 0; 7657 7658 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7659 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7660 bdev_nvme_get_zone_info_done, bio); 7661 } 7662 7663 static int 7664 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 7665 enum spdk_bdev_zone_action action) 7666 { 7667 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7668 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7669 7670 switch (action) { 7671 case SPDK_BDEV_ZONE_CLOSE: 7672 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 7673 bdev_nvme_zone_management_done, bio); 7674 case SPDK_BDEV_ZONE_FINISH: 7675 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 7676 bdev_nvme_zone_management_done, bio); 7677 case SPDK_BDEV_ZONE_OPEN: 7678 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 7679 bdev_nvme_zone_management_done, bio); 7680 case SPDK_BDEV_ZONE_RESET: 7681 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 7682 bdev_nvme_zone_management_done, bio); 7683 case SPDK_BDEV_ZONE_OFFLINE: 7684 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 7685 bdev_nvme_zone_management_done, bio); 7686 default: 7687 return -EINVAL; 7688 } 7689 } 7690 7691 static void 7692 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7693 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 7694 { 7695 struct nvme_io_path *io_path; 7696 struct nvme_ctrlr *nvme_ctrlr; 7697 uint32_t max_xfer_size; 7698 int rc = -ENXIO; 7699 7700 /* Choose the first ctrlr which is not failed. */ 7701 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7702 nvme_ctrlr = io_path->qpair->ctrlr; 7703 7704 /* We should skip any unavailable nvme_ctrlr rather than checking 7705 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first ctrlr which is not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
						  (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}

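/* Abort strategy: first try to cancel the target I/O while it is still sitting on the
 * channel's retry queue, which completes the abort without touching the controller.
 * Otherwise send an NVMe Abort command on the qpair the I/O was submitted on, or, if
 * that path is unknown, try each I/O path in the channel until one recognizes the
 * command.
 */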
static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}

static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

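/* Emit a bdev_nvme_set_options RPC that reproduces the currently active global options.
 * Illustrative shape of the output (values depend on runtime configuration; the numbers
 * below are simply the compiled-in defaults shown for orientation):
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": {
 *       "action_on_timeout": "none",
 *       "timeout_us": 0,
 *       "keep_alive_timeout_ms": 10000,
 *       ...
 *     }
 *   }
 */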
static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

#ifdef SPDK_CONFIG_NVME_CUSE
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
					  cuse_name, &cuse_name_size) != 0) {
		return;
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
#endif

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);
	if (nvme_ctrlr->opts.psk_path[0] != '\0') {
		spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk_path);
	}

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

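/* Module-level config_json callback (presumably registered through the module's
 * spdk_bdev_module definition elsewhere in this file). Replays the runtime state as a
 * sequence of RPCs: global options first, then one bdev_nvme_attach_controller per
 * explicitly attached controller (plus CUSE registration where enabled), then discovery
 * services, and finally the hotplug settings.
 */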
static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);

#ifdef SPDK_CONFIG_NVME_CUSE
			nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
#endif
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump this RPC last so that all NVMe bdevs have a chance to be constructed
	 * before the hotplug poller is enabled.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

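/* Dump one I/O path as a JSON object. Illustrative shape of the output (the bdev name
 * and transport values below are placeholders, not produced by this file; actual values
 * come from the controller and transport in use):
 *
 *   {
 *     "bdev_name": "Nvme0n1",
 *     "cntlid": 1,
 *     "current": true,
 *     "connected": true,
 *     "accessible": true,
 *     "transport": { "trtype": "TCP", "traddr": "127.0.0.1", "trsvcid": "4420", "adrfam": "IPv4" }
 *   }
 */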
void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}