1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/likely.h" 18 #include "spdk/nvme.h" 19 #include "spdk/nvme_ocssd.h" 20 #include "spdk/nvme_zns.h" 21 #include "spdk/opal.h" 22 #include "spdk/thread.h" 23 #include "spdk/trace.h" 24 #include "spdk/string.h" 25 #include "spdk/util.h" 26 #include "spdk/uuid.h" 27 28 #include "spdk/bdev_module.h" 29 #include "spdk/log.h" 30 31 #include "spdk_internal/usdt.h" 32 #include "spdk_internal/trace_defs.h" 33 34 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 35 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 36 37 #define NSID_STR_LEN 10 38 39 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 40 41 struct nvme_bdev_io { 42 /** array of iovecs to transfer. */ 43 struct iovec *iovs; 44 45 /** Number of iovecs in iovs array. */ 46 int iovcnt; 47 48 /** Current iovec position. */ 49 int iovpos; 50 51 /** Offset in current iovec. */ 52 uint32_t iov_offset; 53 54 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 55 * being reset in a reset I/O. 56 */ 57 struct nvme_io_path *io_path; 58 59 /** array of iovecs to transfer. */ 60 struct iovec *fused_iovs; 61 62 /** Number of iovecs in iovs array. */ 63 int fused_iovcnt; 64 65 /** Current iovec position. */ 66 int fused_iovpos; 67 68 /** Offset in current iovec. */ 69 uint32_t fused_iov_offset; 70 71 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 72 struct spdk_nvme_cpl cpl; 73 74 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 75 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 76 77 /** Keeps track if first of fused commands was submitted */ 78 bool first_fused_submitted; 79 80 /** Keeps track if first of fused commands was completed */ 81 bool first_fused_completed; 82 83 /** Temporary pointer to zone report buffer */ 84 struct spdk_nvme_zns_zone_report *zone_report_buf; 85 86 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 87 uint64_t handled_zones; 88 89 /** Expiration value in ticks to retry the current I/O. */ 90 uint64_t retry_ticks; 91 92 /* How many times the current I/O was retried. */ 93 int32_t retry_count; 94 95 /* Current tsc at submit time. 
*/ 96 uint64_t submit_tsc; 97 }; 98 99 struct nvme_probe_skip_entry { 100 struct spdk_nvme_transport_id trid; 101 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 102 }; 103 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 104 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 105 g_skipped_nvme_ctrlrs); 106 107 static struct spdk_bdev_nvme_opts g_opts = { 108 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 109 .timeout_us = 0, 110 .timeout_admin_us = 0, 111 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 112 .transport_retry_count = 4, 113 .arbitration_burst = 0, 114 .low_priority_weight = 0, 115 .medium_priority_weight = 0, 116 .high_priority_weight = 0, 117 .nvme_adminq_poll_period_us = 10000ULL, 118 .nvme_ioq_poll_period_us = 0, 119 .io_queue_requests = 0, 120 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 121 .bdev_retry_count = 3, 122 .transport_ack_timeout = 0, 123 .ctrlr_loss_timeout_sec = 0, 124 .reconnect_delay_sec = 0, 125 .fast_io_fail_timeout_sec = 0, 126 .disable_auto_failback = false, 127 .generate_uuids = false, 128 .transport_tos = 0, 129 .nvme_error_stat = false, 130 .io_path_stat = false, 131 }; 132 133 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 134 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 135 136 static int g_hot_insert_nvme_controller_index = 0; 137 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 138 static bool g_nvme_hotplug_enabled = false; 139 struct spdk_thread *g_bdev_nvme_init_thread; 140 static struct spdk_poller *g_hotplug_poller; 141 static struct spdk_poller *g_hotplug_probe_poller; 142 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 143 144 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 145 struct nvme_async_probe_ctx *ctx); 146 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 147 struct nvme_async_probe_ctx *ctx); 148 static int bdev_nvme_library_init(void); 149 static void bdev_nvme_library_fini(void); 150 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 151 struct spdk_bdev_io *bdev_io); 152 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 153 struct spdk_bdev_io *bdev_io); 154 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 155 void *md, uint64_t lba_count, uint64_t lba, 156 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx); 157 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 158 void *md, uint64_t lba_count, uint64_t lba); 159 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 160 void *md, uint64_t lba_count, uint64_t lba, 161 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx); 162 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 163 void *md, uint64_t lba_count, 164 uint64_t zslba, uint32_t flags); 165 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 166 void *md, uint64_t lba_count, uint64_t lba, 167 uint32_t flags); 168 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 169 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 170 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 171 uint32_t flags); 172 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 173 uint32_t num_zones, struct 
spdk_bdev_zone_info *info); 174 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 175 enum spdk_bdev_zone_action action); 176 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 177 struct nvme_bdev_io *bio, 178 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 179 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 180 void *buf, size_t nbytes); 181 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 182 void *buf, size_t nbytes, void *md_buf, size_t md_len); 183 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 184 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 185 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 186 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 187 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove); 188 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 189 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 190 191 static struct nvme_ns *nvme_ns_alloc(void); 192 static void nvme_ns_free(struct nvme_ns *ns); 193 194 static int 195 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 196 { 197 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 198 } 199 200 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 201 202 struct spdk_nvme_qpair * 203 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 204 { 205 struct nvme_ctrlr_channel *ctrlr_ch; 206 207 assert(ctrlr_io_ch != NULL); 208 209 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 210 211 return ctrlr_ch->qpair->qpair; 212 } 213 214 static int 215 bdev_nvme_get_ctx_size(void) 216 { 217 return sizeof(struct nvme_bdev_io); 218 } 219 220 static struct spdk_bdev_module nvme_if = { 221 .name = "nvme", 222 .async_fini = true, 223 .module_init = bdev_nvme_library_init, 224 .module_fini = bdev_nvme_library_fini, 225 .config_json = bdev_nvme_config_json, 226 .get_ctx_size = bdev_nvme_get_ctx_size, 227 228 }; 229 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 230 231 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 232 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 233 bool g_bdev_nvme_module_finish; 234 235 struct nvme_bdev_ctrlr * 236 nvme_bdev_ctrlr_get_by_name(const char *name) 237 { 238 struct nvme_bdev_ctrlr *nbdev_ctrlr; 239 240 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 241 if (strcmp(name, nbdev_ctrlr->name) == 0) { 242 break; 243 } 244 } 245 246 return nbdev_ctrlr; 247 } 248 249 static struct nvme_ctrlr * 250 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 251 const struct spdk_nvme_transport_id *trid) 252 { 253 struct nvme_ctrlr *nvme_ctrlr; 254 255 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 256 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 257 break; 258 } 259 } 260 261 return nvme_ctrlr; 262 } 263 264 struct nvme_ctrlr * 265 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 266 uint16_t cntlid) 267 { 268 struct nvme_ctrlr *nvme_ctrlr; 269 const struct spdk_nvme_ctrlr_data *cdata; 270 271 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 272 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 273 if (cdata->cntlid == cntlid) { 274 break; 275 } 276 } 277 278 return nvme_ctrlr; 279 } 280 281 static struct nvme_bdev * 282 nvme_bdev_ctrlr_get_bdev(struct 
nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 283 { 284 struct nvme_bdev *bdev; 285 286 pthread_mutex_lock(&g_bdev_nvme_mutex); 287 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 288 if (bdev->nsid == nsid) { 289 break; 290 } 291 } 292 pthread_mutex_unlock(&g_bdev_nvme_mutex); 293 294 return bdev; 295 } 296 297 struct nvme_ns * 298 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 299 { 300 struct nvme_ns ns; 301 302 assert(nsid > 0); 303 304 ns.id = nsid; 305 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 306 } 307 308 struct nvme_ns * 309 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 310 { 311 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 312 } 313 314 struct nvme_ns * 315 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 316 { 317 if (ns == NULL) { 318 return NULL; 319 } 320 321 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 322 } 323 324 static struct nvme_ctrlr * 325 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 326 { 327 struct nvme_bdev_ctrlr *nbdev_ctrlr; 328 struct nvme_ctrlr *nvme_ctrlr = NULL; 329 330 pthread_mutex_lock(&g_bdev_nvme_mutex); 331 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 332 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 333 if (nvme_ctrlr != NULL) { 334 break; 335 } 336 } 337 pthread_mutex_unlock(&g_bdev_nvme_mutex); 338 339 return nvme_ctrlr; 340 } 341 342 struct nvme_ctrlr * 343 nvme_ctrlr_get_by_name(const char *name) 344 { 345 struct nvme_bdev_ctrlr *nbdev_ctrlr; 346 struct nvme_ctrlr *nvme_ctrlr = NULL; 347 348 if (name == NULL) { 349 return NULL; 350 } 351 352 pthread_mutex_lock(&g_bdev_nvme_mutex); 353 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 354 if (nbdev_ctrlr != NULL) { 355 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 356 } 357 pthread_mutex_unlock(&g_bdev_nvme_mutex); 358 359 return nvme_ctrlr; 360 } 361 362 void 363 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 364 { 365 struct nvme_bdev_ctrlr *nbdev_ctrlr; 366 367 pthread_mutex_lock(&g_bdev_nvme_mutex); 368 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 369 fn(nbdev_ctrlr, ctx); 370 } 371 pthread_mutex_unlock(&g_bdev_nvme_mutex); 372 } 373 374 void 375 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 376 { 377 const char *trtype_str; 378 const char *adrfam_str; 379 380 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 381 if (trtype_str) { 382 spdk_json_write_named_string(w, "trtype", trtype_str); 383 } 384 385 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 386 if (adrfam_str) { 387 spdk_json_write_named_string(w, "adrfam", adrfam_str); 388 } 389 390 if (trid->traddr[0] != '\0') { 391 spdk_json_write_named_string(w, "traddr", trid->traddr); 392 } 393 394 if (trid->trsvcid[0] != '\0') { 395 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 396 } 397 398 if (trid->subnqn[0] != '\0') { 399 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 400 } 401 } 402 403 static void 404 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 405 struct nvme_ctrlr *nvme_ctrlr) 406 { 407 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 408 pthread_mutex_lock(&g_bdev_nvme_mutex); 409 410 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 411 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 412 pthread_mutex_unlock(&g_bdev_nvme_mutex); 413 414 return; 415 } 416 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 417 418 
pthread_mutex_unlock(&g_bdev_nvme_mutex); 419 420 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 421 422 free(nbdev_ctrlr->name); 423 free(nbdev_ctrlr); 424 } 425 426 static void 427 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 428 { 429 struct nvme_path_id *path_id, *tmp_path; 430 struct nvme_ns *ns, *tmp_ns; 431 432 free(nvme_ctrlr->copied_ana_desc); 433 spdk_free(nvme_ctrlr->ana_log_page); 434 435 if (nvme_ctrlr->opal_dev) { 436 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 437 nvme_ctrlr->opal_dev = NULL; 438 } 439 440 if (nvme_ctrlr->nbdev_ctrlr) { 441 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 442 } 443 444 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 445 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 446 nvme_ns_free(ns); 447 } 448 449 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 450 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 451 free(path_id); 452 } 453 454 pthread_mutex_destroy(&nvme_ctrlr->mutex); 455 456 free(nvme_ctrlr); 457 458 pthread_mutex_lock(&g_bdev_nvme_mutex); 459 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 460 pthread_mutex_unlock(&g_bdev_nvme_mutex); 461 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 462 spdk_bdev_module_fini_done(); 463 return; 464 } 465 pthread_mutex_unlock(&g_bdev_nvme_mutex); 466 } 467 468 static int 469 nvme_detach_poller(void *arg) 470 { 471 struct nvme_ctrlr *nvme_ctrlr = arg; 472 int rc; 473 474 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 475 if (rc != -EAGAIN) { 476 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 477 _nvme_ctrlr_delete(nvme_ctrlr); 478 } 479 480 return SPDK_POLLER_BUSY; 481 } 482 483 static void 484 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 485 { 486 int rc; 487 488 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 489 490 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 491 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 492 493 /* If we got here, the reset/detach poller cannot be active */ 494 assert(nvme_ctrlr->reset_detach_poller == NULL); 495 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 496 nvme_ctrlr, 1000); 497 if (nvme_ctrlr->reset_detach_poller == NULL) { 498 SPDK_ERRLOG("Failed to register detach poller\n"); 499 goto error; 500 } 501 502 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 503 if (rc != 0) { 504 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 505 goto error; 506 } 507 508 return; 509 error: 510 /* We don't have a good way to handle errors here, so just do what we can and delete the 511 * controller without detaching the underlying NVMe device. 
512 */ 513 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 514 _nvme_ctrlr_delete(nvme_ctrlr); 515 } 516 517 static void 518 nvme_ctrlr_unregister_cb(void *io_device) 519 { 520 struct nvme_ctrlr *nvme_ctrlr = io_device; 521 522 nvme_ctrlr_delete(nvme_ctrlr); 523 } 524 525 static void 526 nvme_ctrlr_unregister(void *ctx) 527 { 528 struct nvme_ctrlr *nvme_ctrlr = ctx; 529 530 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 531 } 532 533 static bool 534 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 535 { 536 if (!nvme_ctrlr->destruct) { 537 return false; 538 } 539 540 if (nvme_ctrlr->ref > 0) { 541 return false; 542 } 543 544 if (nvme_ctrlr->resetting) { 545 return false; 546 } 547 548 if (nvme_ctrlr->ana_log_page_updating) { 549 return false; 550 } 551 552 if (nvme_ctrlr->io_path_cache_clearing) { 553 return false; 554 } 555 556 return true; 557 } 558 559 static void 560 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 561 { 562 pthread_mutex_lock(&nvme_ctrlr->mutex); 563 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 564 565 assert(nvme_ctrlr->ref > 0); 566 nvme_ctrlr->ref--; 567 568 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 569 pthread_mutex_unlock(&nvme_ctrlr->mutex); 570 return; 571 } 572 573 pthread_mutex_unlock(&nvme_ctrlr->mutex); 574 575 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 576 } 577 578 static void 579 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 580 { 581 nbdev_ch->current_io_path = NULL; 582 nbdev_ch->rr_counter = 0; 583 } 584 585 static struct nvme_io_path * 586 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 587 { 588 struct nvme_io_path *io_path; 589 590 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 591 if (io_path->nvme_ns == nvme_ns) { 592 break; 593 } 594 } 595 596 return io_path; 597 } 598 599 static struct nvme_io_path * 600 nvme_io_path_alloc(void) 601 { 602 struct nvme_io_path *io_path; 603 604 io_path = calloc(1, sizeof(*io_path)); 605 if (io_path == NULL) { 606 SPDK_ERRLOG("Failed to alloc io_path.\n"); 607 return NULL; 608 } 609 610 if (g_opts.io_path_stat) { 611 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 612 if (io_path->stat == NULL) { 613 free(io_path); 614 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 615 return NULL; 616 } 617 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 618 } 619 620 return io_path; 621 } 622 623 static void 624 nvme_io_path_free(struct nvme_io_path *io_path) 625 { 626 free(io_path->stat); 627 free(io_path); 628 } 629 630 static int 631 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 632 { 633 struct nvme_io_path *io_path; 634 struct spdk_io_channel *ch; 635 struct nvme_ctrlr_channel *ctrlr_ch; 636 struct nvme_qpair *nvme_qpair; 637 638 io_path = nvme_io_path_alloc(); 639 if (io_path == NULL) { 640 return -ENOMEM; 641 } 642 643 io_path->nvme_ns = nvme_ns; 644 645 ch = spdk_get_io_channel(nvme_ns->ctrlr); 646 if (ch == NULL) { 647 nvme_io_path_free(io_path); 648 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 649 return -ENOMEM; 650 } 651 652 ctrlr_ch = spdk_io_channel_get_ctx(ch); 653 654 nvme_qpair = ctrlr_ch->qpair; 655 assert(nvme_qpair != NULL); 656 657 io_path->qpair = nvme_qpair; 658 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 659 660 io_path->nbdev_ch = nbdev_ch; 661 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 662 663 
bdev_nvme_clear_current_io_path(nbdev_ch); 664 665 return 0; 666 } 667 668 static void 669 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 670 struct nvme_io_path *io_path) 671 { 672 struct spdk_bdev_io *bdev_io; 673 struct nvme_bdev_io *bio; 674 675 TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) { 676 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 677 if (bio->io_path == io_path) { 678 bio->io_path = NULL; 679 } 680 } 681 } 682 683 static void 684 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 685 { 686 struct spdk_io_channel *ch; 687 struct nvme_qpair *nvme_qpair; 688 struct nvme_ctrlr_channel *ctrlr_ch; 689 struct nvme_bdev *nbdev; 690 691 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 692 693 /* Add the statistics to nvme_ns before this path is destroyed. */ 694 pthread_mutex_lock(&nbdev->mutex); 695 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 696 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 697 } 698 pthread_mutex_unlock(&nbdev->mutex); 699 700 bdev_nvme_clear_current_io_path(nbdev_ch); 701 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 702 703 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 704 io_path->nbdev_ch = NULL; 705 706 nvme_qpair = io_path->qpair; 707 assert(nvme_qpair != NULL); 708 709 ctrlr_ch = nvme_qpair->ctrlr_ch; 710 assert(ctrlr_ch != NULL); 711 712 ch = spdk_io_channel_from_ctx(ctrlr_ch); 713 spdk_put_io_channel(ch); 714 715 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 716 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 717 * io_path here but free the io_path when the associated qpair is freed. It is ensured 718 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 719 */ 720 } 721 722 static void 723 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 724 { 725 struct nvme_io_path *io_path, *tmp_io_path; 726 727 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 728 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 729 } 730 } 731 732 static int 733 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 734 { 735 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 736 struct nvme_bdev *nbdev = io_device; 737 struct nvme_ns *nvme_ns; 738 int rc; 739 740 STAILQ_INIT(&nbdev_ch->io_path_list); 741 TAILQ_INIT(&nbdev_ch->retry_io_list); 742 743 pthread_mutex_lock(&nbdev->mutex); 744 745 nbdev_ch->mp_policy = nbdev->mp_policy; 746 nbdev_ch->mp_selector = nbdev->mp_selector; 747 nbdev_ch->rr_min_io = nbdev->rr_min_io; 748 749 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 750 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 751 if (rc != 0) { 752 pthread_mutex_unlock(&nbdev->mutex); 753 754 _bdev_nvme_delete_io_paths(nbdev_ch); 755 return rc; 756 } 757 } 758 pthread_mutex_unlock(&nbdev->mutex); 759 760 return 0; 761 } 762 763 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 764 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
765 */ 766 static inline void 767 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 768 const struct spdk_nvme_cpl *cpl) 769 { 770 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 771 (uintptr_t)bdev_io); 772 if (cpl) { 773 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 774 } else { 775 spdk_bdev_io_complete(bdev_io, status); 776 } 777 } 778 779 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 780 781 static void 782 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 783 { 784 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 785 786 bdev_nvme_abort_retry_ios(nbdev_ch); 787 _bdev_nvme_delete_io_paths(nbdev_ch); 788 } 789 790 static inline bool 791 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 792 { 793 switch (io_type) { 794 case SPDK_BDEV_IO_TYPE_RESET: 795 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 796 case SPDK_BDEV_IO_TYPE_ABORT: 797 return true; 798 default: 799 break; 800 } 801 802 return false; 803 } 804 805 static inline bool 806 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 807 { 808 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 809 return false; 810 } 811 812 switch (nvme_ns->ana_state) { 813 case SPDK_NVME_ANA_OPTIMIZED_STATE: 814 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 815 return true; 816 default: 817 break; 818 } 819 820 return false; 821 } 822 823 static inline bool 824 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 825 { 826 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 827 return false; 828 } 829 830 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 831 SPDK_NVME_QPAIR_FAILURE_NONE)) { 832 return false; 833 } 834 835 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 836 return false; 837 } 838 839 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_qpair->ctrlr->ctrlr) != 840 SPDK_NVME_QPAIR_FAILURE_NONE) { 841 return false; 842 } 843 844 return true; 845 } 846 847 static inline bool 848 nvme_io_path_is_available(struct nvme_io_path *io_path) 849 { 850 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 851 return false; 852 } 853 854 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 855 return false; 856 } 857 858 return true; 859 } 860 861 static inline bool 862 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 863 { 864 if (nvme_ctrlr->destruct) { 865 return true; 866 } 867 868 if (nvme_ctrlr->fast_io_fail_timedout) { 869 return true; 870 } 871 872 if (nvme_ctrlr->resetting) { 873 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 874 return false; 875 } else { 876 return true; 877 } 878 } 879 880 if (nvme_ctrlr->reconnect_is_delayed) { 881 return false; 882 } 883 884 if (nvme_ctrlr->disabled) { 885 return true; 886 } 887 888 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 889 return true; 890 } else { 891 return false; 892 } 893 } 894 895 static bool 896 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 897 { 898 if (nvme_ctrlr->destruct) { 899 return false; 900 } 901 902 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 903 return false; 904 } 905 906 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 907 return false; 908 } 909 910 if (nvme_ctrlr->disabled) { 911 return false; 912 } 913 914 return true; 915 } 916 917 /* Simulate circular linked list. 
*/ 918 static inline struct nvme_io_path * 919 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 920 { 921 struct nvme_io_path *next_path; 922 923 if (prev_path != NULL) { 924 next_path = STAILQ_NEXT(prev_path, stailq); 925 if (next_path != NULL) { 926 return next_path; 927 } 928 } 929 930 return STAILQ_FIRST(&nbdev_ch->io_path_list); 931 } 932 933 static struct nvme_io_path * 934 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 935 { 936 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 937 938 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 939 940 io_path = start; 941 do { 942 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 943 !io_path->nvme_ns->ana_state_updating)) { 944 switch (io_path->nvme_ns->ana_state) { 945 case SPDK_NVME_ANA_OPTIMIZED_STATE: 946 nbdev_ch->current_io_path = io_path; 947 return io_path; 948 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 949 if (non_optimized == NULL) { 950 non_optimized = io_path; 951 } 952 break; 953 default: 954 break; 955 } 956 } 957 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 958 } while (io_path != start); 959 960 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 961 /* We come here only if there is no optimized path. Cache even non_optimized 962 * path for load balance across multiple non_optimized paths. 963 */ 964 nbdev_ch->current_io_path = non_optimized; 965 } 966 967 return non_optimized; 968 } 969 970 static struct nvme_io_path * 971 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 972 { 973 struct nvme_io_path *io_path; 974 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 975 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 976 uint32_t num_outstanding_reqs; 977 978 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 979 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 980 /* The device is currently resetting. 
*/ 981 continue; 982 } 983 984 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 985 continue; 986 } 987 988 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 989 switch (io_path->nvme_ns->ana_state) { 990 case SPDK_NVME_ANA_OPTIMIZED_STATE: 991 if (num_outstanding_reqs < opt_min_qd) { 992 opt_min_qd = num_outstanding_reqs; 993 optimized = io_path; 994 } 995 break; 996 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 997 if (num_outstanding_reqs < non_opt_min_qd) { 998 non_opt_min_qd = num_outstanding_reqs; 999 non_optimized = io_path; 1000 } 1001 break; 1002 default: 1003 break; 1004 } 1005 } 1006 1007 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1008 if (optimized != NULL) { 1009 return optimized; 1010 } 1011 1012 return non_optimized; 1013 } 1014 1015 static inline struct nvme_io_path * 1016 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1017 { 1018 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1019 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1020 return nbdev_ch->current_io_path; 1021 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1022 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1023 return nbdev_ch->current_io_path; 1024 } 1025 nbdev_ch->rr_counter = 0; 1026 } 1027 } 1028 1029 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1030 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1031 return _bdev_nvme_find_io_path(nbdev_ch); 1032 } else { 1033 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1034 } 1035 } 1036 1037 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1038 * or false otherwise. 1039 * 1040 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1041 * is likely to be non-accessible now but may become accessible. 1042 * 1043 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1044 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1045 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1046 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1047 */ 1048 static bool 1049 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1050 { 1051 struct nvme_io_path *io_path; 1052 1053 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1054 if (io_path->nvme_ns->ana_transition_timedout) { 1055 continue; 1056 } 1057 1058 if (nvme_qpair_is_connected(io_path->qpair) || 1059 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1060 return true; 1061 } 1062 } 1063 1064 return false; 1065 } 1066 1067 static void 1068 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1069 { 1070 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1071 struct spdk_io_channel *ch; 1072 1073 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1074 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1075 } else { 1076 ch = spdk_io_channel_from_ctx(nbdev_ch); 1077 bdev_nvme_submit_request(ch, bdev_io); 1078 } 1079 } 1080 1081 static int 1082 bdev_nvme_retry_ios(void *arg) 1083 { 1084 struct nvme_bdev_channel *nbdev_ch = arg; 1085 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1086 struct nvme_bdev_io *bio; 1087 uint64_t now, delay_us; 1088 1089 now = spdk_get_ticks(); 1090 1091 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1092 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1093 if (bio->retry_ticks > now) { 1094 break; 1095 } 1096 1097 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1098 1099 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1100 } 1101 1102 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1103 1104 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1105 if (bdev_io != NULL) { 1106 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1107 1108 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1109 1110 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1111 delay_us); 1112 } 1113 1114 return SPDK_POLLER_BUSY; 1115 } 1116 1117 static void 1118 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1119 struct nvme_bdev_io *bio, uint64_t delay_ms) 1120 { 1121 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1122 struct spdk_bdev_io *tmp_bdev_io; 1123 struct nvme_bdev_io *tmp_bio; 1124 1125 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1126 1127 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1128 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1129 1130 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1131 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1132 module_link); 1133 return; 1134 } 1135 } 1136 1137 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1138 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1139 1140 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1141 1142 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1143 delay_ms * 1000ULL); 1144 } 1145 1146 static void 1147 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1148 { 1149 struct spdk_bdev_io *bdev_io, *tmp_io; 1150 1151 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1152 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1153 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1154 } 1155 1156 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1157 } 1158 1159 static int 1160 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1161 struct nvme_bdev_io *bio_to_abort) 1162 { 1163 struct spdk_bdev_io *bdev_io_to_abort; 1164 1165 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1166 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1167 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1168 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1169 return 0; 1170 } 1171 } 1172 1173 return -ENOENT; 1174 } 1175 1176 static void 1177 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1178 { 1179 struct nvme_bdev *nbdev; 1180 uint16_t sct, sc; 1181 1182 assert(spdk_nvme_cpl_is_error(cpl)); 1183 1184 nbdev = bdev_io->bdev->ctxt; 1185 1186 if (nbdev->err_stat == NULL) { 1187 return; 1188 } 1189 1190 sct = cpl->status.sct; 1191 sc = cpl->status.sc; 1192 1193 pthread_mutex_lock(&nbdev->mutex); 1194 1195 nbdev->err_stat->status_type[sct]++; 1196 switch (sct) { 1197 case SPDK_NVME_SCT_GENERIC: 1198 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1199 case SPDK_NVME_SCT_MEDIA_ERROR: 1200 case SPDK_NVME_SCT_PATH: 1201 nbdev->err_stat->status[sct][sc]++; 1202 break; 1203 default: 1204 break; 1205 } 1206 1207 pthread_mutex_unlock(&nbdev->mutex); 1208 } 1209 1210 static inline void 1211 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1212 { 1213 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1214 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1215 uint32_t blocklen = bdev_io->bdev->blocklen; 1216 struct spdk_bdev_io_stat *stat; 1217 uint64_t tsc_diff; 1218 1219 if (bio->io_path->stat == NULL) { 1220 return; 1221 } 1222 1223 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1224 stat = bio->io_path->stat; 1225 1226 switch (bdev_io->type) { 1227 case SPDK_BDEV_IO_TYPE_READ: 1228 stat->bytes_read += num_blocks * blocklen; 1229 stat->num_read_ops++; 1230 stat->read_latency_ticks += tsc_diff; 1231 if (stat->max_read_latency_ticks < tsc_diff) { 1232 stat->max_read_latency_ticks = tsc_diff; 1233 } 1234 if (stat->min_read_latency_ticks > tsc_diff) { 1235 stat->min_read_latency_ticks = tsc_diff; 1236 } 1237 break; 1238 case SPDK_BDEV_IO_TYPE_WRITE: 1239 stat->bytes_written += num_blocks * blocklen; 1240 stat->num_write_ops++; 1241 stat->write_latency_ticks += tsc_diff; 1242 if (stat->max_write_latency_ticks < tsc_diff) { 1243 stat->max_write_latency_ticks = tsc_diff; 1244 } 1245 if (stat->min_write_latency_ticks > tsc_diff) { 1246 stat->min_write_latency_ticks = tsc_diff; 1247 } 1248 break; 1249 case SPDK_BDEV_IO_TYPE_UNMAP: 1250 stat->bytes_unmapped += num_blocks * blocklen; 1251 stat->num_unmap_ops++; 1252 stat->unmap_latency_ticks += tsc_diff; 1253 if (stat->max_unmap_latency_ticks < tsc_diff) { 1254 
stat->max_unmap_latency_ticks = tsc_diff; 1255 } 1256 if (stat->min_unmap_latency_ticks > tsc_diff) { 1257 stat->min_unmap_latency_ticks = tsc_diff; 1258 } 1259 break; 1260 case SPDK_BDEV_IO_TYPE_ZCOPY: 1261 /* Track the data in the start phase only */ 1262 if (!bdev_io->u.bdev.zcopy.start) { 1263 break; 1264 } 1265 if (bdev_io->u.bdev.zcopy.populate) { 1266 stat->bytes_read += num_blocks * blocklen; 1267 stat->num_read_ops++; 1268 stat->read_latency_ticks += tsc_diff; 1269 if (stat->max_read_latency_ticks < tsc_diff) { 1270 stat->max_read_latency_ticks = tsc_diff; 1271 } 1272 if (stat->min_read_latency_ticks > tsc_diff) { 1273 stat->min_read_latency_ticks = tsc_diff; 1274 } 1275 } else { 1276 stat->bytes_written += num_blocks * blocklen; 1277 stat->num_write_ops++; 1278 stat->write_latency_ticks += tsc_diff; 1279 if (stat->max_write_latency_ticks < tsc_diff) { 1280 stat->max_write_latency_ticks = tsc_diff; 1281 } 1282 if (stat->min_write_latency_ticks > tsc_diff) { 1283 stat->min_write_latency_ticks = tsc_diff; 1284 } 1285 } 1286 break; 1287 case SPDK_BDEV_IO_TYPE_COPY: 1288 stat->bytes_copied += num_blocks * blocklen; 1289 stat->num_copy_ops++; 1290 stat->copy_latency_ticks += tsc_diff; 1291 if (stat->max_copy_latency_ticks < tsc_diff) { 1292 stat->max_copy_latency_ticks = tsc_diff; 1293 } 1294 if (stat->min_copy_latency_ticks > tsc_diff) { 1295 stat->min_copy_latency_ticks = tsc_diff; 1296 } 1297 break; 1298 default: 1299 break; 1300 } 1301 } 1302 1303 static bool 1304 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1305 const struct spdk_nvme_cpl *cpl, 1306 struct nvme_bdev_channel *nbdev_ch, 1307 uint64_t *_delay_ms) 1308 { 1309 struct nvme_io_path *io_path = bio->io_path; 1310 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1311 const struct spdk_nvme_ctrlr_data *cdata; 1312 1313 if (spdk_nvme_cpl_is_path_error(cpl) || 1314 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1315 !nvme_io_path_is_available(io_path) || 1316 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1317 bdev_nvme_clear_current_io_path(nbdev_ch); 1318 bio->io_path = NULL; 1319 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1320 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1321 io_path->nvme_ns->ana_state_updating = true; 1322 } 1323 } 1324 if (!any_io_path_may_become_available(nbdev_ch)) { 1325 return false; 1326 } 1327 *_delay_ms = 0; 1328 } else { 1329 bio->retry_count++; 1330 1331 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1332 1333 if (cpl->status.crd != 0) { 1334 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1335 } else { 1336 *_delay_ms = 0; 1337 } 1338 } 1339 1340 return true; 1341 } 1342 1343 static inline void 1344 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1345 const struct spdk_nvme_cpl *cpl) 1346 { 1347 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1348 struct nvme_bdev_channel *nbdev_ch; 1349 uint64_t delay_ms; 1350 1351 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1352 1353 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1354 bdev_nvme_update_io_path_stat(bio); 1355 goto complete; 1356 } 1357 1358 /* Update error counts before deciding if retry is needed. 1359 * Hence, error counts may be more than the number of I/O errors. 
1360 */ 1361 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1362 1363 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1364 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1365 goto complete; 1366 } 1367 1368 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1369 1370 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1371 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1372 return; 1373 } 1374 1375 complete: 1376 bio->retry_count = 0; 1377 bio->submit_tsc = 0; 1378 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1379 } 1380 1381 static inline void 1382 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1383 { 1384 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1385 struct nvme_bdev_channel *nbdev_ch; 1386 enum spdk_bdev_io_status io_status; 1387 1388 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1389 1390 switch (rc) { 1391 case 0: 1392 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1393 break; 1394 case -ENOMEM: 1395 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1396 break; 1397 case -ENXIO: 1398 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1399 1400 bdev_nvme_clear_current_io_path(nbdev_ch); 1401 bio->io_path = NULL; 1402 1403 if (any_io_path_may_become_available(nbdev_ch)) { 1404 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1405 return; 1406 } 1407 1408 /* fallthrough */ 1409 default: 1410 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1411 break; 1412 } 1413 1414 bio->retry_count = 0; 1415 bio->submit_tsc = 0; 1416 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1417 } 1418 1419 static inline void 1420 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1421 { 1422 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1423 enum spdk_bdev_io_status io_status; 1424 1425 switch (rc) { 1426 case 0: 1427 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1428 break; 1429 case -ENOMEM: 1430 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1431 break; 1432 case -ENXIO: 1433 /* fallthrough */ 1434 default: 1435 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1436 break; 1437 } 1438 1439 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1440 } 1441 1442 static void 1443 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1444 { 1445 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1446 1447 pthread_mutex_lock(&nvme_ctrlr->mutex); 1448 1449 assert(nvme_ctrlr->io_path_cache_clearing == true); 1450 nvme_ctrlr->io_path_cache_clearing = false; 1451 1452 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1453 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1454 return; 1455 } 1456 1457 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1458 1459 nvme_ctrlr_unregister(nvme_ctrlr); 1460 } 1461 1462 static void 1463 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1464 { 1465 struct nvme_io_path *io_path; 1466 1467 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1468 if (io_path->nbdev_ch == NULL) { 1469 continue; 1470 } 1471 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1472 } 1473 } 1474 1475 static void 1476 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1477 { 1478 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1479 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1480 1481 assert(ctrlr_ch->qpair != NULL); 1482 1483 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1484 1485 spdk_for_each_channel_continue(i, 0); 1486 } 1487 1488 static void 1489 
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1490 { 1491 pthread_mutex_lock(&nvme_ctrlr->mutex); 1492 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1493 nvme_ctrlr->io_path_cache_clearing) { 1494 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1495 return; 1496 } 1497 1498 nvme_ctrlr->io_path_cache_clearing = true; 1499 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1500 1501 spdk_for_each_channel(nvme_ctrlr, 1502 bdev_nvme_clear_io_path_cache, 1503 NULL, 1504 bdev_nvme_clear_io_path_caches_done); 1505 } 1506 1507 static struct nvme_qpair * 1508 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1509 { 1510 struct nvme_qpair *nvme_qpair; 1511 1512 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1513 if (nvme_qpair->qpair == qpair) { 1514 break; 1515 } 1516 } 1517 1518 return nvme_qpair; 1519 } 1520 1521 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1522 1523 static void 1524 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1525 { 1526 struct nvme_poll_group *group = poll_group_ctx; 1527 struct nvme_qpair *nvme_qpair; 1528 struct nvme_ctrlr_channel *ctrlr_ch; 1529 int status; 1530 1531 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1532 if (nvme_qpair == NULL) { 1533 return; 1534 } 1535 1536 if (nvme_qpair->qpair != NULL) { 1537 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1538 nvme_qpair->qpair = NULL; 1539 } 1540 1541 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1542 1543 ctrlr_ch = nvme_qpair->ctrlr_ch; 1544 1545 if (ctrlr_ch != NULL) { 1546 if (ctrlr_ch->reset_iter != NULL) { 1547 /* We are in a full reset sequence. */ 1548 if (ctrlr_ch->connect_poller != NULL) { 1549 /* qpair was failed to connect. Abort the reset sequence. */ 1550 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1551 qpair); 1552 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1553 status = -1; 1554 } else { 1555 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1556 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1557 qpair); 1558 status = 0; 1559 } 1560 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1561 ctrlr_ch->reset_iter = NULL; 1562 } else { 1563 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1564 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1565 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr, false); 1566 } 1567 } else { 1568 /* In this case, ctrlr_channel is already deleted. */ 1569 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1570 nvme_qpair_delete(nvme_qpair); 1571 } 1572 } 1573 1574 static void 1575 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1576 { 1577 struct nvme_qpair *nvme_qpair; 1578 1579 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1580 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1581 continue; 1582 } 1583 1584 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1585 SPDK_NVME_QPAIR_FAILURE_NONE) { 1586 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1587 } 1588 } 1589 } 1590 1591 static int 1592 bdev_nvme_poll(void *arg) 1593 { 1594 struct nvme_poll_group *group = arg; 1595 int64_t num_completions; 1596 1597 if (group->collect_spin_stat && group->start_ticks == 0) { 1598 group->start_ticks = spdk_get_ticks(); 1599 } 1600 1601 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1602 bdev_nvme_disconnected_qpair_cb); 1603 if (group->collect_spin_stat) { 1604 if (num_completions > 0) { 1605 if (group->end_ticks != 0) { 1606 group->spin_ticks += (group->end_ticks - group->start_ticks); 1607 group->end_ticks = 0; 1608 } 1609 group->start_ticks = 0; 1610 } else { 1611 group->end_ticks = spdk_get_ticks(); 1612 } 1613 } 1614 1615 if (spdk_unlikely(num_completions < 0)) { 1616 bdev_nvme_check_io_qpairs(group); 1617 } 1618 1619 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1620 } 1621 1622 static int bdev_nvme_poll_adminq(void *arg); 1623 1624 static void 1625 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1626 { 1627 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1628 1629 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1630 nvme_ctrlr, new_period_us); 1631 } 1632 1633 static int 1634 bdev_nvme_poll_adminq(void *arg) 1635 { 1636 int32_t rc; 1637 struct nvme_ctrlr *nvme_ctrlr = arg; 1638 nvme_ctrlr_disconnected_cb disconnected_cb; 1639 1640 assert(nvme_ctrlr != NULL); 1641 1642 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1643 if (rc < 0) { 1644 disconnected_cb = nvme_ctrlr->disconnected_cb; 1645 nvme_ctrlr->disconnected_cb = NULL; 1646 1647 if (disconnected_cb != NULL) { 1648 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1649 g_opts.nvme_adminq_poll_period_us); 1650 disconnected_cb(nvme_ctrlr); 1651 } else { 1652 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 1653 } 1654 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1655 SPDK_NVME_QPAIR_FAILURE_NONE) { 1656 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1657 } 1658 1659 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1660 } 1661 1662 static void 1663 nvme_bdev_free(void *io_device) 1664 { 1665 struct nvme_bdev *nvme_disk = io_device; 1666 1667 pthread_mutex_destroy(&nvme_disk->mutex); 1668 free(nvme_disk->disk.name); 1669 free(nvme_disk->err_stat); 1670 free(nvme_disk); 1671 } 1672 1673 static int 1674 bdev_nvme_destruct(void *ctx) 1675 { 1676 struct nvme_bdev *nvme_disk = ctx; 1677 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1678 1679 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1680 1681 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1682 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1683 1684 nvme_ns->bdev = NULL; 1685 1686 assert(nvme_ns->id > 0); 1687 1688 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1689 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1690 1691 nvme_ctrlr_release(nvme_ns->ctrlr); 1692 nvme_ns_free(nvme_ns); 1693 } else { 1694 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1695 } 1696 } 1697 1698 pthread_mutex_lock(&g_bdev_nvme_mutex); 1699 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1700 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1701 1702 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1703 1704 return 0; 1705 } 1706 1707 static int 1708 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1709 { 1710 struct nvme_ctrlr *nvme_ctrlr; 1711 struct spdk_nvme_io_qpair_opts opts; 1712 struct spdk_nvme_qpair *qpair; 1713 int rc; 1714 1715 nvme_ctrlr = nvme_qpair->ctrlr; 1716 1717 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1718 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1719 opts.create_only = true; 1720 opts.async_mode = true; 1721 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1722 g_opts.io_queue_requests = opts.io_queue_requests; 1723 1724 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1725 if (qpair == NULL) { 1726 return -1; 1727 } 1728 1729 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1730 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1731 1732 assert(nvme_qpair->group != NULL); 1733 1734 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1735 if (rc != 0) { 1736 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1737 goto err; 1738 } 1739 1740 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1741 if (rc != 0) { 1742 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1743 goto err; 1744 } 1745 1746 nvme_qpair->qpair = qpair; 1747 1748 if (!g_opts.disable_auto_failback) { 1749 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1750 } 1751 1752 return 0; 1753 1754 err: 1755 spdk_nvme_ctrlr_free_io_qpair(qpair); 1756 1757 return rc; 1758 } 1759 1760 static void 1761 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1762 { 1763 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1764 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1765 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1766 struct spdk_bdev_io *bdev_io; 1767 1768 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1769 status = SPDK_BDEV_IO_STATUS_FAILED; 1770 } 1771 1772 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1773 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1774 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1775 __bdev_nvme_io_complete(bdev_io, status, NULL); 1776 } 1777 1778 
spdk_for_each_channel_continue(i, 0); 1779 } 1780 1781 /* This function marks the current trid as failed by storing the current ticks 1782 * and then sets the next trid to the active trid within a controller if exists. 1783 * 1784 * The purpose of the boolean return value is to request the caller to disconnect 1785 * the current trid now to try connecting the next trid. 1786 */ 1787 static bool 1788 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1789 { 1790 struct nvme_path_id *path_id, *next_path; 1791 int rc __attribute__((unused)); 1792 1793 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1794 assert(path_id); 1795 assert(path_id == nvme_ctrlr->active_path_id); 1796 next_path = TAILQ_NEXT(path_id, link); 1797 1798 /* Update the last failed time. It means the trid is failed if its last 1799 * failed time is non-zero. 1800 */ 1801 path_id->last_failed_tsc = spdk_get_ticks(); 1802 1803 if (next_path == NULL) { 1804 /* There is no alternate trid within a controller. */ 1805 return false; 1806 } 1807 1808 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1809 /* Connect is not retried in a controller reset sequence. Connecting 1810 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1811 */ 1812 return false; 1813 } 1814 1815 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1816 1817 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1818 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1819 1820 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1821 nvme_ctrlr->active_path_id = next_path; 1822 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1823 assert(rc == 0); 1824 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1825 if (!remove) { 1826 /** Shuffle the old trid to the end of the list and use the new one. 1827 * Allows for round robin through multiple connections. 1828 */ 1829 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1830 } else { 1831 free(path_id); 1832 } 1833 1834 if (start || next_path->last_failed_tsc == 0) { 1835 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1836 * or used yet. Try the next trid now. 1837 */ 1838 return true; 1839 } 1840 1841 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1842 nvme_ctrlr->opts.reconnect_delay_sec) { 1843 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1844 return true; 1845 } 1846 1847 /* The next trid will be tried after reconnect_delay_sec seconds. 
*/ 1848 return false; 1849 } 1850 1851 static bool 1852 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1853 { 1854 int32_t elapsed; 1855 1856 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1857 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1858 return false; 1859 } 1860 1861 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1862 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1863 return true; 1864 } else { 1865 return false; 1866 } 1867 } 1868 1869 static bool 1870 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1871 { 1872 uint32_t elapsed; 1873 1874 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1875 return false; 1876 } 1877 1878 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1879 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1880 return true; 1881 } else { 1882 return false; 1883 } 1884 } 1885 1886 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1887 1888 static void 1889 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1890 { 1891 int rc; 1892 1893 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1894 if (rc != 0) { 1895 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1896 * fail the reset sequence immediately. 1897 */ 1898 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1899 return; 1900 } 1901 1902 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1903 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1904 */ 1905 assert(nvme_ctrlr->disconnected_cb == NULL); 1906 nvme_ctrlr->disconnected_cb = cb_fn; 1907 1908 /* During disconnection, reduce the period to poll adminq more often. */ 1909 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1910 } 1911 1912 enum bdev_nvme_op_after_reset { 1913 OP_NONE, 1914 OP_COMPLETE_PENDING_DESTRUCT, 1915 OP_DESTRUCT, 1916 OP_DELAYED_RECONNECT, 1917 OP_FAILOVER, 1918 }; 1919 1920 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1921 1922 static _bdev_nvme_op_after_reset 1923 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1924 { 1925 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1926 /* Complete pending destruct after reset completes. 
*/ 1927 return OP_COMPLETE_PENDING_DESTRUCT; 1928 } else if (nvme_ctrlr->pending_failover) { 1929 nvme_ctrlr->pending_failover = false; 1930 nvme_ctrlr->reset_start_tsc = 0; 1931 return OP_FAILOVER; 1932 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1933 nvme_ctrlr->reset_start_tsc = 0; 1934 return OP_NONE; 1935 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1936 return OP_DESTRUCT; 1937 } else { 1938 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1939 nvme_ctrlr->fast_io_fail_timedout = true; 1940 } 1941 return OP_DELAYED_RECONNECT; 1942 } 1943 } 1944 1945 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1946 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1947 1948 static int 1949 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1950 { 1951 struct nvme_ctrlr *nvme_ctrlr = ctx; 1952 1953 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1954 pthread_mutex_lock(&nvme_ctrlr->mutex); 1955 1956 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1957 1958 if (!nvme_ctrlr->reconnect_is_delayed) { 1959 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1960 return SPDK_POLLER_BUSY; 1961 } 1962 1963 nvme_ctrlr->reconnect_is_delayed = false; 1964 1965 if (nvme_ctrlr->destruct) { 1966 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1967 return SPDK_POLLER_BUSY; 1968 } 1969 1970 assert(nvme_ctrlr->resetting == false); 1971 nvme_ctrlr->resetting = true; 1972 1973 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1974 1975 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1976 1977 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1978 return SPDK_POLLER_BUSY; 1979 } 1980 1981 static void 1982 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1983 { 1984 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1985 1986 assert(nvme_ctrlr->reconnect_is_delayed == false); 1987 nvme_ctrlr->reconnect_is_delayed = true; 1988 1989 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1990 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1991 nvme_ctrlr, 1992 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 1993 } 1994 1995 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 1996 1997 static void 1998 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 1999 { 2000 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2001 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2002 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2003 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2004 enum bdev_nvme_op_after_reset op_after_reset; 2005 2006 assert(nvme_ctrlr->thread == spdk_get_thread()); 2007 2008 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2009 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2010 2011 if (!success) { 2012 SPDK_ERRLOG("Resetting controller failed.\n"); 2013 } else { 2014 SPDK_NOTICELOG("Resetting controller successful.\n"); 2015 } 2016 2017 pthread_mutex_lock(&nvme_ctrlr->mutex); 2018 nvme_ctrlr->resetting = false; 2019 nvme_ctrlr->dont_retry = false; 2020 nvme_ctrlr->in_failover = false; 2021 2022 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2023 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2024 2025 if (ctrlr_op_cb_fn) { 2026 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2027 } 2028 2029 switch (op_after_reset) { 2030 case OP_COMPLETE_PENDING_DESTRUCT: 2031 nvme_ctrlr_unregister(nvme_ctrlr); 2032 break; 2033 case OP_DESTRUCT: 2034 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2035 remove_discovery_entry(nvme_ctrlr); 2036 break; 2037 case OP_DELAYED_RECONNECT: 2038 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2039 break; 2040 case OP_FAILOVER: 2041 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 2042 break; 2043 default: 2044 break; 2045 } 2046 } 2047 2048 static void 2049 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2050 { 2051 pthread_mutex_lock(&nvme_ctrlr->mutex); 2052 if (!success) { 2053 /* Connecting the active trid failed. Set the next alternate trid to the 2054 * active trid if it exists. 2055 */ 2056 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2057 /* The next alternate trid exists and is ready to try. Try it now. */ 2058 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2059 2060 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2061 return; 2062 } 2063 2064 /* We came here if there is no alternate trid or if the next trid exists but 2065 * is not ready to try. We will try the active trid after reconnect_delay_sec 2066 * seconds if it is non-zero or at the next reset call otherwise. 2067 */ 2068 } else { 2069 /* Connecting the active trid succeeded. Clear the last failed time because it 2070 * means the trid is failed if its last failed time is non-zero. 2071 */ 2072 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2073 } 2074 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2075 2076 /* Make sure we clear any pending resets before returning. */ 2077 spdk_for_each_channel(nvme_ctrlr, 2078 bdev_nvme_complete_pending_resets, 2079 success ? NULL : (void *)0x1, 2080 _bdev_nvme_reset_ctrlr_complete); 2081 } 2082 2083 static void 2084 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2085 { 2086 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2087 2088 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2089 } 2090 2091 static void 2092 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2093 { 2094 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2095 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2096 struct nvme_qpair *nvme_qpair; 2097 2098 nvme_qpair = ctrlr_ch->qpair; 2099 assert(nvme_qpair != NULL); 2100 2101 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2102 2103 if (nvme_qpair->qpair != NULL) { 2104 if (nvme_qpair->ctrlr->dont_retry) { 2105 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2106 } 2107 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2108 2109 /* The current full reset sequence will move to the next 2110 * ctrlr_channel after the qpair is actually disconnected. 2111 */ 2112 assert(ctrlr_ch->reset_iter == NULL); 2113 ctrlr_ch->reset_iter = i; 2114 } else { 2115 spdk_for_each_channel_continue(i, 0); 2116 } 2117 } 2118 2119 static void 2120 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2121 { 2122 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2123 2124 if (status == 0) { 2125 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2126 } else { 2127 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 2128 spdk_for_each_channel(nvme_ctrlr, 2129 bdev_nvme_reset_destroy_qpair, 2130 NULL, 2131 bdev_nvme_reset_create_qpairs_failed); 2132 } 2133 } 2134 2135 static int 2136 bdev_nvme_reset_check_qpair_connected(void *ctx) 2137 { 2138 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2139 2140 if (ctrlr_ch->reset_iter == NULL) { 2141 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2142 assert(ctrlr_ch->connect_poller == NULL); 2143 assert(ctrlr_ch->qpair->qpair == NULL); 2144 return SPDK_POLLER_BUSY; 2145 } 2146 2147 assert(ctrlr_ch->qpair->qpair != NULL); 2148 2149 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2150 return SPDK_POLLER_BUSY; 2151 } 2152 2153 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2154 2155 /* The qpair finished connecting. Move to the next ctrlr_channel. */ 2156 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2157 ctrlr_ch->reset_iter = NULL; 2158 2159 return SPDK_POLLER_BUSY; 2160 } 2161 2162 static void 2163 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2164 { 2165 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2166 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2167 int rc; 2168 2169 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2170 if (rc == 0) { 2171 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2172 ctrlr_ch, 0); 2173 2174 /* The current full reset sequence will move to the next 2175 * ctrlr_channel after the qpair is actually connected. 2176 */ 2177 assert(ctrlr_ch->reset_iter == NULL); 2178 ctrlr_ch->reset_iter = i; 2179 } else { 2180 spdk_for_each_channel_continue(i, rc); 2181 } 2182 } 2183 2184 static int 2185 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2186 { 2187 struct nvme_ctrlr *nvme_ctrlr = arg; 2188 int rc = -ETIMEDOUT; 2189 2190 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2191 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2192 if (rc == -EAGAIN) { 2193 return SPDK_POLLER_BUSY; 2194 } 2195 } 2196 2197 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2198 if (rc == 0) { 2199 /* Recreate all of the I/O queue pairs */ 2200 spdk_for_each_channel(nvme_ctrlr, 2201 bdev_nvme_reset_create_qpair, 2202 NULL, 2203 bdev_nvme_reset_create_qpairs_done); 2204 } else { 2205 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2206 } 2207 return SPDK_POLLER_BUSY; 2208 } 2209 2210 static void 2211 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2212 { 2213 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2214 2215 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2216 assert(nvme_ctrlr->reset_detach_poller == NULL); 2217 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2218 nvme_ctrlr, 0); 2219 } 2220 2221 static void 2222 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2223 { 2224 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2225 2226 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2227 assert(status == 0); 2228 2229 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2230 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2231 } else { 2232 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2233 } 2234 } 2235 2236 static void 2237 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2238 { 2239 spdk_for_each_channel(nvme_ctrlr, 2240 bdev_nvme_reset_destroy_qpair, 2241 NULL, 2242 bdev_nvme_reset_destroy_qpair_done);
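	/* Summary of the full reset sequence driven from here: the walk above sends
	 * bdev_nvme_reset_destroy_qpair() to every ctrlr_channel, which disconnects the
	 * channel's I/O qpair and parks the iterator in ctrlr_ch->reset_iter until the
	 * qpair is really disconnected. bdev_nvme_reset_destroy_qpair_done() then
	 * reconnects the controller, disconnecting it first for fabrics controllers
	 * (non-fabrics controllers were already disconnected in _bdev_nvme_reset_ctrlr()).
	 * Once spdk_nvme_ctrlr_reconnect_poll_async() succeeds, the qpairs are recreated
	 * by bdev_nvme_reset_create_qpair()/bdev_nvme_reset_create_qpairs_done().
	 */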
2243 } 2244 2245 static void 2246 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2247 { 2248 struct nvme_ctrlr *nvme_ctrlr = ctx; 2249 2250 assert(nvme_ctrlr->resetting == true); 2251 assert(nvme_ctrlr->thread == spdk_get_thread()); 2252 2253 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2254 2255 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2256 2257 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2258 } 2259 2260 static void 2261 _bdev_nvme_reset_ctrlr(void *ctx) 2262 { 2263 struct nvme_ctrlr *nvme_ctrlr = ctx; 2264 2265 assert(nvme_ctrlr->resetting == true); 2266 assert(nvme_ctrlr->thread == spdk_get_thread()); 2267 2268 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2269 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2270 } else { 2271 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2272 } 2273 } 2274 2275 static int 2276 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2277 { 2278 spdk_msg_fn msg_fn; 2279 2280 pthread_mutex_lock(&nvme_ctrlr->mutex); 2281 if (nvme_ctrlr->destruct) { 2282 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2283 return -ENXIO; 2284 } 2285 2286 if (nvme_ctrlr->resetting) { 2287 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2288 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2289 return -EBUSY; 2290 } 2291 2292 if (nvme_ctrlr->disabled) { 2293 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2294 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2295 return -EALREADY; 2296 } 2297 2298 nvme_ctrlr->resetting = true; 2299 nvme_ctrlr->dont_retry = true; 2300 2301 if (nvme_ctrlr->reconnect_is_delayed) { 2302 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2303 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2304 nvme_ctrlr->reconnect_is_delayed = false; 2305 } else { 2306 msg_fn = _bdev_nvme_reset_ctrlr; 2307 assert(nvme_ctrlr->reset_start_tsc == 0); 2308 } 2309 2310 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2311 2312 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2313 2314 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2315 return 0; 2316 } 2317 2318 static int 2319 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2320 { 2321 pthread_mutex_lock(&nvme_ctrlr->mutex); 2322 if (nvme_ctrlr->destruct) { 2323 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2324 return -ENXIO; 2325 } 2326 2327 if (nvme_ctrlr->resetting) { 2328 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2329 return -EBUSY; 2330 } 2331 2332 if (!nvme_ctrlr->disabled) { 2333 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2334 return -EALREADY; 2335 } 2336 2337 nvme_ctrlr->disabled = false; 2338 nvme_ctrlr->resetting = true; 2339 2340 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2341 2342 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2343 2344 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2345 return 0; 2346 } 2347 2348 static void 2349 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2350 { 2351 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2352 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2353 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2354 enum bdev_nvme_op_after_reset op_after_disable; 2355 2356 assert(nvme_ctrlr->thread == spdk_get_thread()); 2357 2358 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2359 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2360 2361 pthread_mutex_lock(&nvme_ctrlr->mutex); 2362 2363 nvme_ctrlr->resetting = false; 2364 nvme_ctrlr->dont_retry = false; 2365 2366 op_after_disable = 
bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2367 2368 nvme_ctrlr->disabled = true; 2369 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2370 2371 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2372 2373 if (ctrlr_op_cb_fn) { 2374 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2375 } 2376 2377 switch (op_after_disable) { 2378 case OP_COMPLETE_PENDING_DESTRUCT: 2379 nvme_ctrlr_unregister(nvme_ctrlr); 2380 break; 2381 default: 2382 break; 2383 } 2384 2385 } 2386 2387 static void 2388 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2389 { 2390 /* Make sure we clear any pending resets before returning. */ 2391 spdk_for_each_channel(nvme_ctrlr, 2392 bdev_nvme_complete_pending_resets, 2393 NULL, 2394 _bdev_nvme_disable_ctrlr_complete); 2395 } 2396 2397 static void 2398 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2399 { 2400 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2401 2402 assert(status == 0); 2403 2404 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2405 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2406 } else { 2407 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2408 } 2409 } 2410 2411 static void 2412 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2413 { 2414 spdk_for_each_channel(nvme_ctrlr, 2415 bdev_nvme_reset_destroy_qpair, 2416 NULL, 2417 bdev_nvme_disable_destroy_qpairs_done); 2418 } 2419 2420 static void 2421 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2422 { 2423 struct nvme_ctrlr *nvme_ctrlr = ctx; 2424 2425 assert(nvme_ctrlr->resetting == true); 2426 assert(nvme_ctrlr->thread == spdk_get_thread()); 2427 2428 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2429 2430 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2431 } 2432 2433 static void 2434 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2435 { 2436 struct nvme_ctrlr *nvme_ctrlr = ctx; 2437 2438 assert(nvme_ctrlr->resetting == true); 2439 assert(nvme_ctrlr->thread == spdk_get_thread()); 2440 2441 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2442 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2443 } else { 2444 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2445 } 2446 } 2447 2448 static int 2449 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2450 { 2451 spdk_msg_fn msg_fn; 2452 2453 pthread_mutex_lock(&nvme_ctrlr->mutex); 2454 if (nvme_ctrlr->destruct) { 2455 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2456 return -ENXIO; 2457 } 2458 2459 if (nvme_ctrlr->resetting) { 2460 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2461 return -EBUSY; 2462 } 2463 2464 if (nvme_ctrlr->disabled) { 2465 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2466 return -EALREADY; 2467 } 2468 2469 nvme_ctrlr->resetting = true; 2470 nvme_ctrlr->dont_retry = true; 2471 2472 if (nvme_ctrlr->reconnect_is_delayed) { 2473 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2474 nvme_ctrlr->reconnect_is_delayed = false; 2475 } else { 2476 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2477 } 2478 2479 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2480 2481 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2482 2483 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2484 return 0; 2485 } 2486 2487 static int 2488 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2489 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2490 { 2491 int rc; 2492 2493 switch (op) { 2494 case NVME_CTRLR_OP_RESET: 2495 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2496 break; 2497 case 
NVME_CTRLR_OP_ENABLE: 2498 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2499 break; 2500 case NVME_CTRLR_OP_DISABLE: 2501 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2502 break; 2503 default: 2504 rc = -EINVAL; 2505 break; 2506 } 2507 2508 if (rc == 0) { 2509 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2510 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2511 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2512 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2513 } 2514 return rc; 2515 } 2516 2517 struct nvme_ctrlr_op_rpc_ctx { 2518 struct nvme_ctrlr *nvme_ctrlr; 2519 struct spdk_thread *orig_thread; 2520 enum nvme_ctrlr_op op; 2521 int rc; 2522 bdev_nvme_ctrlr_op_cb cb_fn; 2523 void *cb_arg; 2524 }; 2525 2526 static void 2527 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2528 { 2529 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2530 2531 assert(ctx != NULL); 2532 assert(ctx->cb_fn != NULL); 2533 2534 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2535 2536 free(ctx); 2537 } 2538 2539 static void 2540 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2541 { 2542 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2543 2544 ctx->rc = rc; 2545 2546 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2547 } 2548 2549 void 2550 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2551 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2552 { 2553 struct nvme_ctrlr_op_rpc_ctx *ctx; 2554 int rc; 2555 2556 assert(cb_fn != NULL); 2557 2558 ctx = calloc(1, sizeof(*ctx)); 2559 if (ctx == NULL) { 2560 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2561 cb_fn(cb_arg, -ENOMEM); 2562 return; 2563 } 2564 2565 ctx->orig_thread = spdk_get_thread(); 2566 ctx->cb_fn = cb_fn; 2567 ctx->cb_arg = cb_arg; 2568 2569 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2570 if (rc == 0) { 2571 return; 2572 } else if (rc == -EALREADY) { 2573 rc = 0; 2574 } 2575 2576 nvme_ctrlr_op_rpc_complete(ctx, rc); 2577 } 2578 2579 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2580 2581 static void 2582 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2583 { 2584 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2585 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2586 int rc; 2587 2588 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2589 ctx->nvme_ctrlr = NULL; 2590 2591 if (ctx->rc != 0) { 2592 goto complete; 2593 } 2594 2595 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2596 if (next_nvme_ctrlr == NULL) { 2597 goto complete; 2598 } 2599 2600 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2601 if (rc == 0) { 2602 ctx->nvme_ctrlr = next_nvme_ctrlr; 2603 return; 2604 } else if (rc == -EALREADY) { 2605 ctx->nvme_ctrlr = next_nvme_ctrlr; 2606 rc = 0; 2607 } 2608 2609 ctx->rc = rc; 2610 2611 complete: 2612 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2613 free(ctx); 2614 } 2615 2616 static void 2617 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2618 { 2619 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2620 2621 ctx->rc = rc; 2622 2623 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2624 } 2625 2626 void 2627 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2628 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2629 { 2630 struct nvme_ctrlr_op_rpc_ctx *ctx; 2631 struct nvme_ctrlr *nvme_ctrlr; 2632 int rc; 2633 2634 assert(cb_fn != NULL); 2635 2636 ctx = calloc(1, sizeof(*ctx)); 2637 if (ctx == NULL) { 2638 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2639 cb_fn(cb_arg, -ENOMEM); 2640 return; 2641 } 2642 2643 
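	/* The op is applied to the nvme_ctrlrs of the nvme_bdev_ctrlr one at a time:
	 * it is submitted to the first ctrlr below, and each completion is bounced back
	 * to orig_thread with spdk_thread_send_msg(), where _nvme_bdev_ctrlr_op_rpc_continue()
	 * either submits the op to the next ctrlr (TAILQ_NEXT) or finishes by calling cb_fn.
	 *
	 * Illustrative caller sketch; the callback and request variable are hypothetical,
	 * only the function, enum value, and callback signature come from this file:
	 *
	 *	static void
	 *	rpc_reset_done(void *cb_arg, int rc)
	 *	{
	 *		// complete the hypothetical RPC request here; rc == 0 on success
	 *	}
	 *
	 *	nvme_bdev_ctrlr_op_rpc(nbdev_ctrlr, NVME_CTRLR_OP_RESET, rpc_reset_done, req);
	 */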
ctx->orig_thread = spdk_get_thread(); 2644 ctx->op = op; 2645 ctx->cb_fn = cb_fn; 2646 ctx->cb_arg = cb_arg; 2647 2648 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2649 assert(nvme_ctrlr != NULL); 2650 2651 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2652 if (rc == 0) { 2653 ctx->nvme_ctrlr = nvme_ctrlr; 2654 return; 2655 } else if (rc == -EALREADY) { 2656 ctx->nvme_ctrlr = nvme_ctrlr; 2657 rc = 0; 2658 } 2659 2660 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2661 } 2662 2663 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2664 2665 static void 2666 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2667 { 2668 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2669 enum spdk_bdev_io_status io_status; 2670 2671 if (bio->cpl.cdw0 == 0) { 2672 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2673 } else { 2674 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2675 } 2676 2677 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2678 } 2679 2680 static void 2681 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2682 { 2683 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2684 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2685 2686 bdev_nvme_abort_retry_ios(nbdev_ch); 2687 2688 spdk_for_each_channel_continue(i, 0); 2689 } 2690 2691 static void 2692 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2693 { 2694 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2695 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2696 2697 /* Abort all queued I/Os for retry. */ 2698 spdk_for_each_channel(nbdev, 2699 bdev_nvme_abort_bdev_channel, 2700 bio, 2701 _bdev_nvme_reset_io_complete); 2702 } 2703 2704 static void 2705 _bdev_nvme_reset_io_continue(void *ctx) 2706 { 2707 struct nvme_bdev_io *bio = ctx; 2708 struct nvme_io_path *prev_io_path, *next_io_path; 2709 int rc; 2710 2711 prev_io_path = bio->io_path; 2712 bio->io_path = NULL; 2713 2714 if (bio->cpl.cdw0 != 0) { 2715 goto complete; 2716 } 2717 2718 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2719 if (next_io_path == NULL) { 2720 goto complete; 2721 } 2722 2723 rc = _bdev_nvme_reset_io(next_io_path, bio); 2724 if (rc == 0) { 2725 return; 2726 } 2727 2728 bio->cpl.cdw0 = 1; 2729 2730 complete: 2731 bdev_nvme_reset_io_complete(bio); 2732 } 2733 2734 static void 2735 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2736 { 2737 struct nvme_bdev_io *bio = cb_arg; 2738 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2739 2740 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2741 2742 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2743 } 2744 2745 static int 2746 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2747 { 2748 struct nvme_ctrlr_channel *ctrlr_ch; 2749 struct spdk_bdev_io *bdev_io; 2750 int rc; 2751 2752 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2753 bdev_nvme_reset_io_continue, bio); 2754 if (rc == 0) { 2755 assert(bio->io_path == NULL); 2756 bio->io_path = io_path; 2757 } else if (rc == -EBUSY) { 2758 ctrlr_ch = io_path->qpair->ctrlr_ch; 2759 assert(ctrlr_ch != NULL); 2760 /* 2761 * Reset call is queued only if it is from the app framework. This is on purpose so that 2762 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2763 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
2764 */ 2765 bdev_io = spdk_bdev_io_from_ctx(bio); 2766 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2767 rc = 0; 2768 } 2769 2770 return rc; 2771 } 2772 2773 static void 2774 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2775 { 2776 struct nvme_io_path *io_path; 2777 int rc; 2778 2779 bio->cpl.cdw0 = 0; 2780 2781 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2782 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2783 assert(io_path != NULL); 2784 2785 rc = _bdev_nvme_reset_io(io_path, bio); 2786 if (rc != 0) { 2787 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2788 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2789 } 2790 } 2791 2792 static int 2793 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2794 { 2795 if (nvme_ctrlr->destruct) { 2796 /* Don't bother resetting if the controller is in the process of being destructed. */ 2797 return -ENXIO; 2798 } 2799 2800 if (nvme_ctrlr->resetting) { 2801 if (!nvme_ctrlr->in_failover) { 2802 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2803 2804 /* Defer failover until reset completes. */ 2805 nvme_ctrlr->pending_failover = true; 2806 return -EINPROGRESS; 2807 } else { 2808 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2809 return -EBUSY; 2810 } 2811 } 2812 2813 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2814 2815 if (nvme_ctrlr->reconnect_is_delayed) { 2816 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2817 2818 /* We rely on the next reconnect for the failover. */ 2819 return -EALREADY; 2820 } 2821 2822 if (nvme_ctrlr->disabled) { 2823 SPDK_NOTICELOG("Controller is disabled.\n"); 2824 2825 /* We rely on the enablement for the failover. 
*/ 2826 return -EALREADY; 2827 } 2828 2829 nvme_ctrlr->resetting = true; 2830 nvme_ctrlr->in_failover = true; 2831 2832 assert(nvme_ctrlr->reset_start_tsc == 0); 2833 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2834 2835 return 0; 2836 } 2837 2838 static int 2839 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2840 { 2841 int rc; 2842 2843 pthread_mutex_lock(&nvme_ctrlr->mutex); 2844 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, remove); 2845 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2846 2847 if (rc == 0) { 2848 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2849 } else if (rc == -EALREADY) { 2850 rc = 0; 2851 } 2852 2853 return rc; 2854 } 2855 2856 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2857 uint64_t num_blocks); 2858 2859 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2860 uint64_t num_blocks); 2861 2862 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2863 uint64_t src_offset_blocks, 2864 uint64_t num_blocks); 2865 2866 static void 2867 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2868 bool success) 2869 { 2870 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2871 struct spdk_bdev *bdev = bdev_io->bdev; 2872 int ret; 2873 2874 if (!success) { 2875 ret = -EINVAL; 2876 goto exit; 2877 } 2878 2879 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2880 ret = -ENXIO; 2881 goto exit; 2882 } 2883 2884 ret = bdev_nvme_readv(bio, 2885 bdev_io->u.bdev.iovs, 2886 bdev_io->u.bdev.iovcnt, 2887 bdev_io->u.bdev.md_buf, 2888 bdev_io->u.bdev.num_blocks, 2889 bdev_io->u.bdev.offset_blocks, 2890 bdev->dif_check_flags, 2891 bdev_io->u.bdev.memory_domain, 2892 bdev_io->u.bdev.memory_domain_ctx); 2893 2894 exit: 2895 if (spdk_unlikely(ret != 0)) { 2896 bdev_nvme_io_complete(bio, ret); 2897 } 2898 } 2899 2900 static inline void 2901 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2902 { 2903 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2904 struct spdk_bdev *bdev = bdev_io->bdev; 2905 struct nvme_bdev_io *nbdev_io_to_abort; 2906 int rc = 0; 2907 2908 switch (bdev_io->type) { 2909 case SPDK_BDEV_IO_TYPE_READ: 2910 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2911 rc = bdev_nvme_readv(nbdev_io, 2912 bdev_io->u.bdev.iovs, 2913 bdev_io->u.bdev.iovcnt, 2914 bdev_io->u.bdev.md_buf, 2915 bdev_io->u.bdev.num_blocks, 2916 bdev_io->u.bdev.offset_blocks, 2917 bdev->dif_check_flags, 2918 bdev_io->u.bdev.memory_domain, 2919 bdev_io->u.bdev.memory_domain_ctx); 2920 } else { 2921 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2922 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2923 rc = 0; 2924 } 2925 break; 2926 case SPDK_BDEV_IO_TYPE_WRITE: 2927 rc = bdev_nvme_writev(nbdev_io, 2928 bdev_io->u.bdev.iovs, 2929 bdev_io->u.bdev.iovcnt, 2930 bdev_io->u.bdev.md_buf, 2931 bdev_io->u.bdev.num_blocks, 2932 bdev_io->u.bdev.offset_blocks, 2933 bdev->dif_check_flags, 2934 bdev_io->u.bdev.memory_domain, 2935 bdev_io->u.bdev.memory_domain_ctx); 2936 break; 2937 case SPDK_BDEV_IO_TYPE_COMPARE: 2938 rc = bdev_nvme_comparev(nbdev_io, 2939 bdev_io->u.bdev.iovs, 2940 bdev_io->u.bdev.iovcnt, 2941 bdev_io->u.bdev.md_buf, 2942 bdev_io->u.bdev.num_blocks, 2943 bdev_io->u.bdev.offset_blocks, 2944 bdev->dif_check_flags); 2945 break; 2946 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2947 rc = 
bdev_nvme_comparev_and_writev(nbdev_io, 2948 bdev_io->u.bdev.iovs, 2949 bdev_io->u.bdev.iovcnt, 2950 bdev_io->u.bdev.fused_iovs, 2951 bdev_io->u.bdev.fused_iovcnt, 2952 bdev_io->u.bdev.md_buf, 2953 bdev_io->u.bdev.num_blocks, 2954 bdev_io->u.bdev.offset_blocks, 2955 bdev->dif_check_flags); 2956 break; 2957 case SPDK_BDEV_IO_TYPE_UNMAP: 2958 rc = bdev_nvme_unmap(nbdev_io, 2959 bdev_io->u.bdev.offset_blocks, 2960 bdev_io->u.bdev.num_blocks); 2961 break; 2962 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2963 rc = bdev_nvme_write_zeroes(nbdev_io, 2964 bdev_io->u.bdev.offset_blocks, 2965 bdev_io->u.bdev.num_blocks); 2966 break; 2967 case SPDK_BDEV_IO_TYPE_RESET: 2968 nbdev_io->io_path = NULL; 2969 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2970 return; 2971 2972 case SPDK_BDEV_IO_TYPE_FLUSH: 2973 bdev_nvme_io_complete(nbdev_io, 0); 2974 return; 2975 2976 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2977 rc = bdev_nvme_zone_appendv(nbdev_io, 2978 bdev_io->u.bdev.iovs, 2979 bdev_io->u.bdev.iovcnt, 2980 bdev_io->u.bdev.md_buf, 2981 bdev_io->u.bdev.num_blocks, 2982 bdev_io->u.bdev.offset_blocks, 2983 bdev->dif_check_flags); 2984 break; 2985 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2986 rc = bdev_nvme_get_zone_info(nbdev_io, 2987 bdev_io->u.zone_mgmt.zone_id, 2988 bdev_io->u.zone_mgmt.num_zones, 2989 bdev_io->u.zone_mgmt.buf); 2990 break; 2991 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2992 rc = bdev_nvme_zone_management(nbdev_io, 2993 bdev_io->u.zone_mgmt.zone_id, 2994 bdev_io->u.zone_mgmt.zone_action); 2995 break; 2996 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2997 nbdev_io->io_path = NULL; 2998 bdev_nvme_admin_passthru(nbdev_ch, 2999 nbdev_io, 3000 &bdev_io->u.nvme_passthru.cmd, 3001 bdev_io->u.nvme_passthru.buf, 3002 bdev_io->u.nvme_passthru.nbytes); 3003 return; 3004 3005 case SPDK_BDEV_IO_TYPE_NVME_IO: 3006 rc = bdev_nvme_io_passthru(nbdev_io, 3007 &bdev_io->u.nvme_passthru.cmd, 3008 bdev_io->u.nvme_passthru.buf, 3009 bdev_io->u.nvme_passthru.nbytes); 3010 break; 3011 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3012 rc = bdev_nvme_io_passthru_md(nbdev_io, 3013 &bdev_io->u.nvme_passthru.cmd, 3014 bdev_io->u.nvme_passthru.buf, 3015 bdev_io->u.nvme_passthru.nbytes, 3016 bdev_io->u.nvme_passthru.md_buf, 3017 bdev_io->u.nvme_passthru.md_len); 3018 break; 3019 case SPDK_BDEV_IO_TYPE_ABORT: 3020 nbdev_io->io_path = NULL; 3021 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3022 bdev_nvme_abort(nbdev_ch, 3023 nbdev_io, 3024 nbdev_io_to_abort); 3025 return; 3026 3027 case SPDK_BDEV_IO_TYPE_COPY: 3028 rc = bdev_nvme_copy(nbdev_io, 3029 bdev_io->u.bdev.offset_blocks, 3030 bdev_io->u.bdev.copy.src_offset_blocks, 3031 bdev_io->u.bdev.num_blocks); 3032 break; 3033 default: 3034 rc = -EINVAL; 3035 break; 3036 } 3037 3038 if (spdk_unlikely(rc != 0)) { 3039 bdev_nvme_io_complete(nbdev_io, rc); 3040 } 3041 } 3042 3043 static void 3044 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3045 { 3046 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3047 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3048 3049 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3050 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3051 } else { 3052 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3053 * We need to update submit_tsc here. 
3054 */ 3055 nbdev_io->submit_tsc = spdk_get_ticks(); 3056 } 3057 3058 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3059 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3060 if (spdk_unlikely(!nbdev_io->io_path)) { 3061 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3062 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3063 return; 3064 } 3065 3066 /* Admin commands do not use the optimal I/O path. 3067 * Simply fall through even if it is not found. 3068 */ 3069 } 3070 3071 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3072 } 3073 3074 static bool 3075 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3076 { 3077 struct nvme_bdev *nbdev = ctx; 3078 struct nvme_ns *nvme_ns; 3079 struct spdk_nvme_ns *ns; 3080 struct spdk_nvme_ctrlr *ctrlr; 3081 const struct spdk_nvme_ctrlr_data *cdata; 3082 3083 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3084 assert(nvme_ns != NULL); 3085 ns = nvme_ns->ns; 3086 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3087 3088 switch (io_type) { 3089 case SPDK_BDEV_IO_TYPE_READ: 3090 case SPDK_BDEV_IO_TYPE_WRITE: 3091 case SPDK_BDEV_IO_TYPE_RESET: 3092 case SPDK_BDEV_IO_TYPE_FLUSH: 3093 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3094 case SPDK_BDEV_IO_TYPE_NVME_IO: 3095 case SPDK_BDEV_IO_TYPE_ABORT: 3096 return true; 3097 3098 case SPDK_BDEV_IO_TYPE_COMPARE: 3099 return spdk_nvme_ns_supports_compare(ns); 3100 3101 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3102 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3103 3104 case SPDK_BDEV_IO_TYPE_UNMAP: 3105 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3106 return cdata->oncs.dsm; 3107 3108 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3109 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3110 return cdata->oncs.write_zeroes; 3111 3112 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3113 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3114 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3115 return true; 3116 } 3117 return false; 3118 3119 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3120 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3121 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3122 3123 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3124 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3125 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3126 3127 case SPDK_BDEV_IO_TYPE_COPY: 3128 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3129 return cdata->oncs.copy; 3130 3131 default: 3132 return false; 3133 } 3134 } 3135 3136 static int 3137 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3138 { 3139 struct nvme_qpair *nvme_qpair; 3140 struct spdk_io_channel *pg_ch; 3141 int rc; 3142 3143 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3144 if (!nvme_qpair) { 3145 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3146 return -1; 3147 } 3148 3149 TAILQ_INIT(&nvme_qpair->io_path_list); 3150 3151 nvme_qpair->ctrlr = nvme_ctrlr; 3152 nvme_qpair->ctrlr_ch = ctrlr_ch; 3153 3154 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3155 if (!pg_ch) { 3156 free(nvme_qpair); 3157 return -1; 3158 } 3159 3160 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3161 3162 #ifdef SPDK_CONFIG_VTUNE 3163 nvme_qpair->group->collect_spin_stat = true; 3164 #else 3165 nvme_qpair->group->collect_spin_stat = false; 3166 #endif 3167 3168 rc = bdev_nvme_create_qpair(nvme_qpair); 3169 if (rc != 0) { 3170 /* nvme_ctrlr can't create IO qpair if connection is down. 3171 * 3172 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3173 * after reconnect_delay_sec seconds. 
If bdev_retry_count is non-zero, 3174 * submitted IO will be queued until IO qpair is successfully created. 3175 * 3176 * Hence, if both are satisfied, ignore the failure. 3177 */ 3178 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3179 spdk_put_io_channel(pg_ch); 3180 free(nvme_qpair); 3181 return rc; 3182 } 3183 } 3184 3185 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3186 3187 ctrlr_ch->qpair = nvme_qpair; 3188 3189 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3190 nvme_qpair->ctrlr->ref++; 3191 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3192 3193 return 0; 3194 } 3195 3196 static int 3197 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3198 { 3199 struct nvme_ctrlr *nvme_ctrlr = io_device; 3200 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3201 3202 TAILQ_INIT(&ctrlr_ch->pending_resets); 3203 3204 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3205 } 3206 3207 static void 3208 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3209 { 3210 struct nvme_io_path *io_path, *next; 3211 3212 assert(nvme_qpair->group != NULL); 3213 3214 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3215 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3216 nvme_io_path_free(io_path); 3217 } 3218 3219 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3220 3221 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3222 3223 nvme_ctrlr_release(nvme_qpair->ctrlr); 3224 3225 free(nvme_qpair); 3226 } 3227 3228 static void 3229 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3230 { 3231 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3232 struct nvme_qpair *nvme_qpair; 3233 3234 nvme_qpair = ctrlr_ch->qpair; 3235 assert(nvme_qpair != NULL); 3236 3237 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3238 3239 if (nvme_qpair->qpair != NULL) { 3240 if (ctrlr_ch->reset_iter == NULL) { 3241 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3242 } else { 3243 /* Skip current ctrlr_channel in a full reset sequence because 3244 * it is being deleted now. The qpair is already being disconnected. 3245 * We do not have to restart disconnecting it. 3246 */ 3247 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3248 } 3249 3250 /* We cannot release a reference to the poll group now. 3251 * The qpair may be disconnected asynchronously later. 3252 * We need to poll it until it is actually disconnected. 3253 * Just detach the qpair from the deleting ctrlr_channel. 
3254 */ 3255 nvme_qpair->ctrlr_ch = NULL; 3256 } else { 3257 assert(ctrlr_ch->reset_iter == NULL); 3258 3259 nvme_qpair_delete(nvme_qpair); 3260 } 3261 } 3262 3263 static inline struct spdk_io_channel * 3264 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3265 { 3266 if (spdk_unlikely(!group->accel_channel)) { 3267 group->accel_channel = spdk_accel_get_io_channel(); 3268 if (!group->accel_channel) { 3269 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3270 group); 3271 return NULL; 3272 } 3273 } 3274 3275 return group->accel_channel; 3276 } 3277 3278 static void 3279 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3280 uint32_t iov_cnt, uint32_t seed, 3281 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3282 { 3283 struct spdk_io_channel *accel_ch; 3284 struct nvme_poll_group *group = ctx; 3285 int rc; 3286 3287 assert(cb_fn != NULL); 3288 3289 accel_ch = bdev_nvme_get_accel_channel(group); 3290 if (spdk_unlikely(accel_ch == NULL)) { 3291 cb_fn(cb_arg, -ENOMEM); 3292 return; 3293 } 3294 3295 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3296 if (rc) { 3297 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3298 if (rc == -ENOMEM || rc == -EINVAL) { 3299 cb_fn(cb_arg, rc); 3300 } 3301 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3302 } 3303 } 3304 3305 static void 3306 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3307 { 3308 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3309 } 3310 3311 static void 3312 bdev_nvme_abort_sequence(void *seq) 3313 { 3314 spdk_accel_sequence_abort(seq); 3315 } 3316 3317 static void 3318 bdev_nvme_reverse_sequence(void *seq) 3319 { 3320 spdk_accel_sequence_reverse(seq); 3321 } 3322 3323 static int 3324 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3325 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3326 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3327 { 3328 struct spdk_io_channel *ch; 3329 struct nvme_poll_group *group = ctx; 3330 3331 ch = bdev_nvme_get_accel_channel(group); 3332 if (spdk_unlikely(ch == NULL)) { 3333 return -ENOMEM; 3334 } 3335 3336 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3337 domain, domain_ctx, seed, cb_fn, cb_arg); 3338 } 3339 3340 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3341 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3342 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3343 .append_crc32c = bdev_nvme_append_crc32c, 3344 .finish_sequence = bdev_nvme_finish_sequence, 3345 .reverse_sequence = bdev_nvme_reverse_sequence, 3346 .abort_sequence = bdev_nvme_abort_sequence, 3347 }; 3348 3349 static int 3350 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3351 { 3352 struct nvme_poll_group *group = ctx_buf; 3353 3354 TAILQ_INIT(&group->qpair_list); 3355 3356 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3357 if (group->group == NULL) { 3358 return -1; 3359 } 3360 3361 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3362 3363 if (group->poller == NULL) { 3364 spdk_nvme_poll_group_destroy(group->group); 3365 return -1; 3366 } 3367 3368 return 0; 3369 } 3370 3371 static void 3372 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3373 { 3374 struct 
nvme_poll_group *group = ctx_buf; 3375 3376 assert(TAILQ_EMPTY(&group->qpair_list)); 3377 3378 if (group->accel_channel) { 3379 spdk_put_io_channel(group->accel_channel); 3380 } 3381 3382 spdk_poller_unregister(&group->poller); 3383 if (spdk_nvme_poll_group_destroy(group->group)) { 3384 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3385 assert(false); 3386 } 3387 } 3388 3389 static struct spdk_io_channel * 3390 bdev_nvme_get_io_channel(void *ctx) 3391 { 3392 struct nvme_bdev *nvme_bdev = ctx; 3393 3394 return spdk_get_io_channel(nvme_bdev); 3395 } 3396 3397 static void * 3398 bdev_nvme_get_module_ctx(void *ctx) 3399 { 3400 struct nvme_bdev *nvme_bdev = ctx; 3401 struct nvme_ns *nvme_ns; 3402 3403 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3404 return NULL; 3405 } 3406 3407 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3408 if (!nvme_ns) { 3409 return NULL; 3410 } 3411 3412 return nvme_ns->ns; 3413 } 3414 3415 static const char * 3416 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3417 { 3418 switch (ana_state) { 3419 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3420 return "optimized"; 3421 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3422 return "non_optimized"; 3423 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3424 return "inaccessible"; 3425 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3426 return "persistent_loss"; 3427 case SPDK_NVME_ANA_CHANGE_STATE: 3428 return "change"; 3429 default: 3430 return NULL; 3431 } 3432 } 3433 3434 static int 3435 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3436 { 3437 struct spdk_memory_domain **_domains = NULL; 3438 struct nvme_bdev *nbdev = ctx; 3439 struct nvme_ns *nvme_ns; 3440 int i = 0, _array_size = array_size; 3441 int rc = 0; 3442 3443 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3444 if (domains && array_size >= i) { 3445 _domains = &domains[i]; 3446 } else { 3447 _domains = NULL; 3448 } 3449 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3450 if (rc > 0) { 3451 i += rc; 3452 if (_array_size >= rc) { 3453 _array_size -= rc; 3454 } else { 3455 _array_size = 0; 3456 } 3457 } else if (rc < 0) { 3458 return rc; 3459 } 3460 } 3461 3462 return i; 3463 } 3464 3465 static const char * 3466 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3467 { 3468 if (nvme_ctrlr->destruct) { 3469 return "deleting"; 3470 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3471 return "failed"; 3472 } else if (nvme_ctrlr->resetting) { 3473 return "resetting"; 3474 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3475 return "reconnect_is_delayed"; 3476 } else if (nvme_ctrlr->disabled) { 3477 return "disabled"; 3478 } else { 3479 return "enabled"; 3480 } 3481 } 3482 3483 void 3484 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3485 { 3486 struct spdk_nvme_transport_id *trid; 3487 const struct spdk_nvme_ctrlr_opts *opts; 3488 const struct spdk_nvme_ctrlr_data *cdata; 3489 struct nvme_path_id *path_id; 3490 3491 spdk_json_write_object_begin(w); 3492 3493 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3494 3495 #ifdef SPDK_CONFIG_NVME_CUSE 3496 size_t cuse_name_size = 128; 3497 char cuse_name[cuse_name_size]; 3498 3499 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3500 if (rc == 0) { 3501 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3502 } 3503 #endif 3504 trid = &nvme_ctrlr->active_path_id->trid; 3505 
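	/* Rough shape of the object produced below; the contents of each "trid" entry
	 * come from nvme_bdev_dump_trid_json(), which is defined elsewhere, so the
	 * values shown here are purely illustrative:
	 *
	 *	{
	 *	  "state": "enabled",
	 *	  "cuse_device": "...",                    (only with SPDK_CONFIG_NVME_CUSE)
	 *	  "trid": { ... active path ... },
	 *	  "alternate_trids": [ { ... }, ... ],     (only if alternate paths exist)
	 *	  "cntlid": 1,
	 *	  "host": { "nqn": "...", "addr": "...", "svcid": "..." }
	 *	}
	 */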
spdk_json_write_named_object_begin(w, "trid"); 3506 nvme_bdev_dump_trid_json(trid, w); 3507 spdk_json_write_object_end(w); 3508 3509 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3510 if (path_id != NULL) { 3511 spdk_json_write_named_array_begin(w, "alternate_trids"); 3512 do { 3513 trid = &path_id->trid; 3514 spdk_json_write_object_begin(w); 3515 nvme_bdev_dump_trid_json(trid, w); 3516 spdk_json_write_object_end(w); 3517 3518 path_id = TAILQ_NEXT(path_id, link); 3519 } while (path_id != NULL); 3520 spdk_json_write_array_end(w); 3521 } 3522 3523 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3524 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3525 3526 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3527 spdk_json_write_named_object_begin(w, "host"); 3528 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3529 spdk_json_write_named_string(w, "addr", opts->src_addr); 3530 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3531 spdk_json_write_object_end(w); 3532 3533 spdk_json_write_object_end(w); 3534 } 3535 3536 static void 3537 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3538 struct nvme_ns *nvme_ns) 3539 { 3540 struct spdk_nvme_ns *ns; 3541 struct spdk_nvme_ctrlr *ctrlr; 3542 const struct spdk_nvme_ctrlr_data *cdata; 3543 const struct spdk_nvme_transport_id *trid; 3544 union spdk_nvme_vs_register vs; 3545 const struct spdk_nvme_ns_data *nsdata; 3546 char buf[128]; 3547 3548 ns = nvme_ns->ns; 3549 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3550 3551 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3552 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3553 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3554 3555 spdk_json_write_object_begin(w); 3556 3557 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3558 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3559 } 3560 3561 spdk_json_write_named_object_begin(w, "trid"); 3562 3563 nvme_bdev_dump_trid_json(trid, w); 3564 3565 spdk_json_write_object_end(w); 3566 3567 #ifdef SPDK_CONFIG_NVME_CUSE 3568 size_t cuse_name_size = 128; 3569 char cuse_name[cuse_name_size]; 3570 3571 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3572 cuse_name, &cuse_name_size); 3573 if (rc == 0) { 3574 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3575 } 3576 #endif 3577 3578 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3579 3580 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3581 3582 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3583 3584 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3585 spdk_str_trim(buf); 3586 spdk_json_write_named_string(w, "model_number", buf); 3587 3588 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3589 spdk_str_trim(buf); 3590 spdk_json_write_named_string(w, "serial_number", buf); 3591 3592 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3593 spdk_str_trim(buf); 3594 spdk_json_write_named_string(w, "firmware_revision", buf); 3595 3596 if (cdata->subnqn[0] != '\0') { 3597 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3598 } 3599 3600 spdk_json_write_named_object_begin(w, "oacs"); 3601 3602 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3603 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3604 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3605 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3606 3607 spdk_json_write_object_end(w); 3608 3609 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
3610 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3611 3612 spdk_json_write_object_end(w); 3613 3614 spdk_json_write_named_object_begin(w, "vs"); 3615 3616 spdk_json_write_name(w, "nvme_version"); 3617 if (vs.bits.ter) { 3618 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3619 } else { 3620 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3621 } 3622 3623 spdk_json_write_object_end(w); 3624 3625 nsdata = spdk_nvme_ns_get_data(ns); 3626 3627 spdk_json_write_named_object_begin(w, "ns_data"); 3628 3629 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3630 3631 if (cdata->cmic.ana_reporting) { 3632 spdk_json_write_named_string(w, "ana_state", 3633 _nvme_ana_state_str(nvme_ns->ana_state)); 3634 } 3635 3636 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3637 3638 spdk_json_write_object_end(w); 3639 3640 if (cdata->oacs.security) { 3641 spdk_json_write_named_object_begin(w, "security"); 3642 3643 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3644 3645 spdk_json_write_object_end(w); 3646 } 3647 3648 spdk_json_write_object_end(w); 3649 } 3650 3651 static const char * 3652 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3653 { 3654 switch (nbdev->mp_policy) { 3655 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3656 return "active_passive"; 3657 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3658 return "active_active"; 3659 default: 3660 assert(false); 3661 return "invalid"; 3662 } 3663 } 3664 3665 static int 3666 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3667 { 3668 struct nvme_bdev *nvme_bdev = ctx; 3669 struct nvme_ns *nvme_ns; 3670 3671 pthread_mutex_lock(&nvme_bdev->mutex); 3672 spdk_json_write_named_array_begin(w, "nvme"); 3673 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3674 nvme_namespace_info_json(w, nvme_ns); 3675 } 3676 spdk_json_write_array_end(w); 3677 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3678 pthread_mutex_unlock(&nvme_bdev->mutex); 3679 3680 return 0; 3681 } 3682 3683 static void 3684 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3685 { 3686 /* No config per bdev needed */ 3687 } 3688 3689 static uint64_t 3690 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3691 { 3692 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3693 struct nvme_io_path *io_path; 3694 struct nvme_poll_group *group; 3695 uint64_t spin_time = 0; 3696 3697 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3698 group = io_path->qpair->group; 3699 3700 if (!group || !group->collect_spin_stat) { 3701 continue; 3702 } 3703 3704 if (group->end_ticks != 0) { 3705 group->spin_ticks += (group->end_ticks - group->start_ticks); 3706 group->end_ticks = 0; 3707 } 3708 3709 spin_time += group->spin_ticks; 3710 group->start_ticks = 0; 3711 group->spin_ticks = 0; 3712 } 3713 3714 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3715 } 3716 3717 static void 3718 bdev_nvme_reset_device_stat(void *ctx) 3719 { 3720 struct nvme_bdev *nbdev = ctx; 3721 3722 if (nbdev->err_stat != NULL) { 3723 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3724 } 3725 } 3726 3727 /* The JSON key should be a lowercase, underscore-delimited string.
*/ 3728 static void 3729 bdev_nvme_format_nvme_status(char *dst, const char *src) 3730 { 3731 char tmp[256]; 3732 3733 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3734 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3735 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3736 spdk_strlwr(dst); 3737 } 3738 3739 static void 3740 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3741 { 3742 struct nvme_bdev *nbdev = ctx; 3743 struct spdk_nvme_status status = {}; 3744 uint16_t sct, sc; 3745 char status_json[256]; 3746 const char *status_str; 3747 3748 if (nbdev->err_stat == NULL) { 3749 return; 3750 } 3751 3752 spdk_json_write_named_object_begin(w, "nvme_error"); 3753 3754 spdk_json_write_named_object_begin(w, "status_type"); 3755 for (sct = 0; sct < 8; sct++) { 3756 if (nbdev->err_stat->status_type[sct] == 0) { 3757 continue; 3758 } 3759 status.sct = sct; 3760 3761 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3762 assert(status_str != NULL); 3763 bdev_nvme_format_nvme_status(status_json, status_str); 3764 3765 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3766 } 3767 spdk_json_write_object_end(w); 3768 3769 spdk_json_write_named_object_begin(w, "status_code"); 3770 for (sct = 0; sct < 4; sct++) { 3771 status.sct = sct; 3772 for (sc = 0; sc < 256; sc++) { 3773 if (nbdev->err_stat->status[sct][sc] == 0) { 3774 continue; 3775 } 3776 status.sc = sc; 3777 3778 status_str = spdk_nvme_cpl_get_status_string(&status); 3779 assert(status_str != NULL); 3780 bdev_nvme_format_nvme_status(status_json, status_str); 3781 3782 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3783 } 3784 } 3785 spdk_json_write_object_end(w); 3786 3787 spdk_json_write_object_end(w); 3788 } 3789 3790 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3791 .destruct = bdev_nvme_destruct, 3792 .submit_request = bdev_nvme_submit_request, 3793 .io_type_supported = bdev_nvme_io_type_supported, 3794 .get_io_channel = bdev_nvme_get_io_channel, 3795 .dump_info_json = bdev_nvme_dump_info_json, 3796 .write_config_json = bdev_nvme_write_config_json, 3797 .get_spin_time = bdev_nvme_get_spin_time, 3798 .get_module_ctx = bdev_nvme_get_module_ctx, 3799 .get_memory_domains = bdev_nvme_get_memory_domains, 3800 .reset_device_stat = bdev_nvme_reset_device_stat, 3801 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3802 }; 3803 3804 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3805 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3806 3807 static int 3808 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3809 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3810 { 3811 struct spdk_nvme_ana_group_descriptor *copied_desc; 3812 uint8_t *orig_desc; 3813 uint32_t i, desc_size, copy_len; 3814 int rc = 0; 3815 3816 if (nvme_ctrlr->ana_log_page == NULL) { 3817 return -EINVAL; 3818 } 3819 3820 copied_desc = nvme_ctrlr->copied_ana_desc; 3821 3822 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3823 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3824 3825 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3826 memcpy(copied_desc, orig_desc, copy_len); 3827 3828 rc = cb_fn(copied_desc, cb_arg); 3829 if (rc != 0) { 3830 break; 3831 } 3832 3833 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3834 copied_desc->num_of_nsid * sizeof(uint32_t); 3835 orig_desc += desc_size; 3836 copy_len -= desc_size; 3837 } 3838 
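	/* Each iteration above copies one variable-length ANA group descriptor (a fixed
	 * spdk_nvme_ana_group_descriptor header followed by num_of_nsid 32-bit NSIDs)
	 * from the raw log page into the pre-allocated copied_ana_desc scratch buffer
	 * before handing it to cb_fn. A non-zero return from cb_fn ends the walk early,
	 * e.g. nvme_ns_set_ana_state() returns 1 once it finds the descriptor that
	 * lists its namespace.
	 */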
3839 return rc; 3840 } 3841 3842 static int 3843 nvme_ns_ana_transition_timedout(void *ctx) 3844 { 3845 struct nvme_ns *nvme_ns = ctx; 3846 3847 spdk_poller_unregister(&nvme_ns->anatt_timer); 3848 nvme_ns->ana_transition_timedout = true; 3849 3850 return SPDK_POLLER_BUSY; 3851 } 3852 3853 static void 3854 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3855 const struct spdk_nvme_ana_group_descriptor *desc) 3856 { 3857 const struct spdk_nvme_ctrlr_data *cdata; 3858 3859 nvme_ns->ana_group_id = desc->ana_group_id; 3860 nvme_ns->ana_state = desc->ana_state; 3861 nvme_ns->ana_state_updating = false; 3862 3863 switch (nvme_ns->ana_state) { 3864 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3865 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3866 nvme_ns->ana_transition_timedout = false; 3867 spdk_poller_unregister(&nvme_ns->anatt_timer); 3868 break; 3869 3870 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3871 case SPDK_NVME_ANA_CHANGE_STATE: 3872 if (nvme_ns->anatt_timer != NULL) { 3873 break; 3874 } 3875 3876 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3877 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3878 nvme_ns, 3879 cdata->anatt * SPDK_SEC_TO_USEC); 3880 break; 3881 default: 3882 break; 3883 } 3884 } 3885 3886 static int 3887 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3888 { 3889 struct nvme_ns *nvme_ns = cb_arg; 3890 uint32_t i; 3891 3892 for (i = 0; i < desc->num_of_nsid; i++) { 3893 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3894 continue; 3895 } 3896 3897 _nvme_ns_set_ana_state(nvme_ns, desc); 3898 return 1; 3899 } 3900 3901 return 0; 3902 } 3903 3904 static struct spdk_uuid 3905 nvme_generate_uuid(const char *sn, uint32_t nsid) 3906 { 3907 struct spdk_uuid new_uuid, namespace_uuid; 3908 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3909 /* This namespace UUID was generated using uuid_generate() method. 
*/ 3910 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3911 int size; 3912 3913 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3914 3915 spdk_uuid_set_null(&new_uuid); 3916 spdk_uuid_set_null(&namespace_uuid); 3917 3918 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3919 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3920 3921 spdk_uuid_parse(&namespace_uuid, namespace_str); 3922 3923 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3924 3925 return new_uuid; 3926 } 3927 3928 static int 3929 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3930 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3931 uint32_t prchk_flags, void *ctx) 3932 { 3933 const struct spdk_uuid *uuid; 3934 const uint8_t *nguid; 3935 const struct spdk_nvme_ctrlr_data *cdata; 3936 const struct spdk_nvme_ns_data *nsdata; 3937 const struct spdk_nvme_ctrlr_opts *opts; 3938 enum spdk_nvme_csi csi; 3939 uint32_t atomic_bs, phys_bs, bs; 3940 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3941 3942 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3943 csi = spdk_nvme_ns_get_csi(ns); 3944 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3945 3946 switch (csi) { 3947 case SPDK_NVME_CSI_NVM: 3948 disk->product_name = "NVMe disk"; 3949 break; 3950 case SPDK_NVME_CSI_ZNS: 3951 disk->product_name = "NVMe ZNS disk"; 3952 disk->zoned = true; 3953 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3954 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3955 spdk_nvme_ns_get_extended_sector_size(ns); 3956 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 3957 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 3958 break; 3959 default: 3960 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 3961 return -ENOTSUP; 3962 } 3963 3964 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 3965 if (!disk->name) { 3966 return -ENOMEM; 3967 } 3968 3969 disk->write_cache = 0; 3970 if (cdata->vwc.present) { 3971 /* Enable if the Volatile Write Cache exists */ 3972 disk->write_cache = 1; 3973 } 3974 if (cdata->oncs.write_zeroes) { 3975 disk->max_write_zeroes = UINT16_MAX + 1; 3976 } 3977 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 3978 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 3979 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 3980 /* NVMe driver will split one request into multiple requests 3981 * based on MDTS and stripe boundary, the bdev layer will use 3982 * max_segment_size and max_num_segments to split one big IO 3983 * into multiple requests, then small request can't run out 3984 * of NVMe internal requests data structure. 
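 * (Capping max_num_segments at io_queue_requests / 2 below keeps enough request
 * objects free that the children of a single split I/O cannot exhaust the queue.)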
3985 */ 3986 if (opts && opts->io_queue_requests) { 3987 disk->max_num_segments = opts->io_queue_requests / 2; 3988 } 3989 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 3990 3991 nguid = spdk_nvme_ns_get_nguid(ns); 3992 if (!nguid) { 3993 uuid = spdk_nvme_ns_get_uuid(ns); 3994 if (uuid) { 3995 disk->uuid = *uuid; 3996 } else if (g_opts.generate_uuids) { 3997 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 3998 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 3999 } 4000 } else { 4001 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4002 } 4003 4004 nsdata = spdk_nvme_ns_get_data(ns); 4005 bs = spdk_nvme_ns_get_sector_size(ns); 4006 atomic_bs = bs; 4007 phys_bs = bs; 4008 if (nsdata->nabo == 0) { 4009 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4010 atomic_bs = bs * (1 + nsdata->nawupf); 4011 } else { 4012 atomic_bs = bs * (1 + cdata->awupf); 4013 } 4014 } 4015 if (nsdata->nsfeat.optperf) { 4016 phys_bs = bs * (1 + nsdata->npwg); 4017 } 4018 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4019 4020 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4021 if (disk->md_len != 0) { 4022 disk->md_interleave = nsdata->flbas.extended; 4023 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4024 if (disk->dif_type != SPDK_DIF_DISABLE) { 4025 disk->dif_is_head_of_md = nsdata->dps.md_start; 4026 disk->dif_check_flags = prchk_flags; 4027 } 4028 } 4029 4030 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4031 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4032 disk->acwu = 0; 4033 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4034 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4035 } else { 4036 disk->acwu = cdata->acwu + 1; /* 0-based */ 4037 } 4038 4039 if (cdata->oncs.copy) { 4040 /* For now bdev interface allows only single segment copy */ 4041 disk->max_copy = nsdata->mssrl; 4042 } 4043 4044 disk->ctxt = ctx; 4045 disk->fn_table = &nvmelib_fn_table; 4046 disk->module = &nvme_if; 4047 4048 return 0; 4049 } 4050 4051 static struct nvme_bdev * 4052 nvme_bdev_alloc(void) 4053 { 4054 struct nvme_bdev *bdev; 4055 int rc; 4056 4057 bdev = calloc(1, sizeof(*bdev)); 4058 if (!bdev) { 4059 SPDK_ERRLOG("bdev calloc() failed\n"); 4060 return NULL; 4061 } 4062 4063 if (g_opts.nvme_error_stat) { 4064 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4065 if (!bdev->err_stat) { 4066 SPDK_ERRLOG("err_stat calloc() failed\n"); 4067 free(bdev); 4068 return NULL; 4069 } 4070 } 4071 4072 rc = pthread_mutex_init(&bdev->mutex, NULL); 4073 if (rc != 0) { 4074 free(bdev->err_stat); 4075 free(bdev); 4076 return NULL; 4077 } 4078 4079 bdev->ref = 1; 4080 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4081 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4082 bdev->rr_min_io = UINT32_MAX; 4083 TAILQ_INIT(&bdev->nvme_ns_list); 4084 4085 return bdev; 4086 } 4087 4088 static int 4089 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4090 { 4091 struct nvme_bdev *bdev; 4092 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4093 int rc; 4094 4095 bdev = nvme_bdev_alloc(); 4096 if (bdev == NULL) { 4097 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4098 return -ENOMEM; 4099 } 4100 4101 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4102 4103 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4104 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4105 if (rc != 0) { 4106 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4107 nvme_bdev_free(bdev); 4108 return rc; 4109 } 4110 
4111 spdk_io_device_register(bdev, 4112 bdev_nvme_create_bdev_channel_cb, 4113 bdev_nvme_destroy_bdev_channel_cb, 4114 sizeof(struct nvme_bdev_channel), 4115 bdev->disk.name); 4116 4117 nvme_ns->bdev = bdev; 4118 bdev->nsid = nvme_ns->id; 4119 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4120 4121 bdev->nbdev_ctrlr = nbdev_ctrlr; 4122 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4123 4124 rc = spdk_bdev_register(&bdev->disk); 4125 if (rc != 0) { 4126 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4127 spdk_io_device_unregister(bdev, NULL); 4128 nvme_ns->bdev = NULL; 4129 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4130 nvme_bdev_free(bdev); 4131 return rc; 4132 } 4133 4134 return 0; 4135 } 4136 4137 static bool 4138 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4139 { 4140 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4141 const struct spdk_uuid *uuid1, *uuid2; 4142 4143 nsdata1 = spdk_nvme_ns_get_data(ns1); 4144 nsdata2 = spdk_nvme_ns_get_data(ns2); 4145 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4146 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4147 4148 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4149 nsdata1->eui64 == nsdata2->eui64 && 4150 ((uuid1 == NULL && uuid2 == NULL) || 4151 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4152 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4153 } 4154 4155 static bool 4156 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4157 struct spdk_nvme_ctrlr_opts *opts) 4158 { 4159 struct nvme_probe_skip_entry *entry; 4160 4161 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4162 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4163 return false; 4164 } 4165 } 4166 4167 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4168 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4169 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4170 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4171 opts->disable_read_ana_log_page = true; 4172 4173 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4174 4175 return true; 4176 } 4177 4178 static void 4179 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4180 { 4181 struct nvme_ctrlr *nvme_ctrlr = ctx; 4182 4183 if (spdk_nvme_cpl_is_error(cpl)) { 4184 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4185 cpl->status.sct); 4186 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4187 } else if (cpl->cdw0 & 0x1) { 4188 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4189 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4190 } 4191 } 4192 4193 static void 4194 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4195 struct spdk_nvme_qpair *qpair, uint16_t cid) 4196 { 4197 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4198 union spdk_nvme_csts_register csts; 4199 int rc; 4200 4201 assert(nvme_ctrlr->ctrlr == ctrlr); 4202 4203 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4204 4205 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4206 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4207 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4208 * completion recursively. 
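 * For a fabrics admin command timeout (qpair == NULL) the register read is therefore
 * skipped and control falls through to the configured timeout action below.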
4209 */ 4210 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4211 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4212 if (csts.bits.cfs) { 4213 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4214 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4215 return; 4216 } 4217 } 4218 4219 switch (g_opts.action_on_timeout) { 4220 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4221 if (qpair) { 4222 /* Don't send abort to ctrlr when ctrlr is not available. */ 4223 pthread_mutex_lock(&nvme_ctrlr->mutex); 4224 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4225 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4226 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4227 return; 4228 } 4229 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4230 4231 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4232 nvme_abort_cpl, nvme_ctrlr); 4233 if (rc == 0) { 4234 return; 4235 } 4236 4237 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4238 } 4239 4240 /* FALLTHROUGH */ 4241 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4242 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4243 break; 4244 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4245 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4246 break; 4247 default: 4248 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4249 break; 4250 } 4251 } 4252 4253 static struct nvme_ns * 4254 nvme_ns_alloc(void) 4255 { 4256 struct nvme_ns *nvme_ns; 4257 4258 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4259 if (nvme_ns == NULL) { 4260 return NULL; 4261 } 4262 4263 if (g_opts.io_path_stat) { 4264 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4265 if (nvme_ns->stat == NULL) { 4266 free(nvme_ns); 4267 return NULL; 4268 } 4269 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4270 } 4271 4272 return nvme_ns; 4273 } 4274 4275 static void 4276 nvme_ns_free(struct nvme_ns *nvme_ns) 4277 { 4278 free(nvme_ns->stat); 4279 free(nvme_ns); 4280 } 4281 4282 static void 4283 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4284 { 4285 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4286 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4287 4288 if (rc == 0) { 4289 nvme_ns->probe_ctx = NULL; 4290 pthread_mutex_lock(&nvme_ctrlr->mutex); 4291 nvme_ctrlr->ref++; 4292 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4293 } else { 4294 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4295 nvme_ns_free(nvme_ns); 4296 } 4297 4298 if (ctx) { 4299 ctx->populates_in_progress--; 4300 if (ctx->populates_in_progress == 0) { 4301 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4302 } 4303 } 4304 } 4305 4306 static void 4307 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4308 { 4309 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4310 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4311 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4312 int rc; 4313 4314 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4315 if (rc != 0) { 4316 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4317 } 4318 4319 spdk_for_each_channel_continue(i, rc); 4320 } 4321 4322 static void 4323 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4324 { 4325 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4326 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4327 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4328 struct nvme_io_path *io_path; 4329 4330 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4331 if (io_path != NULL) { 4332 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4333 } 4334 4335 spdk_for_each_channel_continue(i, 0); 4336 } 4337 4338 static void 4339 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4340 { 4341 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4342 4343 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4344 } 4345 4346 static void 4347 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4348 { 4349 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4350 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4351 4352 if (status == 0) { 4353 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4354 } else { 4355 /* Delete the added io_paths and fail populating the namespace. */ 4356 spdk_for_each_channel(bdev, 4357 bdev_nvme_delete_io_path, 4358 nvme_ns, 4359 bdev_nvme_add_io_path_failed); 4360 } 4361 } 4362 4363 static int 4364 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4365 { 4366 struct nvme_ns *tmp_ns; 4367 const struct spdk_nvme_ns_data *nsdata; 4368 4369 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4370 if (!nsdata->nmic.can_share) { 4371 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4372 return -EINVAL; 4373 } 4374 4375 pthread_mutex_lock(&bdev->mutex); 4376 4377 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4378 assert(tmp_ns != NULL); 4379 4380 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4381 pthread_mutex_unlock(&bdev->mutex); 4382 SPDK_ERRLOG("Namespaces are not identical.\n"); 4383 return -EINVAL; 4384 } 4385 4386 bdev->ref++; 4387 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4388 nvme_ns->bdev = bdev; 4389 4390 pthread_mutex_unlock(&bdev->mutex); 4391 4392 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
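 * Each nvme_bdev_channel owns its io_path list, so the addition has to run on every
 * channel's thread; spdk_for_each_channel() below does that and
 * bdev_nvme_add_io_path_done() reports the combined result, rolling back on failure.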
*/ 4393 spdk_for_each_channel(bdev, 4394 bdev_nvme_add_io_path, 4395 nvme_ns, 4396 bdev_nvme_add_io_path_done); 4397 4398 return 0; 4399 } 4400 4401 static void 4402 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4403 { 4404 struct spdk_nvme_ns *ns; 4405 struct nvme_bdev *bdev; 4406 int rc = 0; 4407 4408 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4409 if (!ns) { 4410 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4411 rc = -EINVAL; 4412 goto done; 4413 } 4414 4415 nvme_ns->ns = ns; 4416 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4417 4418 if (nvme_ctrlr->ana_log_page != NULL) { 4419 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4420 } 4421 4422 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4423 if (bdev == NULL) { 4424 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4425 } else { 4426 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4427 if (rc == 0) { 4428 return; 4429 } 4430 } 4431 done: 4432 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4433 } 4434 4435 static void 4436 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4437 { 4438 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4439 4440 assert(nvme_ctrlr != NULL); 4441 4442 pthread_mutex_lock(&nvme_ctrlr->mutex); 4443 4444 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4445 4446 if (nvme_ns->bdev != NULL) { 4447 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4448 return; 4449 } 4450 4451 nvme_ns_free(nvme_ns); 4452 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4453 4454 nvme_ctrlr_release(nvme_ctrlr); 4455 } 4456 4457 static void 4458 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4459 { 4460 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4461 4462 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4463 } 4464 4465 static void 4466 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4467 { 4468 struct nvme_bdev *bdev; 4469 4470 spdk_poller_unregister(&nvme_ns->anatt_timer); 4471 4472 bdev = nvme_ns->bdev; 4473 if (bdev != NULL) { 4474 pthread_mutex_lock(&bdev->mutex); 4475 4476 assert(bdev->ref > 0); 4477 bdev->ref--; 4478 if (bdev->ref == 0) { 4479 pthread_mutex_unlock(&bdev->mutex); 4480 4481 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4482 } else { 4483 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4484 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4485 * and clear nvme_ns->bdev here. 4486 */ 4487 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4488 nvme_ns->bdev = NULL; 4489 4490 pthread_mutex_unlock(&bdev->mutex); 4491 4492 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4493 * we call depopulate_namespace_done() to avoid use-after-free. 4494 */ 4495 spdk_for_each_channel(bdev, 4496 bdev_nvme_delete_io_path, 4497 nvme_ns, 4498 bdev_nvme_delete_io_path_done); 4499 return; 4500 } 4501 } 4502 4503 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4504 } 4505 4506 static void 4507 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4508 struct nvme_async_probe_ctx *ctx) 4509 { 4510 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4511 struct nvme_ns *nvme_ns, *next; 4512 struct spdk_nvme_ns *ns; 4513 struct nvme_bdev *bdev; 4514 uint32_t nsid; 4515 int rc; 4516 uint64_t num_sectors; 4517 4518 if (ctx) { 4519 /* Initialize this count to 1 to handle the populate functions 4520 * calling nvme_ctrlr_populate_namespace_done() immediately. 
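 * The matching decrement after the scan below drops this initial reference, so
 * nvme_ctrlr_populate_namespaces_done() runs exactly once whether the per-namespace
 * populates complete synchronously or asynchronously.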
4521 */ 4522 ctx->populates_in_progress = 1; 4523 } 4524 4525 /* First loop over our existing namespaces and see if they have been 4526 * removed. */ 4527 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4528 while (nvme_ns != NULL) { 4529 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4530 4531 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4532 /* NS is still there but attributes may have changed */ 4533 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4534 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4535 bdev = nvme_ns->bdev; 4536 assert(bdev != NULL); 4537 if (bdev->disk.blockcnt != num_sectors) { 4538 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4539 nvme_ns->id, 4540 bdev->disk.name, 4541 bdev->disk.blockcnt, 4542 num_sectors); 4543 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4544 if (rc != 0) { 4545 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4546 bdev->disk.name, rc); 4547 } 4548 } 4549 } else { 4550 /* Namespace was removed */ 4551 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4552 } 4553 4554 nvme_ns = next; 4555 } 4556 4557 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4558 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4559 while (nsid != 0) { 4560 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4561 4562 if (nvme_ns == NULL) { 4563 /* Found a new one */ 4564 nvme_ns = nvme_ns_alloc(); 4565 if (nvme_ns == NULL) { 4566 SPDK_ERRLOG("Failed to allocate namespace\n"); 4567 /* This just fails to attach the namespace. It may work on a future attempt. */ 4568 continue; 4569 } 4570 4571 nvme_ns->id = nsid; 4572 nvme_ns->ctrlr = nvme_ctrlr; 4573 4574 nvme_ns->bdev = NULL; 4575 4576 if (ctx) { 4577 ctx->populates_in_progress++; 4578 } 4579 nvme_ns->probe_ctx = ctx; 4580 4581 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4582 4583 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4584 } 4585 4586 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4587 } 4588 4589 if (ctx) { 4590 /* Decrement this count now that the loop is over to account 4591 * for the one we started with. If the count is then 0, we 4592 * know any populate_namespace functions completed immediately, 4593 * so we'll kick the callback here. 
4594 */ 4595 ctx->populates_in_progress--; 4596 if (ctx->populates_in_progress == 0) { 4597 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4598 } 4599 } 4600 4601 } 4602 4603 static void 4604 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4605 { 4606 struct nvme_ns *nvme_ns, *tmp; 4607 4608 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4609 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4610 } 4611 } 4612 4613 static uint32_t 4614 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4615 { 4616 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4617 const struct spdk_nvme_ctrlr_data *cdata; 4618 uint32_t nsid, ns_count = 0; 4619 4620 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4621 4622 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4623 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4624 ns_count++; 4625 } 4626 4627 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4628 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4629 sizeof(uint32_t); 4630 } 4631 4632 static int 4633 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4634 void *cb_arg) 4635 { 4636 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4637 struct nvme_ns *nvme_ns; 4638 uint32_t i, nsid; 4639 4640 for (i = 0; i < desc->num_of_nsid; i++) { 4641 nsid = desc->nsid[i]; 4642 if (nsid == 0) { 4643 continue; 4644 } 4645 4646 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4647 4648 assert(nvme_ns != NULL); 4649 if (nvme_ns == NULL) { 4650 /* Target told us that an inactive namespace had an ANA change */ 4651 continue; 4652 } 4653 4654 _nvme_ns_set_ana_state(nvme_ns, desc); 4655 } 4656 4657 return 0; 4658 } 4659 4660 static void 4661 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4662 { 4663 struct nvme_ns *nvme_ns; 4664 4665 spdk_free(nvme_ctrlr->ana_log_page); 4666 nvme_ctrlr->ana_log_page = NULL; 4667 4668 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4669 nvme_ns != NULL; 4670 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4671 nvme_ns->ana_state_updating = false; 4672 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4673 } 4674 } 4675 4676 static void 4677 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4678 { 4679 struct nvme_ctrlr *nvme_ctrlr = ctx; 4680 4681 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4682 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4683 nvme_ctrlr); 4684 } else { 4685 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4686 } 4687 4688 pthread_mutex_lock(&nvme_ctrlr->mutex); 4689 4690 assert(nvme_ctrlr->ana_log_page_updating == true); 4691 nvme_ctrlr->ana_log_page_updating = false; 4692 4693 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4694 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4695 4696 nvme_ctrlr_unregister(nvme_ctrlr); 4697 } else { 4698 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4699 4700 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4701 } 4702 } 4703 4704 static int 4705 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4706 { 4707 uint32_t ana_log_page_size; 4708 int rc; 4709 4710 if (nvme_ctrlr->ana_log_page == NULL) { 4711 return -EINVAL; 4712 } 4713 4714 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4715 4716 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4717 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4718 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4719 
return -EINVAL; 4720 } 4721 4722 pthread_mutex_lock(&nvme_ctrlr->mutex); 4723 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4724 nvme_ctrlr->ana_log_page_updating) { 4725 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4726 return -EBUSY; 4727 } 4728 4729 nvme_ctrlr->ana_log_page_updating = true; 4730 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4731 4732 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4733 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4734 SPDK_NVME_GLOBAL_NS_TAG, 4735 nvme_ctrlr->ana_log_page, 4736 ana_log_page_size, 0, 4737 nvme_ctrlr_read_ana_log_page_done, 4738 nvme_ctrlr); 4739 if (rc != 0) { 4740 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4741 } 4742 4743 return rc; 4744 } 4745 4746 static void 4747 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4748 { 4749 } 4750 4751 struct bdev_nvme_set_preferred_path_ctx { 4752 struct spdk_bdev_desc *desc; 4753 struct nvme_ns *nvme_ns; 4754 bdev_nvme_set_preferred_path_cb cb_fn; 4755 void *cb_arg; 4756 }; 4757 4758 static void 4759 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4760 { 4761 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4762 4763 assert(ctx != NULL); 4764 assert(ctx->desc != NULL); 4765 assert(ctx->cb_fn != NULL); 4766 4767 spdk_bdev_close(ctx->desc); 4768 4769 ctx->cb_fn(ctx->cb_arg, status); 4770 4771 free(ctx); 4772 } 4773 4774 static void 4775 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4776 { 4777 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4778 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4779 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4780 struct nvme_io_path *io_path, *prev; 4781 4782 prev = NULL; 4783 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4784 if (io_path->nvme_ns == ctx->nvme_ns) { 4785 break; 4786 } 4787 prev = io_path; 4788 } 4789 4790 if (io_path != NULL) { 4791 if (prev != NULL) { 4792 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4793 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4794 } 4795 4796 /* We can set io_path to nbdev_ch->current_io_path directly here. 4797 * However, it needs to be conditional. To simplify the code, 4798 * just clear nbdev_ch->current_io_path and let find_io_path() 4799 * fill it. 4800 * 4801 * Automatic failback may be disabled. Hence even if the io_path is 4802 * already at the head, clear nbdev_ch->current_io_path. 4803 */ 4804 bdev_nvme_clear_current_io_path(nbdev_ch); 4805 } 4806 4807 spdk_for_each_channel_continue(i, 0); 4808 } 4809 4810 static struct nvme_ns * 4811 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4812 { 4813 struct nvme_ns *nvme_ns, *prev; 4814 const struct spdk_nvme_ctrlr_data *cdata; 4815 4816 prev = NULL; 4817 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4818 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4819 4820 if (cdata->cntlid == cntlid) { 4821 break; 4822 } 4823 prev = nvme_ns; 4824 } 4825 4826 if (nvme_ns != NULL && prev != NULL) { 4827 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4828 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4829 } 4830 4831 return nvme_ns; 4832 } 4833 4834 /* This function supports only multipath mode. There is only a single I/O path 4835 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4836 * head of the I/O path list for each NVMe bdev channel. 
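 * (For example, on a bdev that has paths through two controllers, passing the cntlid
 * of the other controller moves that controller's I/O path to the front on every
 * channel.)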
4837 * 4838 * NVMe bdev channel may be acquired after completing this function. move the 4839 * matched namespace to the head of the namespace list for the NVMe bdev too. 4840 */ 4841 void 4842 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4843 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4844 { 4845 struct bdev_nvme_set_preferred_path_ctx *ctx; 4846 struct spdk_bdev *bdev; 4847 struct nvme_bdev *nbdev; 4848 int rc = 0; 4849 4850 assert(cb_fn != NULL); 4851 4852 ctx = calloc(1, sizeof(*ctx)); 4853 if (ctx == NULL) { 4854 SPDK_ERRLOG("Failed to alloc context.\n"); 4855 rc = -ENOMEM; 4856 goto err_alloc; 4857 } 4858 4859 ctx->cb_fn = cb_fn; 4860 ctx->cb_arg = cb_arg; 4861 4862 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4863 if (rc != 0) { 4864 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4865 goto err_open; 4866 } 4867 4868 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4869 4870 if (bdev->module != &nvme_if) { 4871 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4872 rc = -ENODEV; 4873 goto err_bdev; 4874 } 4875 4876 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4877 4878 pthread_mutex_lock(&nbdev->mutex); 4879 4880 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4881 if (ctx->nvme_ns == NULL) { 4882 pthread_mutex_unlock(&nbdev->mutex); 4883 4884 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4885 rc = -ENODEV; 4886 goto err_bdev; 4887 } 4888 4889 pthread_mutex_unlock(&nbdev->mutex); 4890 4891 spdk_for_each_channel(nbdev, 4892 _bdev_nvme_set_preferred_path, 4893 ctx, 4894 bdev_nvme_set_preferred_path_done); 4895 return; 4896 4897 err_bdev: 4898 spdk_bdev_close(ctx->desc); 4899 err_open: 4900 free(ctx); 4901 err_alloc: 4902 cb_fn(cb_arg, rc); 4903 } 4904 4905 struct bdev_nvme_set_multipath_policy_ctx { 4906 struct spdk_bdev_desc *desc; 4907 bdev_nvme_set_multipath_policy_cb cb_fn; 4908 void *cb_arg; 4909 }; 4910 4911 static void 4912 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4913 { 4914 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4915 4916 assert(ctx != NULL); 4917 assert(ctx->desc != NULL); 4918 assert(ctx->cb_fn != NULL); 4919 4920 spdk_bdev_close(ctx->desc); 4921 4922 ctx->cb_fn(ctx->cb_arg, status); 4923 4924 free(ctx); 4925 } 4926 4927 static void 4928 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4929 { 4930 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4931 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4932 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4933 4934 nbdev_ch->mp_policy = nbdev->mp_policy; 4935 nbdev_ch->mp_selector = nbdev->mp_selector; 4936 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4937 bdev_nvme_clear_current_io_path(nbdev_ch); 4938 4939 spdk_for_each_channel_continue(i, 0); 4940 } 4941 4942 void 4943 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4944 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4945 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4946 { 4947 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4948 struct spdk_bdev *bdev; 4949 struct nvme_bdev *nbdev; 4950 int rc; 4951 4952 assert(cb_fn != NULL); 4953 4954 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4955 if (rr_min_io == UINT32_MAX) { 4956 rr_min_io = 1; 4957 } else if (rr_min_io == 0) { 4958 rc = -EINVAL; 
4959 goto exit; 4960 } 4961 } else if (rr_min_io != UINT32_MAX) { 4962 rc = -EINVAL; 4963 goto exit; 4964 } 4965 4966 ctx = calloc(1, sizeof(*ctx)); 4967 if (ctx == NULL) { 4968 SPDK_ERRLOG("Failed to alloc context.\n"); 4969 rc = -ENOMEM; 4970 goto exit; 4971 } 4972 4973 ctx->cb_fn = cb_fn; 4974 ctx->cb_arg = cb_arg; 4975 4976 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4977 if (rc != 0) { 4978 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4979 rc = -ENODEV; 4980 goto err_open; 4981 } 4982 4983 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4984 if (bdev->module != &nvme_if) { 4985 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4986 rc = -ENODEV; 4987 goto err_module; 4988 } 4989 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4990 4991 pthread_mutex_lock(&nbdev->mutex); 4992 nbdev->mp_policy = policy; 4993 nbdev->mp_selector = selector; 4994 nbdev->rr_min_io = rr_min_io; 4995 pthread_mutex_unlock(&nbdev->mutex); 4996 4997 spdk_for_each_channel(nbdev, 4998 _bdev_nvme_set_multipath_policy, 4999 ctx, 5000 bdev_nvme_set_multipath_policy_done); 5001 return; 5002 5003 err_module: 5004 spdk_bdev_close(ctx->desc); 5005 err_open: 5006 free(ctx); 5007 exit: 5008 cb_fn(cb_arg, rc); 5009 } 5010 5011 static void 5012 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5013 { 5014 struct nvme_ctrlr *nvme_ctrlr = arg; 5015 union spdk_nvme_async_event_completion event; 5016 5017 if (spdk_nvme_cpl_is_error(cpl)) { 5018 SPDK_WARNLOG("AER request execute failed\n"); 5019 return; 5020 } 5021 5022 event.raw = cpl->cdw0; 5023 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5024 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5025 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5026 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5027 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5028 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5029 } 5030 } 5031 5032 static void 5033 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 5034 { 5035 if (ctx->cb_fn) { 5036 ctx->cb_fn(ctx->cb_ctx, count, rc); 5037 } 5038 5039 ctx->namespaces_populated = true; 5040 if (ctx->probe_done) { 5041 /* The probe was already completed, so we need to free the context 5042 * here. This can happen for cases like OCSSD, where we need to 5043 * send additional commands to the SSD after attach. 
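 * Otherwise bdev_nvme_async_poll() still owns the context and frees it once it
 * observes namespaces_populated.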
5044 */ 5045 free(ctx); 5046 } 5047 } 5048 5049 static void 5050 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5051 struct nvme_async_probe_ctx *ctx) 5052 { 5053 spdk_io_device_register(nvme_ctrlr, 5054 bdev_nvme_create_ctrlr_channel_cb, 5055 bdev_nvme_destroy_ctrlr_channel_cb, 5056 sizeof(struct nvme_ctrlr_channel), 5057 nvme_ctrlr->nbdev_ctrlr->name); 5058 5059 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5060 } 5061 5062 static void 5063 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5064 { 5065 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5066 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5067 5068 nvme_ctrlr->probe_ctx = NULL; 5069 5070 if (spdk_nvme_cpl_is_error(cpl)) { 5071 nvme_ctrlr_delete(nvme_ctrlr); 5072 5073 if (ctx != NULL) { 5074 populate_namespaces_cb(ctx, 0, -1); 5075 } 5076 return; 5077 } 5078 5079 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5080 } 5081 5082 static int 5083 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5084 struct nvme_async_probe_ctx *ctx) 5085 { 5086 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5087 const struct spdk_nvme_ctrlr_data *cdata; 5088 uint32_t ana_log_page_size; 5089 5090 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5091 5092 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5093 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5094 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5095 sizeof(uint32_t); 5096 5097 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5098 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5099 if (nvme_ctrlr->ana_log_page == NULL) { 5100 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5101 return -ENXIO; 5102 } 5103 5104 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5105 * Hence copy each descriptor to a temporary area when parsing it. 5106 * 5107 * Allocate a buffer whose size is as large as ANA log page buffer because 5108 * we do not know the size of a descriptor until actually reading it. 5109 */ 5110 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5111 if (nvme_ctrlr->copied_ana_desc == NULL) { 5112 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5113 return -ENOMEM; 5114 } 5115 5116 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5117 5118 nvme_ctrlr->probe_ctx = ctx; 5119 5120 /* Then, set the read size only to include the current active namespaces. */ 5121 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5122 5123 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5124 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5125 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5126 return -EINVAL; 5127 } 5128 5129 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5130 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5131 SPDK_NVME_GLOBAL_NS_TAG, 5132 nvme_ctrlr->ana_log_page, 5133 ana_log_page_size, 0, 5134 nvme_ctrlr_init_ana_log_page_done, 5135 nvme_ctrlr); 5136 } 5137 5138 /* hostnqn and subnqn were already verified before attaching a controller. 5139 * Hence check only the multipath capability and cntlid here. 
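 * Within an NVM subsystem a cntlid uniquely identifies a controller, so a duplicate
 * cntlid means the same controller has already been attached.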
5140 */ 5141 static bool 5142 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5143 { 5144 struct nvme_ctrlr *tmp; 5145 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5146 5147 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5148 5149 if (!cdata->cmic.multi_ctrlr) { 5150 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5151 return false; 5152 } 5153 5154 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5155 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5156 5157 if (!tmp_cdata->cmic.multi_ctrlr) { 5158 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5159 return false; 5160 } 5161 if (cdata->cntlid == tmp_cdata->cntlid) { 5162 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5163 return false; 5164 } 5165 } 5166 5167 return true; 5168 } 5169 5170 static int 5171 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5172 { 5173 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5174 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5175 int rc = 0; 5176 5177 pthread_mutex_lock(&g_bdev_nvme_mutex); 5178 5179 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5180 if (nbdev_ctrlr != NULL) { 5181 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5182 rc = -EINVAL; 5183 goto exit; 5184 } 5185 } else { 5186 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5187 if (nbdev_ctrlr == NULL) { 5188 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5189 rc = -ENOMEM; 5190 goto exit; 5191 } 5192 nbdev_ctrlr->name = strdup(name); 5193 if (nbdev_ctrlr->name == NULL) { 5194 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5195 free(nbdev_ctrlr); 5196 goto exit; 5197 } 5198 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5199 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5200 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5201 } 5202 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5203 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5204 exit: 5205 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5206 return rc; 5207 } 5208 5209 static int 5210 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5211 const char *name, 5212 const struct spdk_nvme_transport_id *trid, 5213 struct nvme_async_probe_ctx *ctx) 5214 { 5215 struct nvme_ctrlr *nvme_ctrlr; 5216 struct nvme_path_id *path_id; 5217 const struct spdk_nvme_ctrlr_data *cdata; 5218 int rc; 5219 5220 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5221 if (nvme_ctrlr == NULL) { 5222 SPDK_ERRLOG("Failed to allocate device struct\n"); 5223 return -ENOMEM; 5224 } 5225 5226 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5227 if (rc != 0) { 5228 free(nvme_ctrlr); 5229 return rc; 5230 } 5231 5232 TAILQ_INIT(&nvme_ctrlr->trids); 5233 5234 RB_INIT(&nvme_ctrlr->namespaces); 5235 5236 path_id = calloc(1, sizeof(*path_id)); 5237 if (path_id == NULL) { 5238 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5239 rc = -ENOMEM; 5240 goto err; 5241 } 5242 5243 path_id->trid = *trid; 5244 if (ctx != NULL) { 5245 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5246 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5247 } 5248 nvme_ctrlr->active_path_id = path_id; 5249 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5250 5251 nvme_ctrlr->thread = spdk_get_thread(); 5252 nvme_ctrlr->ctrlr = ctrlr; 5253 nvme_ctrlr->ref = 1; 5254 5255 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5256 SPDK_ERRLOG("OCSSDs are not supported"); 5257 rc = -ENOTSUP; 5258 goto err; 5259 } 5260 5261 
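/* Per-controller options come from the caller's probe context when the attach was
 * requested through bdev_nvme_create(); hotplug attaches pass a NULL context and
 * fall back to the module defaults.
 */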
if (ctx != NULL) { 5262 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5263 } else { 5264 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5265 } 5266 5267 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5268 g_opts.nvme_adminq_poll_period_us); 5269 5270 if (g_opts.timeout_us > 0) { 5271 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5272 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5273 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5274 g_opts.timeout_us : g_opts.timeout_admin_us; 5275 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5276 adm_timeout_us, timeout_cb, nvme_ctrlr); 5277 } 5278 5279 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5280 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5281 5282 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5283 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5284 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5285 } 5286 5287 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5288 if (rc != 0) { 5289 goto err; 5290 } 5291 5292 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5293 5294 if (cdata->cmic.ana_reporting) { 5295 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5296 if (rc == 0) { 5297 return 0; 5298 } 5299 } else { 5300 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5301 return 0; 5302 } 5303 5304 err: 5305 nvme_ctrlr_delete(nvme_ctrlr); 5306 return rc; 5307 } 5308 5309 void 5310 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5311 { 5312 opts->prchk_flags = 0; 5313 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5314 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5315 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5316 } 5317 5318 static void 5319 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5320 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5321 { 5322 char *name; 5323 5324 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5325 if (!name) { 5326 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5327 return; 5328 } 5329 5330 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5331 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5332 } else { 5333 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5334 } 5335 5336 free(name); 5337 } 5338 5339 static void 5340 _nvme_ctrlr_destruct(void *ctx) 5341 { 5342 struct nvme_ctrlr *nvme_ctrlr = ctx; 5343 5344 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5345 nvme_ctrlr_release(nvme_ctrlr); 5346 } 5347 5348 static int 5349 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5350 { 5351 struct nvme_probe_skip_entry *entry; 5352 5353 /* The controller's destruction was already started */ 5354 if (nvme_ctrlr->destruct) { 5355 return -EALREADY; 5356 } 5357 5358 if (!hotplug && 5359 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5360 entry = calloc(1, sizeof(*entry)); 5361 if (!entry) { 5362 return -ENOMEM; 5363 } 5364 entry->trid = nvme_ctrlr->active_path_id->trid; 5365 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5366 } 5367 5368 nvme_ctrlr->destruct = true; 5369 return 0; 5370 } 5371 5372 static int 5373 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5374 { 5375 int rc; 5376 5377 pthread_mutex_lock(&nvme_ctrlr->mutex); 5378 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5379 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5380 5381 if (rc == 0) { 5382 _nvme_ctrlr_destruct(nvme_ctrlr); 5383 } else if (rc == -EALREADY) { 5384 rc = 0; 5385 } 5386 5387 return rc; 5388 } 5389 5390 static void 5391 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5392 { 5393 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5394 5395 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5396 } 5397 5398 static int 5399 bdev_nvme_hotplug_probe(void *arg) 5400 { 5401 if (g_hotplug_probe_ctx == NULL) { 5402 spdk_poller_unregister(&g_hotplug_probe_poller); 5403 return SPDK_POLLER_IDLE; 5404 } 5405 5406 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5407 g_hotplug_probe_ctx = NULL; 5408 spdk_poller_unregister(&g_hotplug_probe_poller); 5409 } 5410 5411 return SPDK_POLLER_BUSY; 5412 } 5413 5414 static int 5415 bdev_nvme_hotplug(void *arg) 5416 { 5417 struct spdk_nvme_transport_id trid_pcie; 5418 5419 if (g_hotplug_probe_ctx) { 5420 return SPDK_POLLER_BUSY; 5421 } 5422 5423 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5424 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5425 5426 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5427 hotplug_probe_cb, attach_cb, NULL); 5428 5429 if (g_hotplug_probe_ctx) { 5430 assert(g_hotplug_probe_poller == NULL); 5431 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5432 } 5433 5434 return SPDK_POLLER_BUSY; 5435 } 5436 5437 void 5438 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5439 { 5440 *opts = g_opts; 5441 } 5442 5443 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5444 uint32_t reconnect_delay_sec, 5445 uint32_t fast_io_fail_timeout_sec); 5446 5447 static int 5448 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5449 { 5450 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5451 /* Can't set timeout_admin_us without also setting timeout_us */ 5452 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5453 return -EINVAL; 5454 } 5455 5456 if (opts->bdev_retry_count < -1) { 5457 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5458 return -EINVAL; 5459 } 5460 5461 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5462 opts->reconnect_delay_sec, 5463 opts->fast_io_fail_timeout_sec)) { 5464 return -EINVAL; 5465 } 5466 5467 return 0; 5468 } 5469 5470 int 5471 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5472 { 5473 int ret; 5474 5475 ret = bdev_nvme_validate_opts(opts); 5476 if (ret) { 5477 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5478 return ret; 5479 } 5480 5481 if (g_bdev_nvme_init_thread != NULL) { 5482 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5483 return -EPERM; 5484 } 5485 } 5486 5487 if (opts->rdma_srq_size != 0) { 5488 struct spdk_nvme_transport_opts drv_opts; 5489 5490 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5491 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5492 5493 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5494 if (ret) { 5495 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5496 return ret; 5497 } 5498 } 5499 5500 g_opts = *opts; 5501 5502 return 0; 5503 } 5504 5505 struct set_nvme_hotplug_ctx { 5506 uint64_t period_us; 5507 bool enabled; 5508 spdk_msg_fn fn; 5509 void *fn_ctx; 5510 }; 5511 5512 static void 5513 set_nvme_hotplug_period_cb(void *_ctx) 5514 { 5515 struct set_nvme_hotplug_ctx *ctx 
= _ctx; 5516 5517 spdk_poller_unregister(&g_hotplug_poller); 5518 if (ctx->enabled) { 5519 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5520 } 5521 5522 g_nvme_hotplug_poll_period_us = ctx->period_us; 5523 g_nvme_hotplug_enabled = ctx->enabled; 5524 if (ctx->fn) { 5525 ctx->fn(ctx->fn_ctx); 5526 } 5527 5528 free(ctx); 5529 } 5530 5531 int 5532 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5533 { 5534 struct set_nvme_hotplug_ctx *ctx; 5535 5536 if (enabled == true && !spdk_process_is_primary()) { 5537 return -EPERM; 5538 } 5539 5540 ctx = calloc(1, sizeof(*ctx)); 5541 if (ctx == NULL) { 5542 return -ENOMEM; 5543 } 5544 5545 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5546 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5547 ctx->enabled = enabled; 5548 ctx->fn = cb; 5549 ctx->fn_ctx = cb_ctx; 5550 5551 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5552 return 0; 5553 } 5554 5555 static void 5556 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5557 struct nvme_async_probe_ctx *ctx) 5558 { 5559 struct nvme_ns *nvme_ns; 5560 struct nvme_bdev *nvme_bdev; 5561 size_t j; 5562 5563 assert(nvme_ctrlr != NULL); 5564 5565 if (ctx->names == NULL) { 5566 populate_namespaces_cb(ctx, 0, 0); 5567 return; 5568 } 5569 5570 /* 5571 * Report the new bdevs that were created in this call. 5572 * There can be more than one bdev per NVMe controller. 5573 */ 5574 j = 0; 5575 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5576 while (nvme_ns != NULL) { 5577 nvme_bdev = nvme_ns->bdev; 5578 if (j < ctx->count) { 5579 ctx->names[j] = nvme_bdev->disk.name; 5580 j++; 5581 } else { 5582 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5583 ctx->count); 5584 populate_namespaces_cb(ctx, 0, -ERANGE); 5585 return; 5586 } 5587 5588 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5589 } 5590 5591 populate_namespaces_cb(ctx, j, 0); 5592 } 5593 5594 static int 5595 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5596 struct spdk_nvme_ctrlr *new_ctrlr, 5597 struct spdk_nvme_transport_id *trid) 5598 { 5599 struct nvme_path_id *tmp_trid; 5600 5601 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5602 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5603 return -ENOTSUP; 5604 } 5605 5606 /* Currently we only support failover to the same transport type. */ 5607 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5608 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5609 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5610 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5611 return -EINVAL; 5612 } 5613 5614 5615 /* Currently we only support failover to the same NQN. */ 5616 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5617 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5618 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5619 return -EINVAL; 5620 } 5621 5622 /* Skip all the other checks if we've already registered this path. 
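 * Re-adding an identical transport ID would only leave a duplicate entry in
 * nvme_ctrlr->trids, so it is rejected with -EEXIST.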
*/ 5623 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5624 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5625 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5626 trid->subnqn); 5627 return -EEXIST; 5628 } 5629 } 5630 5631 return 0; 5632 } 5633 5634 static int 5635 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5636 struct spdk_nvme_ctrlr *new_ctrlr) 5637 { 5638 struct nvme_ns *nvme_ns; 5639 struct spdk_nvme_ns *new_ns; 5640 5641 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5642 while (nvme_ns != NULL) { 5643 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5644 assert(new_ns != NULL); 5645 5646 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5647 return -EINVAL; 5648 } 5649 5650 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5651 } 5652 5653 return 0; 5654 } 5655 5656 static int 5657 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5658 struct spdk_nvme_transport_id *trid) 5659 { 5660 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5661 5662 new_trid = calloc(1, sizeof(*new_trid)); 5663 if (new_trid == NULL) { 5664 return -ENOMEM; 5665 } 5666 new_trid->trid = *trid; 5667 5668 active_id = nvme_ctrlr->active_path_id; 5669 assert(active_id != NULL); 5670 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5671 5672 /* Skip the active trid not to replace it until it is failed. */ 5673 tmp_trid = TAILQ_NEXT(active_id, link); 5674 if (tmp_trid == NULL) { 5675 goto add_tail; 5676 } 5677 5678 /* It means the trid is faled if its last failed time is non-zero. 5679 * Insert the new alternate trid before any failed trid. 5680 */ 5681 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5682 if (tmp_trid->last_failed_tsc != 0) { 5683 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5684 return 0; 5685 } 5686 } 5687 5688 add_tail: 5689 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5690 return 0; 5691 } 5692 5693 /* This is the case that a secondary path is added to an existing 5694 * nvme_ctrlr for failover. After checking if it can access the same 5695 * namespaces as the primary path, it is disconnected until failover occurs. 
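 * Only the transport ID is kept in nvme_ctrlr->trids; spdk_nvme_detach() below drops
 * the temporary connection that was used for these checks.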
5696 */ 5697 static int 5698 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5699 struct spdk_nvme_ctrlr *new_ctrlr, 5700 struct spdk_nvme_transport_id *trid) 5701 { 5702 int rc; 5703 5704 assert(nvme_ctrlr != NULL); 5705 5706 pthread_mutex_lock(&nvme_ctrlr->mutex); 5707 5708 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5709 if (rc != 0) { 5710 goto exit; 5711 } 5712 5713 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5714 if (rc != 0) { 5715 goto exit; 5716 } 5717 5718 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5719 5720 exit: 5721 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5722 5723 spdk_nvme_detach(new_ctrlr); 5724 5725 return rc; 5726 } 5727 5728 static void 5729 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5730 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5731 { 5732 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5733 struct nvme_async_probe_ctx *ctx; 5734 int rc; 5735 5736 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5737 ctx->ctrlr_attached = true; 5738 5739 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5740 if (rc != 0) { 5741 populate_namespaces_cb(ctx, 0, rc); 5742 } 5743 } 5744 5745 static void 5746 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5747 struct spdk_nvme_ctrlr *ctrlr, 5748 const struct spdk_nvme_ctrlr_opts *opts) 5749 { 5750 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5751 struct nvme_ctrlr *nvme_ctrlr; 5752 struct nvme_async_probe_ctx *ctx; 5753 int rc; 5754 5755 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5756 ctx->ctrlr_attached = true; 5757 5758 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5759 if (nvme_ctrlr) { 5760 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5761 } else { 5762 rc = -ENODEV; 5763 } 5764 5765 populate_namespaces_cb(ctx, 0, rc); 5766 } 5767 5768 static int 5769 bdev_nvme_async_poll(void *arg) 5770 { 5771 struct nvme_async_probe_ctx *ctx = arg; 5772 int rc; 5773 5774 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5775 if (spdk_unlikely(rc != -EAGAIN)) { 5776 ctx->probe_done = true; 5777 spdk_poller_unregister(&ctx->poller); 5778 if (!ctx->ctrlr_attached) { 5779 /* The probe is done, but no controller was attached. 5780 * That means we had a failure, so report -EIO back to 5781 * the caller (usually the RPC). populate_namespaces_cb() 5782 * will take care of freeing the nvme_async_probe_ctx. 5783 */ 5784 populate_namespaces_cb(ctx, 0, -EIO); 5785 } else if (ctx->namespaces_populated) { 5786 /* The namespaces for the attached controller were all 5787 * populated and the response was already sent to the 5788 * caller (usually the RPC). So free the context here. 
5789 */ 5790 free(ctx); 5791 } 5792 } 5793 5794 return SPDK_POLLER_BUSY; 5795 } 5796 5797 static bool 5798 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5799 uint32_t reconnect_delay_sec, 5800 uint32_t fast_io_fail_timeout_sec) 5801 { 5802 if (ctrlr_loss_timeout_sec < -1) { 5803 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5804 return false; 5805 } else if (ctrlr_loss_timeout_sec == -1) { 5806 if (reconnect_delay_sec == 0) { 5807 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5808 return false; 5809 } else if (fast_io_fail_timeout_sec != 0 && 5810 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5811 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 5812 return false; 5813 } 5814 } else if (ctrlr_loss_timeout_sec != 0) { 5815 if (reconnect_delay_sec == 0) { 5816 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5817 return false; 5818 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5819 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5820 return false; 5821 } else if (fast_io_fail_timeout_sec != 0) { 5822 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5823 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5824 return false; 5825 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5826 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5827 return false; 5828 } 5829 } 5830 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5831 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5832 return false; 5833 } 5834 5835 return true; 5836 } 5837 5838 int 5839 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5840 const char *base_name, 5841 const char **names, 5842 uint32_t count, 5843 spdk_bdev_create_nvme_fn cb_fn, 5844 void *cb_ctx, 5845 struct spdk_nvme_ctrlr_opts *drv_opts, 5846 struct nvme_ctrlr_opts *bdev_opts, 5847 bool multipath) 5848 { 5849 struct nvme_probe_skip_entry *entry, *tmp; 5850 struct nvme_async_probe_ctx *ctx; 5851 spdk_nvme_attach_cb attach_cb; 5852 5853 /* TODO expand this check to include both the host and target TRIDs. 5854 * Only if both are the same should we fail. 
5855 */ 5856 if (nvme_ctrlr_get(trid) != NULL) { 5857 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5858 return -EEXIST; 5859 } 5860 5861 if (bdev_opts != NULL && 5862 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5863 bdev_opts->reconnect_delay_sec, 5864 bdev_opts->fast_io_fail_timeout_sec)) { 5865 return -EINVAL; 5866 } 5867 5868 ctx = calloc(1, sizeof(*ctx)); 5869 if (!ctx) { 5870 return -ENOMEM; 5871 } 5872 ctx->base_name = base_name; 5873 ctx->names = names; 5874 ctx->count = count; 5875 ctx->cb_fn = cb_fn; 5876 ctx->cb_ctx = cb_ctx; 5877 ctx->trid = *trid; 5878 5879 if (bdev_opts) { 5880 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5881 } else { 5882 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5883 } 5884 5885 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5886 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5887 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5888 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5889 free(entry); 5890 break; 5891 } 5892 } 5893 } 5894 5895 if (drv_opts) { 5896 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5897 } else { 5898 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5899 } 5900 5901 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5902 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5903 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5904 ctx->drv_opts.disable_read_ana_log_page = true; 5905 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5906 5907 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5908 attach_cb = connect_attach_cb; 5909 } else { 5910 attach_cb = connect_set_failover_cb; 5911 } 5912 5913 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5914 if (ctx->probe_ctx == NULL) { 5915 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5916 free(ctx); 5917 return -ENODEV; 5918 } 5919 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5920 5921 return 0; 5922 } 5923 5924 static bool 5925 nvme_path_should_delete(struct nvme_path_id *p, const struct nvme_path_id *path_id) 5926 { 5927 if (path_id->trid.trtype != 0) { 5928 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5929 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5930 return false; 5931 } 5932 } else { 5933 if (path_id->trid.trtype != p->trid.trtype) { 5934 return false; 5935 } 5936 } 5937 } 5938 5939 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5940 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5941 return false; 5942 } 5943 } 5944 5945 if (path_id->trid.adrfam != 0) { 5946 if (path_id->trid.adrfam != p->trid.adrfam) { 5947 return false; 5948 } 5949 } 5950 5951 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 5952 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 5953 return false; 5954 } 5955 } 5956 5957 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 5958 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 5959 return false; 5960 } 5961 } 5962 5963 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 5964 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 5965 return false; 5966 } 5967 } 5968 5969 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, 
sizeof(path_id->hostid.hostsvcid))) { 5970 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 5971 return false; 5972 } 5973 } 5974 5975 return true; 5976 } 5977 5978 static int 5979 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 5980 { 5981 struct nvme_path_id *p, *t; 5982 spdk_msg_fn msg_fn; 5983 int rc = -ENXIO; 5984 5985 pthread_mutex_lock(&nvme_ctrlr->mutex); 5986 5987 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 5988 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 5989 break; 5990 } 5991 5992 if (!nvme_path_should_delete(p, path_id)) { 5993 continue; 5994 } 5995 5996 /* We are not using the specified path. */ 5997 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 5998 free(p); 5999 rc = 0; 6000 } 6001 6002 if (p == NULL || !nvme_path_should_delete(p, path_id)) { 6003 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6004 return rc; 6005 } 6006 6007 /* If we made it here, then this path is a match! Now we need to remove it. */ 6008 6009 /* This is the active path in use right now. The active path is always the first in the list. */ 6010 assert(p == nvme_ctrlr->active_path_id); 6011 6012 if (!TAILQ_NEXT(p, link)) { 6013 /* The current path is the only path. */ 6014 msg_fn = _nvme_ctrlr_destruct; 6015 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6016 } else { 6017 /* There is an alternative path. */ 6018 msg_fn = _bdev_nvme_reset_ctrlr; 6019 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6020 } 6021 6022 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6023 6024 if (rc == 0) { 6025 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6026 } else if (rc == -EALREADY) { 6027 rc = 0; 6028 } 6029 6030 return rc; 6031 } 6032 6033 int 6034 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 6035 { 6036 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6037 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6038 int rc = -ENXIO, _rc; 6039 6040 if (name == NULL || path_id == NULL) { 6041 return -EINVAL; 6042 } 6043 6044 pthread_mutex_lock(&g_bdev_nvme_mutex); 6045 6046 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6047 if (nbdev_ctrlr == NULL) { 6048 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6049 6050 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6051 return -ENODEV; 6052 } 6053 6054 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6055 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6056 if (_rc < 0 && _rc != -ENXIO) { 6057 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6058 6059 return _rc; 6060 } else if (_rc == 0) { 6061 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6062 * was deleted successfully. To remember the successful deletion, 6063 * overwrite rc only if _rc is zero. 6064 */ 6065 rc = 0; 6066 } 6067 } 6068 6069 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6070 6071 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 6072 return rc; 6073 } 6074 6075 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6076 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6077 6078 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\
	SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__);

struct discovery_entry_ctx {
	char name[128];
	struct spdk_nvme_transport_id trid;
	struct spdk_nvme_ctrlr_opts drv_opts;
	struct spdk_nvmf_discovery_log_page_entry entry;
	TAILQ_ENTRY(discovery_entry_ctx) tailq;
	struct discovery_ctx *ctx;
};

struct discovery_ctx {
	char *name;
	spdk_bdev_nvme_start_discovery_fn start_cb_fn;
	spdk_bdev_nvme_stop_discovery_fn stop_cb_fn;
	void *cb_ctx;
	struct spdk_nvme_probe_ctx *probe_ctx;
	struct spdk_nvme_detach_ctx *detach_ctx;
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_transport_id trid;
	struct discovery_entry_ctx *entry_ctx_in_use;
	struct spdk_poller *poller;
	struct spdk_nvme_ctrlr_opts drv_opts;
	struct nvme_ctrlr_opts bdev_opts;
	struct spdk_nvmf_discovery_log_page *log_page;
	TAILQ_ENTRY(discovery_ctx) tailq;
	TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs;
	TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs;
	int rc;
	bool wait_for_attach;
	uint64_t timeout_ticks;
	/* Denotes that the discovery service is being started. We're waiting
	 * for the initial connection to the discovery controller to be
	 * established and attach discovered NVM ctrlrs.
	 */
	bool initializing;
	/* Denotes if a discovery is currently in progress for this context.
	 * That includes connecting to newly discovered subsystems. Used to
	 * ensure we do not start a new discovery until an existing one is
	 * complete.
	 */
	bool in_progress;

	/* Denotes if another discovery is needed after the one in progress
	 * completes. Set when we receive an AER completion while a discovery
	 * is already in progress.
	 */
	bool pending;

	/* Signal to the discovery context poller that it should stop the
	 * discovery service, including detaching from the current discovery
	 * controller.
	 */
	bool stop;

	struct spdk_thread *calling_thread;
	uint32_t index;
	uint32_t attach_in_progress;
	char *hostnqn;

	/* Denotes if the discovery service was started by the mdns discovery.
6140 */ 6141 bool from_mdns_discovery_service; 6142 }; 6143 6144 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6145 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6146 6147 static void get_discovery_log_page(struct discovery_ctx *ctx); 6148 6149 static void 6150 free_discovery_ctx(struct discovery_ctx *ctx) 6151 { 6152 free(ctx->log_page); 6153 free(ctx->hostnqn); 6154 free(ctx->name); 6155 free(ctx); 6156 } 6157 6158 static void 6159 discovery_complete(struct discovery_ctx *ctx) 6160 { 6161 ctx->initializing = false; 6162 ctx->in_progress = false; 6163 if (ctx->pending) { 6164 ctx->pending = false; 6165 get_discovery_log_page(ctx); 6166 } 6167 } 6168 6169 static void 6170 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6171 struct spdk_nvmf_discovery_log_page_entry *entry) 6172 { 6173 char *space; 6174 6175 trid->trtype = entry->trtype; 6176 trid->adrfam = entry->adrfam; 6177 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6178 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6179 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6180 * before call to this function trid->subnqn is zeroed out, we need 6181 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6182 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6183 */ 6184 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6185 6186 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6187 * But the log page entries typically pad them with spaces, not zeroes. 6188 * So add a NULL terminator to each of these fields at the appropriate 6189 * location. 6190 */ 6191 space = strchr(trid->traddr, ' '); 6192 if (space) { 6193 *space = 0; 6194 } 6195 space = strchr(trid->trsvcid, ' '); 6196 if (space) { 6197 *space = 0; 6198 } 6199 space = strchr(trid->subnqn, ' '); 6200 if (space) { 6201 *space = 0; 6202 } 6203 } 6204 6205 static void 6206 _stop_discovery(void *_ctx) 6207 { 6208 struct discovery_ctx *ctx = _ctx; 6209 6210 if (ctx->attach_in_progress > 0) { 6211 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6212 return; 6213 } 6214 6215 ctx->stop = true; 6216 6217 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6218 struct discovery_entry_ctx *entry_ctx; 6219 struct nvme_path_id path = {}; 6220 6221 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6222 path.trid = entry_ctx->trid; 6223 bdev_nvme_delete(entry_ctx->name, &path); 6224 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6225 free(entry_ctx); 6226 } 6227 6228 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6229 struct discovery_entry_ctx *entry_ctx; 6230 6231 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6232 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6233 free(entry_ctx); 6234 } 6235 6236 free(ctx->entry_ctx_in_use); 6237 ctx->entry_ctx_in_use = NULL; 6238 } 6239 6240 static void 6241 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6242 { 6243 ctx->stop_cb_fn = cb_fn; 6244 ctx->cb_ctx = cb_ctx; 6245 6246 if (ctx->attach_in_progress > 0) { 6247 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6248 ctx->attach_in_progress); 6249 } 6250 6251 _stop_discovery(ctx); 6252 } 6253 6254 static void 6255 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6256 { 6257 struct discovery_ctx *d_ctx; 6258 struct nvme_path_id *path_id; 6259 struct spdk_nvme_transport_id trid = {}; 
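	/* The nvme_ctrlr being removed is identified by its first (active) path;
	 * each discovery context's NVM entries are matched against that trid below.
	 */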
6260 struct discovery_entry_ctx *entry_ctx, *tmp; 6261 6262 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6263 6264 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6265 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6266 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6267 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6268 continue; 6269 } 6270 6271 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6272 free(entry_ctx); 6273 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6274 trid.subnqn, trid.traddr, trid.trsvcid); 6275 6276 /* Fail discovery ctrlr to force reattach attempt */ 6277 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6278 } 6279 } 6280 } 6281 6282 static void 6283 discovery_remove_controllers(struct discovery_ctx *ctx) 6284 { 6285 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6286 struct discovery_entry_ctx *entry_ctx, *tmp; 6287 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6288 struct spdk_nvme_transport_id old_trid = {}; 6289 uint64_t numrec, i; 6290 bool found; 6291 6292 numrec = from_le64(&log_page->numrec); 6293 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6294 found = false; 6295 old_entry = &entry_ctx->entry; 6296 build_trid_from_log_page_entry(&old_trid, old_entry); 6297 for (i = 0; i < numrec; i++) { 6298 new_entry = &log_page->entries[i]; 6299 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6300 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6301 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6302 found = true; 6303 break; 6304 } 6305 } 6306 if (!found) { 6307 struct nvme_path_id path = {}; 6308 6309 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6310 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6311 6312 path.trid = entry_ctx->trid; 6313 bdev_nvme_delete(entry_ctx->name, &path); 6314 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6315 free(entry_ctx); 6316 } 6317 } 6318 free(log_page); 6319 ctx->log_page = NULL; 6320 discovery_complete(ctx); 6321 } 6322 6323 static void 6324 complete_discovery_start(struct discovery_ctx *ctx, int status) 6325 { 6326 ctx->timeout_ticks = 0; 6327 ctx->rc = status; 6328 if (ctx->start_cb_fn) { 6329 ctx->start_cb_fn(ctx->cb_ctx, status); 6330 ctx->start_cb_fn = NULL; 6331 ctx->cb_ctx = NULL; 6332 } 6333 } 6334 6335 static void 6336 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6337 { 6338 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6339 struct discovery_ctx *ctx = entry_ctx->ctx; 6340 6341 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6342 ctx->attach_in_progress--; 6343 if (ctx->attach_in_progress == 0) { 6344 complete_discovery_start(ctx, ctx->rc); 6345 if (ctx->initializing && ctx->rc != 0) { 6346 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6347 stop_discovery(ctx, NULL, ctx->cb_ctx); 6348 } else { 6349 discovery_remove_controllers(ctx); 6350 } 6351 } 6352 } 6353 6354 static struct discovery_entry_ctx * 6355 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6356 { 6357 struct discovery_entry_ctx *new_ctx; 6358 6359 new_ctx = calloc(1, sizeof(*new_ctx)); 6360 if (new_ctx == NULL) { 6361 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6362 return NULL; 6363 } 6364 6365 new_ctx->ctx = ctx; 6366 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6367 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6368 
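	/* Start from the default controller options, then carry over the discovery
	 * service's hostnqn so controllers attached through this entry use the same
	 * host identity.
	 */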
snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6369 return new_ctx; 6370 } 6371 6372 static void 6373 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6374 struct spdk_nvmf_discovery_log_page *log_page) 6375 { 6376 struct discovery_ctx *ctx = cb_arg; 6377 struct discovery_entry_ctx *entry_ctx, *tmp; 6378 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6379 uint64_t numrec, i; 6380 bool found; 6381 6382 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6383 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6384 return; 6385 } 6386 6387 ctx->log_page = log_page; 6388 assert(ctx->attach_in_progress == 0); 6389 numrec = from_le64(&log_page->numrec); 6390 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6391 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6392 free(entry_ctx); 6393 } 6394 for (i = 0; i < numrec; i++) { 6395 found = false; 6396 new_entry = &log_page->entries[i]; 6397 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6398 struct discovery_entry_ctx *new_ctx; 6399 struct spdk_nvme_transport_id trid = {}; 6400 6401 build_trid_from_log_page_entry(&trid, new_entry); 6402 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6403 if (new_ctx == NULL) { 6404 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6405 break; 6406 } 6407 6408 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6409 continue; 6410 } 6411 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6412 old_entry = &entry_ctx->entry; 6413 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6414 found = true; 6415 break; 6416 } 6417 } 6418 if (!found) { 6419 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6420 struct discovery_ctx *d_ctx; 6421 6422 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6423 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6424 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6425 sizeof(new_entry->subnqn))) { 6426 break; 6427 } 6428 } 6429 if (subnqn_ctx) { 6430 break; 6431 } 6432 } 6433 6434 new_ctx = calloc(1, sizeof(*new_ctx)); 6435 if (new_ctx == NULL) { 6436 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6437 break; 6438 } 6439 6440 new_ctx->ctx = ctx; 6441 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6442 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6443 if (subnqn_ctx) { 6444 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6445 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6446 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6447 new_ctx->name); 6448 } else { 6449 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6450 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6451 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6452 new_ctx->name); 6453 } 6454 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6455 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6456 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6457 discovery_attach_controller_done, new_ctx, 6458 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6459 if (rc == 0) { 6460 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6461 ctx->attach_in_progress++; 6462 } else { 6463 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 6464 } 6465 } 6466 } 6467 6468 if 
(ctx->attach_in_progress == 0) { 6469 discovery_remove_controllers(ctx); 6470 } 6471 } 6472 6473 static void 6474 get_discovery_log_page(struct discovery_ctx *ctx) 6475 { 6476 int rc; 6477 6478 assert(ctx->in_progress == false); 6479 ctx->in_progress = true; 6480 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6481 if (rc != 0) { 6482 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6483 } 6484 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6485 } 6486 6487 static void 6488 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6489 { 6490 struct discovery_ctx *ctx = arg; 6491 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6492 6493 if (spdk_nvme_cpl_is_error(cpl)) { 6494 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6495 return; 6496 } 6497 6498 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6499 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6500 return; 6501 } 6502 6503 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6504 if (ctx->in_progress) { 6505 ctx->pending = true; 6506 return; 6507 } 6508 6509 get_discovery_log_page(ctx); 6510 } 6511 6512 static void 6513 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6514 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6515 { 6516 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6517 struct discovery_ctx *ctx; 6518 6519 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6520 6521 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6522 ctx->probe_ctx = NULL; 6523 ctx->ctrlr = ctrlr; 6524 6525 if (ctx->rc != 0) { 6526 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6527 ctx->rc); 6528 return; 6529 } 6530 6531 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6532 } 6533 6534 static int 6535 discovery_poller(void *arg) 6536 { 6537 struct discovery_ctx *ctx = arg; 6538 struct spdk_nvme_transport_id *trid; 6539 int rc; 6540 6541 if (ctx->detach_ctx) { 6542 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6543 if (rc != -EAGAIN) { 6544 ctx->detach_ctx = NULL; 6545 ctx->ctrlr = NULL; 6546 } 6547 } else if (ctx->stop) { 6548 if (ctx->ctrlr != NULL) { 6549 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6550 if (rc == 0) { 6551 return SPDK_POLLER_BUSY; 6552 } 6553 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6554 } 6555 spdk_poller_unregister(&ctx->poller); 6556 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6557 assert(ctx->start_cb_fn == NULL); 6558 if (ctx->stop_cb_fn != NULL) { 6559 ctx->stop_cb_fn(ctx->cb_ctx); 6560 } 6561 free_discovery_ctx(ctx); 6562 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6563 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6564 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6565 assert(ctx->initializing); 6566 spdk_poller_unregister(&ctx->poller); 6567 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6568 complete_discovery_start(ctx, -ETIMEDOUT); 6569 stop_discovery(ctx, NULL, NULL); 6570 free_discovery_ctx(ctx); 6571 return SPDK_POLLER_BUSY; 6572 } 6573 6574 assert(ctx->entry_ctx_in_use == NULL); 6575 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6576 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6577 trid = &ctx->entry_ctx_in_use->trid; 6578 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6579 if (ctx->probe_ctx) { 6580 spdk_poller_unregister(&ctx->poller); 
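			/* The connect was started successfully, so re-register the poller
			 * at a 1 ms period (instead of the 1 second default cadence) until
			 * the attach completes.
			 */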
6581 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6582 } else { 6583 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6584 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6585 ctx->entry_ctx_in_use = NULL; 6586 } 6587 } else if (ctx->probe_ctx) { 6588 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6589 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6590 complete_discovery_start(ctx, -ETIMEDOUT); 6591 return SPDK_POLLER_BUSY; 6592 } 6593 6594 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6595 if (rc != -EAGAIN) { 6596 if (ctx->rc != 0) { 6597 assert(ctx->initializing); 6598 stop_discovery(ctx, NULL, ctx->cb_ctx); 6599 } else { 6600 assert(rc == 0); 6601 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6602 ctx->rc = rc; 6603 get_discovery_log_page(ctx); 6604 } 6605 } 6606 } else { 6607 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6608 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6609 complete_discovery_start(ctx, -ETIMEDOUT); 6610 /* We need to wait until all NVM ctrlrs are attached before we stop the 6611 * discovery service to make sure we don't detach a ctrlr that is still 6612 * being attached. 6613 */ 6614 if (ctx->attach_in_progress == 0) { 6615 stop_discovery(ctx, NULL, ctx->cb_ctx); 6616 return SPDK_POLLER_BUSY; 6617 } 6618 } 6619 6620 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6621 if (rc < 0) { 6622 spdk_poller_unregister(&ctx->poller); 6623 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6624 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6625 ctx->entry_ctx_in_use = NULL; 6626 6627 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6628 if (rc != 0) { 6629 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6630 ctx->ctrlr = NULL; 6631 } 6632 } 6633 } 6634 6635 return SPDK_POLLER_BUSY; 6636 } 6637 6638 static void 6639 start_discovery_poller(void *arg) 6640 { 6641 struct discovery_ctx *ctx = arg; 6642 6643 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6644 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6645 } 6646 6647 int 6648 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6649 const char *base_name, 6650 struct spdk_nvme_ctrlr_opts *drv_opts, 6651 struct nvme_ctrlr_opts *bdev_opts, 6652 uint64_t attach_timeout, 6653 bool from_mdns, 6654 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6655 { 6656 struct discovery_ctx *ctx; 6657 struct discovery_entry_ctx *discovery_entry_ctx; 6658 6659 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6660 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6661 if (strcmp(ctx->name, base_name) == 0) { 6662 return -EEXIST; 6663 } 6664 6665 if (ctx->entry_ctx_in_use != NULL) { 6666 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6667 return -EEXIST; 6668 } 6669 } 6670 6671 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6672 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6673 return -EEXIST; 6674 } 6675 } 6676 } 6677 6678 ctx = calloc(1, sizeof(*ctx)); 6679 if (ctx == NULL) { 6680 return -ENOMEM; 6681 } 6682 6683 ctx->name = strdup(base_name); 6684 if (ctx->name == NULL) { 6685 free_discovery_ctx(ctx); 6686 return -ENOMEM; 6687 } 6688 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6689 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 
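	/* Record whether this discovery service was created by mdns discovery so that
	 * config JSON generation can skip it; it is covered by the mdns discovery RPC
	 * instead.
	 */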
6690 ctx->from_mdns_discovery_service = from_mdns; 6691 ctx->bdev_opts.from_discovery_service = true; 6692 ctx->calling_thread = spdk_get_thread(); 6693 ctx->start_cb_fn = cb_fn; 6694 ctx->cb_ctx = cb_ctx; 6695 ctx->initializing = true; 6696 if (ctx->start_cb_fn) { 6697 /* We can use this when dumping json to denote if this RPC parameter 6698 * was specified or not. 6699 */ 6700 ctx->wait_for_attach = true; 6701 } 6702 if (attach_timeout != 0) { 6703 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6704 spdk_get_ticks_hz() / 1000ull; 6705 } 6706 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6707 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6708 memcpy(&ctx->trid, trid, sizeof(*trid)); 6709 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6710 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6711 if (ctx->hostnqn == NULL) { 6712 free_discovery_ctx(ctx); 6713 return -ENOMEM; 6714 } 6715 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6716 if (discovery_entry_ctx == NULL) { 6717 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6718 free_discovery_ctx(ctx); 6719 return -ENOMEM; 6720 } 6721 6722 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6723 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6724 return 0; 6725 } 6726 6727 int 6728 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6729 { 6730 struct discovery_ctx *ctx; 6731 6732 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6733 if (strcmp(name, ctx->name) == 0) { 6734 if (ctx->stop) { 6735 return -EALREADY; 6736 } 6737 /* If we're still starting the discovery service and ->rc is non-zero, we're 6738 * going to stop it as soon as we can 6739 */ 6740 if (ctx->initializing && ctx->rc != 0) { 6741 return -EALREADY; 6742 } 6743 stop_discovery(ctx, cb_fn, cb_ctx); 6744 return 0; 6745 } 6746 } 6747 6748 return -ENOENT; 6749 } 6750 6751 static int 6752 bdev_nvme_library_init(void) 6753 { 6754 g_bdev_nvme_init_thread = spdk_get_thread(); 6755 6756 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6757 bdev_nvme_destroy_poll_group_cb, 6758 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6759 6760 return 0; 6761 } 6762 6763 static void 6764 bdev_nvme_fini_destruct_ctrlrs(void) 6765 { 6766 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6767 struct nvme_ctrlr *nvme_ctrlr; 6768 6769 pthread_mutex_lock(&g_bdev_nvme_mutex); 6770 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6771 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6772 pthread_mutex_lock(&nvme_ctrlr->mutex); 6773 if (nvme_ctrlr->destruct) { 6774 /* This controller's destruction was already started 6775 * before the application started shutting down 6776 */ 6777 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6778 continue; 6779 } 6780 nvme_ctrlr->destruct = true; 6781 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6782 6783 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6784 nvme_ctrlr); 6785 } 6786 } 6787 6788 g_bdev_nvme_module_finish = true; 6789 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6790 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6791 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6792 spdk_bdev_module_fini_done(); 6793 return; 6794 } 6795 6796 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6797 } 6798 6799 static void 6800 check_discovery_fini(void *arg) 6801 { 6802 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6803 bdev_nvme_fini_destruct_ctrlrs(); 6804 } 6805 } 6806 6807 static void 6808 
bdev_nvme_library_fini(void) 6809 { 6810 struct nvme_probe_skip_entry *entry, *entry_tmp; 6811 struct discovery_ctx *ctx; 6812 6813 spdk_poller_unregister(&g_hotplug_poller); 6814 free(g_hotplug_probe_ctx); 6815 g_hotplug_probe_ctx = NULL; 6816 6817 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6818 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6819 free(entry); 6820 } 6821 6822 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6823 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6824 bdev_nvme_fini_destruct_ctrlrs(); 6825 } else { 6826 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6827 stop_discovery(ctx, check_discovery_fini, NULL); 6828 } 6829 } 6830 } 6831 6832 static void 6833 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6834 { 6835 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6836 struct spdk_bdev *bdev = bdev_io->bdev; 6837 struct spdk_dif_ctx dif_ctx; 6838 struct spdk_dif_error err_blk = {}; 6839 int rc; 6840 struct spdk_dif_ctx_init_ext_opts dif_opts; 6841 6842 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 6843 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 6844 rc = spdk_dif_ctx_init(&dif_ctx, 6845 bdev->blocklen, bdev->md_len, bdev->md_interleave, 6846 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 6847 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 6848 if (rc != 0) { 6849 SPDK_ERRLOG("Initialization of DIF context failed\n"); 6850 return; 6851 } 6852 6853 if (bdev->md_interleave) { 6854 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6855 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6856 } else { 6857 struct iovec md_iov = { 6858 .iov_base = bdev_io->u.bdev.md_buf, 6859 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 6860 }; 6861 6862 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6863 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6864 } 6865 6866 if (rc != 0) { 6867 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 6868 err_blk.err_type, err_blk.err_offset); 6869 } else { 6870 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 6871 } 6872 } 6873 6874 static void 6875 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6876 { 6877 struct nvme_bdev_io *bio = ref; 6878 6879 if (spdk_nvme_cpl_is_success(cpl)) { 6880 /* Run PI verification for read data buffer. */ 6881 bdev_nvme_verify_pi_error(bio); 6882 } 6883 6884 /* Return original completion status */ 6885 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6886 } 6887 6888 static void 6889 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6890 { 6891 struct nvme_bdev_io *bio = ref; 6892 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6893 int ret; 6894 6895 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 6896 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 6897 cpl->status.sct, cpl->status.sc); 6898 6899 /* Save completion status to use after verifying PI error. */ 6900 bio->cpl = *cpl; 6901 6902 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 6903 /* Read without PI checking to verify PI error. 
*/ 6904 ret = bdev_nvme_no_pi_readv(bio, 6905 bdev_io->u.bdev.iovs, 6906 bdev_io->u.bdev.iovcnt, 6907 bdev_io->u.bdev.md_buf, 6908 bdev_io->u.bdev.num_blocks, 6909 bdev_io->u.bdev.offset_blocks); 6910 if (ret == 0) { 6911 return; 6912 } 6913 } 6914 } 6915 6916 bdev_nvme_io_complete_nvme_status(bio, cpl); 6917 } 6918 6919 static void 6920 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6921 { 6922 struct nvme_bdev_io *bio = ref; 6923 6924 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6925 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 6926 cpl->status.sct, cpl->status.sc); 6927 /* Run PI verification for write data buffer if PI error is detected. */ 6928 bdev_nvme_verify_pi_error(bio); 6929 } 6930 6931 bdev_nvme_io_complete_nvme_status(bio, cpl); 6932 } 6933 6934 static void 6935 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6936 { 6937 struct nvme_bdev_io *bio = ref; 6938 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6939 6940 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 6941 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 6942 */ 6943 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 6944 6945 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6946 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 6947 cpl->status.sct, cpl->status.sc); 6948 /* Run PI verification for zone append data buffer if PI error is detected. */ 6949 bdev_nvme_verify_pi_error(bio); 6950 } 6951 6952 bdev_nvme_io_complete_nvme_status(bio, cpl); 6953 } 6954 6955 static void 6956 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6957 { 6958 struct nvme_bdev_io *bio = ref; 6959 6960 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6961 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 6962 cpl->status.sct, cpl->status.sc); 6963 /* Run PI verification for compare data buffer if PI error is detected. */ 6964 bdev_nvme_verify_pi_error(bio); 6965 } 6966 6967 bdev_nvme_io_complete_nvme_status(bio, cpl); 6968 } 6969 6970 static void 6971 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6972 { 6973 struct nvme_bdev_io *bio = ref; 6974 6975 /* Compare operation completion */ 6976 if (!bio->first_fused_completed) { 6977 /* Save compare result for write callback */ 6978 bio->cpl = *cpl; 6979 bio->first_fused_completed = true; 6980 return; 6981 } 6982 6983 /* Write operation completion */ 6984 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 6985 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 6986 * complete the IO with the compare operation's status. 
6987 */ 6988 if (!spdk_nvme_cpl_is_error(cpl)) { 6989 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 6990 } 6991 6992 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6993 } else { 6994 bdev_nvme_io_complete_nvme_status(bio, cpl); 6995 } 6996 } 6997 6998 static void 6999 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7000 { 7001 struct nvme_bdev_io *bio = ref; 7002 7003 bdev_nvme_io_complete_nvme_status(bio, cpl); 7004 } 7005 7006 static int 7007 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7008 { 7009 switch (desc->zt) { 7010 case SPDK_NVME_ZONE_TYPE_SEQWR: 7011 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7012 break; 7013 default: 7014 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7015 return -EIO; 7016 } 7017 7018 switch (desc->zs) { 7019 case SPDK_NVME_ZONE_STATE_EMPTY: 7020 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7021 break; 7022 case SPDK_NVME_ZONE_STATE_IOPEN: 7023 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7024 break; 7025 case SPDK_NVME_ZONE_STATE_EOPEN: 7026 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7027 break; 7028 case SPDK_NVME_ZONE_STATE_CLOSED: 7029 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7030 break; 7031 case SPDK_NVME_ZONE_STATE_RONLY: 7032 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7033 break; 7034 case SPDK_NVME_ZONE_STATE_FULL: 7035 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7036 break; 7037 case SPDK_NVME_ZONE_STATE_OFFLINE: 7038 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7039 break; 7040 default: 7041 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7042 return -EIO; 7043 } 7044 7045 info->zone_id = desc->zslba; 7046 info->write_pointer = desc->wp; 7047 info->capacity = desc->zcap; 7048 7049 return 0; 7050 } 7051 7052 static void 7053 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7054 { 7055 struct nvme_bdev_io *bio = ref; 7056 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7057 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7058 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7059 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7060 uint64_t max_zones_per_buf, i; 7061 uint32_t zone_report_bufsize; 7062 struct spdk_nvme_ns *ns; 7063 struct spdk_nvme_qpair *qpair; 7064 int ret; 7065 7066 if (spdk_nvme_cpl_is_error(cpl)) { 7067 goto out_complete_io_nvme_cpl; 7068 } 7069 7070 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7071 ret = -ENXIO; 7072 goto out_complete_io_ret; 7073 } 7074 7075 ns = bio->io_path->nvme_ns->ns; 7076 qpair = bio->io_path->qpair->qpair; 7077 7078 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7079 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7080 sizeof(bio->zone_report_buf->descs[0]); 7081 7082 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7083 ret = -EINVAL; 7084 goto out_complete_io_ret; 7085 } 7086 7087 if (!bio->zone_report_buf->nr_zones) { 7088 ret = -EINVAL; 7089 goto out_complete_io_ret; 7090 } 7091 7092 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7093 ret = fill_zone_from_report(&info[bio->handled_zones], 7094 &bio->zone_report_buf->descs[i]); 7095 if (ret) { 7096 goto out_complete_io_ret; 7097 } 7098 bio->handled_zones++; 7099 } 7100 7101 if (bio->handled_zones < zones_to_copy) { 7102 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7103 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7104 
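		/* Not all requested zones fit in one report buffer: clear the buffer
		 * and issue another Report Zones command starting at the next unhandled
		 * zone.
		 */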
7105 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7106 ret = spdk_nvme_zns_report_zones(ns, qpair, 7107 bio->zone_report_buf, zone_report_bufsize, 7108 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7109 bdev_nvme_get_zone_info_done, bio); 7110 if (!ret) { 7111 return; 7112 } else { 7113 goto out_complete_io_ret; 7114 } 7115 } 7116 7117 out_complete_io_nvme_cpl: 7118 free(bio->zone_report_buf); 7119 bio->zone_report_buf = NULL; 7120 bdev_nvme_io_complete_nvme_status(bio, cpl); 7121 return; 7122 7123 out_complete_io_ret: 7124 free(bio->zone_report_buf); 7125 bio->zone_report_buf = NULL; 7126 bdev_nvme_io_complete(bio, ret); 7127 } 7128 7129 static void 7130 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7131 { 7132 struct nvme_bdev_io *bio = ref; 7133 7134 bdev_nvme_io_complete_nvme_status(bio, cpl); 7135 } 7136 7137 static void 7138 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7139 { 7140 struct nvme_bdev_io *bio = ctx; 7141 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7142 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7143 7144 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7145 7146 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7147 } 7148 7149 static void 7150 bdev_nvme_abort_complete(void *ctx) 7151 { 7152 struct nvme_bdev_io *bio = ctx; 7153 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7154 7155 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7156 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7157 } else { 7158 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7159 } 7160 } 7161 7162 static void 7163 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7164 { 7165 struct nvme_bdev_io *bio = ref; 7166 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7167 7168 bio->cpl = *cpl; 7169 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7170 } 7171 7172 static void 7173 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7174 { 7175 struct nvme_bdev_io *bio = ref; 7176 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7177 7178 bio->cpl = *cpl; 7179 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7180 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7181 } 7182 7183 static void 7184 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7185 { 7186 struct nvme_bdev_io *bio = ref; 7187 struct iovec *iov; 7188 7189 bio->iov_offset = sgl_offset; 7190 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7191 iov = &bio->iovs[bio->iovpos]; 7192 if (bio->iov_offset < iov->iov_len) { 7193 break; 7194 } 7195 7196 bio->iov_offset -= iov->iov_len; 7197 } 7198 } 7199 7200 static int 7201 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7202 { 7203 struct nvme_bdev_io *bio = ref; 7204 struct iovec *iov; 7205 7206 assert(bio->iovpos < bio->iovcnt); 7207 7208 iov = &bio->iovs[bio->iovpos]; 7209 7210 *address = iov->iov_base; 7211 *length = iov->iov_len; 7212 7213 if (bio->iov_offset) { 7214 assert(bio->iov_offset <= iov->iov_len); 7215 *address += bio->iov_offset; 7216 *length -= bio->iov_offset; 7217 } 7218 7219 bio->iov_offset += *length; 7220 if (bio->iov_offset == iov->iov_len) { 7221 bio->iovpos++; 7222 bio->iov_offset = 0; 7223 } 7224 7225 return 0; 7226 } 7227 7228 static void 7229 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7230 { 7231 struct nvme_bdev_io *bio = ref; 7232 struct iovec *iov; 7233 7234 bio->fused_iov_offset = sgl_offset; 
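	/* Advance through the fused (write) iovec array until the requested SGL offset
	 * falls within the current iovec; the remainder becomes the intra-iovec offset.
	 */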
7235 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7236 iov = &bio->fused_iovs[bio->fused_iovpos]; 7237 if (bio->fused_iov_offset < iov->iov_len) { 7238 break; 7239 } 7240 7241 bio->fused_iov_offset -= iov->iov_len; 7242 } 7243 } 7244 7245 static int 7246 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7247 { 7248 struct nvme_bdev_io *bio = ref; 7249 struct iovec *iov; 7250 7251 assert(bio->fused_iovpos < bio->fused_iovcnt); 7252 7253 iov = &bio->fused_iovs[bio->fused_iovpos]; 7254 7255 *address = iov->iov_base; 7256 *length = iov->iov_len; 7257 7258 if (bio->fused_iov_offset) { 7259 assert(bio->fused_iov_offset <= iov->iov_len); 7260 *address += bio->fused_iov_offset; 7261 *length -= bio->fused_iov_offset; 7262 } 7263 7264 bio->fused_iov_offset += *length; 7265 if (bio->fused_iov_offset == iov->iov_len) { 7266 bio->fused_iovpos++; 7267 bio->fused_iov_offset = 0; 7268 } 7269 7270 return 0; 7271 } 7272 7273 static int 7274 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7275 void *md, uint64_t lba_count, uint64_t lba) 7276 { 7277 int rc; 7278 7279 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7280 lba_count, lba); 7281 7282 bio->iovs = iov; 7283 bio->iovcnt = iovcnt; 7284 bio->iovpos = 0; 7285 bio->iov_offset = 0; 7286 7287 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7288 bio->io_path->qpair->qpair, 7289 lba, lba_count, 7290 bdev_nvme_no_pi_readv_done, bio, 0, 7291 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7292 md, 0, 0); 7293 7294 if (rc != 0 && rc != -ENOMEM) { 7295 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7296 } 7297 return rc; 7298 } 7299 7300 static int 7301 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7302 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7303 struct spdk_memory_domain *domain, void *domain_ctx) 7304 { 7305 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7306 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7307 int rc; 7308 7309 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7310 lba_count, lba); 7311 7312 bio->iovs = iov; 7313 bio->iovcnt = iovcnt; 7314 bio->iovpos = 0; 7315 bio->iov_offset = 0; 7316 7317 if (domain != NULL) { 7318 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, cdw13); 7319 bio->ext_opts.memory_domain = domain; 7320 bio->ext_opts.memory_domain_ctx = domain_ctx; 7321 bio->ext_opts.io_flags = flags; 7322 bio->ext_opts.metadata = md; 7323 7324 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7325 bdev_nvme_readv_done, bio, 7326 bdev_nvme_queued_reset_sgl, 7327 bdev_nvme_queued_next_sge, 7328 &bio->ext_opts); 7329 } else if (iovcnt == 1) { 7330 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7331 md, lba, lba_count, bdev_nvme_readv_done, 7332 bio, flags, 0, 0); 7333 } else { 7334 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7335 bdev_nvme_readv_done, bio, flags, 7336 bdev_nvme_queued_reset_sgl, 7337 bdev_nvme_queued_next_sge, md, 0, 0); 7338 } 7339 7340 if (rc != 0 && rc != -ENOMEM) { 7341 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7342 } 7343 return rc; 7344 } 7345 7346 static int 7347 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7348 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7349 struct spdk_memory_domain *domain, void *domain_ctx) 7350 { 7351 struct spdk_nvme_ns *ns = 
bio->io_path->nvme_ns->ns; 7352 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7353 int rc; 7354 7355 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7356 lba_count, lba); 7357 7358 bio->iovs = iov; 7359 bio->iovcnt = iovcnt; 7360 bio->iovpos = 0; 7361 bio->iov_offset = 0; 7362 7363 if (domain != NULL) { 7364 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, cdw13); 7365 bio->ext_opts.memory_domain = domain; 7366 bio->ext_opts.memory_domain_ctx = domain_ctx; 7367 bio->ext_opts.io_flags = flags; 7368 bio->ext_opts.metadata = md; 7369 7370 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7371 bdev_nvme_writev_done, bio, 7372 bdev_nvme_queued_reset_sgl, 7373 bdev_nvme_queued_next_sge, 7374 &bio->ext_opts); 7375 } else if (iovcnt == 1) { 7376 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7377 md, lba, lba_count, bdev_nvme_writev_done, 7378 bio, flags, 0, 0); 7379 } else { 7380 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7381 bdev_nvme_writev_done, bio, flags, 7382 bdev_nvme_queued_reset_sgl, 7383 bdev_nvme_queued_next_sge, md, 0, 0); 7384 } 7385 7386 if (rc != 0 && rc != -ENOMEM) { 7387 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7388 } 7389 return rc; 7390 } 7391 7392 static int 7393 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7394 void *md, uint64_t lba_count, uint64_t zslba, 7395 uint32_t flags) 7396 { 7397 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7398 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7399 int rc; 7400 7401 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7402 lba_count, zslba); 7403 7404 bio->iovs = iov; 7405 bio->iovcnt = iovcnt; 7406 bio->iovpos = 0; 7407 bio->iov_offset = 0; 7408 7409 if (iovcnt == 1) { 7410 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7411 lba_count, 7412 bdev_nvme_zone_appendv_done, bio, 7413 flags, 7414 0, 0); 7415 } else { 7416 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7417 bdev_nvme_zone_appendv_done, bio, flags, 7418 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7419 md, 0, 0); 7420 } 7421 7422 if (rc != 0 && rc != -ENOMEM) { 7423 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7424 } 7425 return rc; 7426 } 7427 7428 static int 7429 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7430 void *md, uint64_t lba_count, uint64_t lba, 7431 uint32_t flags) 7432 { 7433 int rc; 7434 7435 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7436 lba_count, lba); 7437 7438 bio->iovs = iov; 7439 bio->iovcnt = iovcnt; 7440 bio->iovpos = 0; 7441 bio->iov_offset = 0; 7442 7443 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7444 bio->io_path->qpair->qpair, 7445 lba, lba_count, 7446 bdev_nvme_comparev_done, bio, flags, 7447 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7448 md, 0, 0); 7449 7450 if (rc != 0 && rc != -ENOMEM) { 7451 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7452 } 7453 return rc; 7454 } 7455 7456 static int 7457 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7458 struct iovec *write_iov, int write_iovcnt, 7459 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7460 { 7461 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7462 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7463 struct spdk_bdev_io *bdev_io = 
spdk_bdev_io_from_ctx(bio); 7464 int rc; 7465 7466 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7467 lba_count, lba); 7468 7469 bio->iovs = cmp_iov; 7470 bio->iovcnt = cmp_iovcnt; 7471 bio->iovpos = 0; 7472 bio->iov_offset = 0; 7473 bio->fused_iovs = write_iov; 7474 bio->fused_iovcnt = write_iovcnt; 7475 bio->fused_iovpos = 0; 7476 bio->fused_iov_offset = 0; 7477 7478 if (bdev_io->num_retries == 0) { 7479 bio->first_fused_submitted = false; 7480 bio->first_fused_completed = false; 7481 } 7482 7483 if (!bio->first_fused_submitted) { 7484 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7485 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7486 7487 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7488 bdev_nvme_comparev_and_writev_done, bio, flags, 7489 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7490 if (rc == 0) { 7491 bio->first_fused_submitted = true; 7492 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7493 } else { 7494 if (rc != -ENOMEM) { 7495 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7496 } 7497 return rc; 7498 } 7499 } 7500 7501 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7502 7503 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7504 bdev_nvme_comparev_and_writev_done, bio, flags, 7505 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7506 if (rc != 0 && rc != -ENOMEM) { 7507 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7508 rc = 0; 7509 } 7510 7511 return rc; 7512 } 7513 7514 static int 7515 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7516 { 7517 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7518 struct spdk_nvme_dsm_range *range; 7519 uint64_t offset, remaining; 7520 uint64_t num_ranges_u64; 7521 uint16_t num_ranges; 7522 int rc; 7523 7524 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7525 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7526 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7527 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7528 return -EINVAL; 7529 } 7530 num_ranges = (uint16_t)num_ranges_u64; 7531 7532 offset = offset_blocks; 7533 remaining = num_blocks; 7534 range = &dsm_ranges[0]; 7535 7536 /* Fill max-size ranges until the remaining blocks fit into one range */ 7537 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7538 range->attributes.raw = 0; 7539 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7540 range->starting_lba = offset; 7541 7542 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7543 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7544 range++; 7545 } 7546 7547 /* Final range describes the remaining blocks */ 7548 range->attributes.raw = 0; 7549 range->length = remaining; 7550 range->starting_lba = offset; 7551 7552 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7553 bio->io_path->qpair->qpair, 7554 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7555 dsm_ranges, num_ranges, 7556 bdev_nvme_queued_done, bio); 7557 7558 return rc; 7559 } 7560 7561 static int 7562 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7563 { 7564 if (num_blocks > UINT16_MAX + 1) { 7565 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7566 return -EINVAL; 7567 } 7568 7569 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7570 bio->io_path->qpair->qpair, 7571 offset_blocks, num_blocks, 7572 
bdev_nvme_queued_done, bio, 7573 0); 7574 } 7575 7576 static int 7577 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7578 struct spdk_bdev_zone_info *info) 7579 { 7580 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7581 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7582 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7583 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7584 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7585 7586 if (zone_id % zone_size != 0) { 7587 return -EINVAL; 7588 } 7589 7590 if (num_zones > total_zones || !num_zones) { 7591 return -EINVAL; 7592 } 7593 7594 assert(!bio->zone_report_buf); 7595 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7596 if (!bio->zone_report_buf) { 7597 return -ENOMEM; 7598 } 7599 7600 bio->handled_zones = 0; 7601 7602 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7603 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7604 bdev_nvme_get_zone_info_done, bio); 7605 } 7606 7607 static int 7608 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 7609 enum spdk_bdev_zone_action action) 7610 { 7611 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7612 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7613 7614 switch (action) { 7615 case SPDK_BDEV_ZONE_CLOSE: 7616 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 7617 bdev_nvme_zone_management_done, bio); 7618 case SPDK_BDEV_ZONE_FINISH: 7619 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 7620 bdev_nvme_zone_management_done, bio); 7621 case SPDK_BDEV_ZONE_OPEN: 7622 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 7623 bdev_nvme_zone_management_done, bio); 7624 case SPDK_BDEV_ZONE_RESET: 7625 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 7626 bdev_nvme_zone_management_done, bio); 7627 case SPDK_BDEV_ZONE_OFFLINE: 7628 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 7629 bdev_nvme_zone_management_done, bio); 7630 default: 7631 return -EINVAL; 7632 } 7633 } 7634 7635 static void 7636 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7637 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 7638 { 7639 struct nvme_io_path *io_path; 7640 struct nvme_ctrlr *nvme_ctrlr; 7641 uint32_t max_xfer_size; 7642 int rc = -ENXIO; 7643 7644 /* Choose the first ctrlr which is not failed. */ 7645 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7646 nvme_ctrlr = io_path->qpair->ctrlr; 7647 7648 /* We should skip any unavailable nvme_ctrlr rather than checking 7649 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
7650 */ 7651 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 7652 continue; 7653 } 7654 7655 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 7656 7657 if (nbytes > max_xfer_size) { 7658 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7659 rc = -EINVAL; 7660 goto err; 7661 } 7662 7663 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 7664 bdev_nvme_admin_passthru_done, bio); 7665 if (rc == 0) { 7666 return; 7667 } 7668 } 7669 7670 err: 7671 bdev_nvme_admin_complete(bio, rc); 7672 } 7673 7674 static int 7675 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 7676 void *buf, size_t nbytes) 7677 { 7678 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7679 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7680 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7681 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7682 7683 if (nbytes > max_xfer_size) { 7684 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7685 return -EINVAL; 7686 } 7687 7688 /* 7689 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7690 * so fill it out automatically. 7691 */ 7692 cmd->nsid = spdk_nvme_ns_get_id(ns); 7693 7694 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 7695 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 7696 } 7697 7698 static int 7699 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 7700 void *buf, size_t nbytes, void *md_buf, size_t md_len) 7701 { 7702 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7703 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7704 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 7705 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7706 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7707 7708 if (nbytes > max_xfer_size) { 7709 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7710 return -EINVAL; 7711 } 7712 7713 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 7714 SPDK_ERRLOG("invalid meta data buffer size\n"); 7715 return -EINVAL; 7716 } 7717 7718 /* 7719 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7720 * so fill it out automatically. 7721 */ 7722 cmd->nsid = spdk_nvme_ns_get_id(ns); 7723 7724 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 7725 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 7726 } 7727 7728 static void 7729 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7730 struct nvme_bdev_io *bio_to_abort) 7731 { 7732 struct nvme_io_path *io_path; 7733 int rc = 0; 7734 7735 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 7736 if (rc == 0) { 7737 bdev_nvme_admin_complete(bio, 0); 7738 return; 7739 } 7740 7741 io_path = bio_to_abort->io_path; 7742 if (io_path != NULL) { 7743 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 7744 io_path->qpair->qpair, 7745 bio_to_abort, 7746 bdev_nvme_abort_done, bio); 7747 } else { 7748 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7749 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 7750 NULL, 7751 bio_to_abort, 7752 bdev_nvme_abort_done, bio); 7753 7754 if (rc != -ENOENT) { 7755 break; 7756 } 7757 } 7758 } 7759 7760 if (rc != 0) { 7761 /* If no command was found or there was any error, complete the abort 7762 * request with failure. 
static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}

static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
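/* For reference, the object emitted by bdev_nvme_opts_config_json() above looks
 * roughly like the following. This is an illustrative sketch only; the values
 * track g_opts and are shown here with a few of the compiled-in defaults:
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": {
 *       "action_on_timeout": "none",
 *       "timeout_us": 0,
 *       "keep_alive_timeout_ms": 10000,
 *       "transport_retry_count": 4,
 *       "bdev_retry_count": 3,
 *       ...
 *     }
 *   }
 */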
static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
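/* Emit the module's runtime configuration as a sequence of RPC objects:
 * the global options first, then one bdev_nvme_attach_controller entry per
 * directly attached controller, the discovery and mDNS discovery services,
 * and finally the hotplug settings.
 */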
static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump the hotplug configuration last to give all NVMe bdevs a chance to be
	 * constructed before the hotplug poller is enabled.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}
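/* Register the bdev_nvme trace points and relate them to the NVMe driver's
 * PCIe/TCP submit and complete trace points, so that tooling which consumes
 * SPDK traces (e.g. the spdk_trace app) can follow a single I/O across both
 * the bdev and driver layers.
 */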
SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}