/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
};
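/*
 * Note (illustrative, not part of the upstream sources): these defaults are
 * normally overridden at runtime through the bdev_nvme_set_options RPC before
 * any controller is attached. Assuming the JSON-RPC parameter names mirror the
 * spdk_bdev_nvme_opts fields, a request might look like:
 *
 *	{ "method": "bdev_nvme_set_options",
 *	  "params": { "keep_alive_timeout_ms": 10000, "transport_retry_count": 4,
 *	              "bdev_retry_count": 3 } }
 *
 * Treat the parameter names above as an example only; consult the RPC
 * documentation of the SPDK version in use for the authoritative list.
 */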

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}
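/*
 * Orientation note (descriptive comment only, based on the lookups above): an
 * nvme_bdev_ctrlr groups every nvme_ctrlr that connects to the same subsystem,
 * so one entry in g_nvme_bdev_ctrlrs can hold several transport paths. The
 * helpers above select a member controller either by its transport ID
 * (nvme_bdev_ctrlr_get_ctrlr) or by its controller ID from the identify data
 * (nvme_bdev_ctrlr_get_ctrlr_by_id), while nvme_bdev_ctrlr_get_by_name picks
 * the group itself.
 */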
static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}
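/*
 * Illustrative sketch (not used by the driver itself): the helpers above give
 * ordered iteration over a controller's active namespaces, backed by the
 * nvme_ns_tree red-black tree keyed on namespace ID. A caller holding the
 * appropriate reference could walk them like this:
 *
 *	struct nvme_ns *ns;
 *
 *	for (ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); ns != NULL;
 *	     ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, ns)) {
 *		// ns->id increases monotonically across iterations
 *	}
 */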
static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}
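/*
 * Orientation note (descriptive comment only): each nvme_bdev channel built
 * above holds one nvme_io_path per nvme_ns of the bdev, and every io_path pins
 * an nvme_ctrlr I/O channel via spdk_get_io_channel(nvme_ns->ctrlr), which in
 * turn owns the nvme_qpair used for submissions:
 *
 *	nvme_bdev_channel -> nvme_io_path -> nvme_qpair -> spdk_nvme_qpair
 *
 * Tearing the bdev channel down releases those controller channel references
 * in _bdev_nvme_delete_io_path(), which is also why the io_path itself is only
 * freed together with its qpair.
 */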
767 */ 768 static inline void 769 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 770 const struct spdk_nvme_cpl *cpl) 771 { 772 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 773 (uintptr_t)bdev_io); 774 if (cpl) { 775 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 776 } else { 777 spdk_bdev_io_complete(bdev_io, status); 778 } 779 } 780 781 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 782 783 static void 784 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 785 { 786 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 787 788 bdev_nvme_abort_retry_ios(nbdev_ch); 789 _bdev_nvme_delete_io_paths(nbdev_ch); 790 } 791 792 static inline bool 793 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 794 { 795 switch (io_type) { 796 case SPDK_BDEV_IO_TYPE_RESET: 797 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 798 case SPDK_BDEV_IO_TYPE_ABORT: 799 return true; 800 default: 801 break; 802 } 803 804 return false; 805 } 806 807 static inline bool 808 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 809 { 810 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 811 return false; 812 } 813 814 switch (nvme_ns->ana_state) { 815 case SPDK_NVME_ANA_OPTIMIZED_STATE: 816 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 817 return true; 818 default: 819 break; 820 } 821 822 return false; 823 } 824 825 static inline bool 826 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 827 { 828 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 829 return false; 830 } 831 832 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 833 SPDK_NVME_QPAIR_FAILURE_NONE)) { 834 return false; 835 } 836 837 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 838 return false; 839 } 840 841 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_qpair->ctrlr->ctrlr) != 842 SPDK_NVME_QPAIR_FAILURE_NONE) { 843 return false; 844 } 845 846 return true; 847 } 848 849 static inline bool 850 nvme_io_path_is_available(struct nvme_io_path *io_path) 851 { 852 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 853 return false; 854 } 855 856 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 857 return false; 858 } 859 860 return true; 861 } 862 863 static inline bool 864 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 865 { 866 if (nvme_ctrlr->destruct) { 867 return true; 868 } 869 870 if (nvme_ctrlr->fast_io_fail_timedout) { 871 return true; 872 } 873 874 if (nvme_ctrlr->resetting) { 875 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 876 return false; 877 } else { 878 return true; 879 } 880 } 881 882 if (nvme_ctrlr->reconnect_is_delayed) { 883 return false; 884 } 885 886 if (nvme_ctrlr->disabled) { 887 return true; 888 } 889 890 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 891 return true; 892 } else { 893 return false; 894 } 895 } 896 897 static bool 898 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 899 { 900 if (nvme_ctrlr->destruct) { 901 return false; 902 } 903 904 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 905 return false; 906 } 907 908 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 909 return false; 910 } 911 912 if (nvme_ctrlr->disabled) { 913 return false; 914 } 915 916 return true; 917 } 918 919 /* Simulate circular linked list. 
*/ 920 static inline struct nvme_io_path * 921 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 922 { 923 struct nvme_io_path *next_path; 924 925 if (prev_path != NULL) { 926 next_path = STAILQ_NEXT(prev_path, stailq); 927 if (next_path != NULL) { 928 return next_path; 929 } 930 } 931 932 return STAILQ_FIRST(&nbdev_ch->io_path_list); 933 } 934 935 static struct nvme_io_path * 936 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 937 { 938 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 939 940 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 941 942 io_path = start; 943 do { 944 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 945 !io_path->nvme_ns->ana_state_updating)) { 946 switch (io_path->nvme_ns->ana_state) { 947 case SPDK_NVME_ANA_OPTIMIZED_STATE: 948 nbdev_ch->current_io_path = io_path; 949 return io_path; 950 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 951 if (non_optimized == NULL) { 952 non_optimized = io_path; 953 } 954 break; 955 default: 956 break; 957 } 958 } 959 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 960 } while (io_path != start); 961 962 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 963 /* We come here only if there is no optimized path. Cache even non_optimized 964 * path for load balance across multiple non_optimized paths. 965 */ 966 nbdev_ch->current_io_path = non_optimized; 967 } 968 969 return non_optimized; 970 } 971 972 static struct nvme_io_path * 973 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 974 { 975 struct nvme_io_path *io_path; 976 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 977 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 978 uint32_t num_outstanding_reqs; 979 980 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 981 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 982 /* The device is currently resetting. 
*/ 983 continue; 984 } 985 986 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 987 continue; 988 } 989 990 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 991 switch (io_path->nvme_ns->ana_state) { 992 case SPDK_NVME_ANA_OPTIMIZED_STATE: 993 if (num_outstanding_reqs < opt_min_qd) { 994 opt_min_qd = num_outstanding_reqs; 995 optimized = io_path; 996 } 997 break; 998 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 999 if (num_outstanding_reqs < non_opt_min_qd) { 1000 non_opt_min_qd = num_outstanding_reqs; 1001 non_optimized = io_path; 1002 } 1003 break; 1004 default: 1005 break; 1006 } 1007 } 1008 1009 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1010 if (optimized != NULL) { 1011 return optimized; 1012 } 1013 1014 return non_optimized; 1015 } 1016 1017 static inline struct nvme_io_path * 1018 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1019 { 1020 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1021 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1022 return nbdev_ch->current_io_path; 1023 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1024 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1025 return nbdev_ch->current_io_path; 1026 } 1027 nbdev_ch->rr_counter = 0; 1028 } 1029 } 1030 1031 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1032 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1033 return _bdev_nvme_find_io_path(nbdev_ch); 1034 } else { 1035 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1036 } 1037 } 1038 1039 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1040 * or false otherwise. 1041 * 1042 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1043 * is likely to be non-accessible now but may become accessible. 1044 * 1045 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1046 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1047 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1048 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
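/*
 * Summary of the path selection above (descriptive comment only): with the
 * ACTIVE_PASSIVE policy the cached current_io_path is reused until it becomes
 * unavailable. With ACTIVE_ACTIVE, the ROUND_ROBIN selector switches paths
 * every rr_min_io submissions via _bdev_nvme_find_io_path(), while the
 * QUEUE_DEPTH selector re-evaluates the outstanding request count on every
 * submission in _bdev_nvme_find_io_path_min_qd() and therefore never caches a
 * path. In all cases ANA optimized paths are preferred over non-optimized ones.
 */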
1049 */ 1050 static bool 1051 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1052 { 1053 struct nvme_io_path *io_path; 1054 1055 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1056 if (io_path->nvme_ns->ana_transition_timedout) { 1057 continue; 1058 } 1059 1060 if (nvme_qpair_is_connected(io_path->qpair) || 1061 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1062 return true; 1063 } 1064 } 1065 1066 return false; 1067 } 1068 1069 static void 1070 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1071 { 1072 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1073 struct spdk_io_channel *ch; 1074 1075 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1076 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1077 } else { 1078 ch = spdk_io_channel_from_ctx(nbdev_ch); 1079 bdev_nvme_submit_request(ch, bdev_io); 1080 } 1081 } 1082 1083 static int 1084 bdev_nvme_retry_ios(void *arg) 1085 { 1086 struct nvme_bdev_channel *nbdev_ch = arg; 1087 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1088 struct nvme_bdev_io *bio; 1089 uint64_t now, delay_us; 1090 1091 now = spdk_get_ticks(); 1092 1093 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1094 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1095 if (bio->retry_ticks > now) { 1096 break; 1097 } 1098 1099 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1100 1101 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1102 } 1103 1104 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1105 1106 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1107 if (bdev_io != NULL) { 1108 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1109 1110 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1111 1112 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1113 delay_us); 1114 } 1115 1116 return SPDK_POLLER_BUSY; 1117 } 1118 1119 static void 1120 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1121 struct nvme_bdev_io *bio, uint64_t delay_ms) 1122 { 1123 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1124 struct spdk_bdev_io *tmp_bdev_io; 1125 struct nvme_bdev_io *tmp_bio; 1126 1127 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1128 1129 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1130 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1131 1132 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1133 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1134 module_link); 1135 return; 1136 } 1137 } 1138 1139 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1140 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1141 1142 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1143 1144 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1145 delay_ms * 1000ULL); 1146 } 1147 1148 static void 1149 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1150 { 1151 struct spdk_bdev_io *bdev_io, *tmp_io; 1152 1153 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1154 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1155 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1156 } 1157 1158 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1159 } 1160 1161 static int 1162 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1163 struct nvme_bdev_io *bio_to_abort) 1164 { 1165 struct spdk_bdev_io *bdev_io_to_abort; 1166 1167 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1168 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1169 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1170 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1171 return 0; 1172 } 1173 } 1174 1175 return -ENOENT; 1176 } 1177 1178 static void 1179 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1180 { 1181 struct nvme_bdev *nbdev; 1182 uint16_t sct, sc; 1183 1184 assert(spdk_nvme_cpl_is_error(cpl)); 1185 1186 nbdev = bdev_io->bdev->ctxt; 1187 1188 if (nbdev->err_stat == NULL) { 1189 return; 1190 } 1191 1192 sct = cpl->status.sct; 1193 sc = cpl->status.sc; 1194 1195 pthread_mutex_lock(&nbdev->mutex); 1196 1197 nbdev->err_stat->status_type[sct]++; 1198 switch (sct) { 1199 case SPDK_NVME_SCT_GENERIC: 1200 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1201 case SPDK_NVME_SCT_MEDIA_ERROR: 1202 case SPDK_NVME_SCT_PATH: 1203 nbdev->err_stat->status[sct][sc]++; 1204 break; 1205 default: 1206 break; 1207 } 1208 1209 pthread_mutex_unlock(&nbdev->mutex); 1210 } 1211 1212 static inline void 1213 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1214 { 1215 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1216 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1217 uint32_t blocklen = bdev_io->bdev->blocklen; 1218 struct spdk_bdev_io_stat *stat; 1219 uint64_t tsc_diff; 1220 1221 if (bio->io_path->stat == NULL) { 1222 return; 1223 } 1224 1225 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1226 stat = bio->io_path->stat; 1227 1228 switch (bdev_io->type) { 1229 case SPDK_BDEV_IO_TYPE_READ: 1230 stat->bytes_read += num_blocks * blocklen; 1231 stat->num_read_ops++; 1232 stat->read_latency_ticks += tsc_diff; 1233 if (stat->max_read_latency_ticks < tsc_diff) { 1234 stat->max_read_latency_ticks = tsc_diff; 1235 } 1236 if (stat->min_read_latency_ticks > tsc_diff) { 1237 stat->min_read_latency_ticks = tsc_diff; 1238 } 1239 break; 1240 case SPDK_BDEV_IO_TYPE_WRITE: 1241 stat->bytes_written += num_blocks * blocklen; 1242 stat->num_write_ops++; 1243 stat->write_latency_ticks += tsc_diff; 1244 if (stat->max_write_latency_ticks < tsc_diff) { 1245 stat->max_write_latency_ticks = tsc_diff; 1246 } 1247 if (stat->min_write_latency_ticks > tsc_diff) { 1248 stat->min_write_latency_ticks = tsc_diff; 1249 } 1250 break; 1251 case SPDK_BDEV_IO_TYPE_UNMAP: 1252 stat->bytes_unmapped += num_blocks * blocklen; 1253 stat->num_unmap_ops++; 1254 stat->unmap_latency_ticks += tsc_diff; 1255 if (stat->max_unmap_latency_ticks < tsc_diff) { 1256 
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}
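/*
 * Worked example for the retry delay above (values are hypothetical): when a
 * failed command reports a non-zero Command Retry Delay (CRD) index, the delay
 * comes from the controller's CRDT array in the identify data, which is
 * specified in units of 100 milliseconds. For cpl->status.crd == 1 and
 * cdata->crdt[1] == 5 the retry is scheduled after 5 * 100 = 500 ms; with
 * crd == 0 the I/O is requeued without additional delay.
 */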
1362 */ 1363 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1364 1365 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1366 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1367 goto complete; 1368 } 1369 1370 /* At this point we don't know whether the sequence was successfully executed or not, so we 1371 * cannot retry the IO */ 1372 if (bdev_io->u.bdev.accel_sequence != NULL) { 1373 goto complete; 1374 } 1375 1376 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1377 1378 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1379 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1380 return; 1381 } 1382 1383 complete: 1384 bio->retry_count = 0; 1385 bio->submit_tsc = 0; 1386 bdev_io->u.bdev.accel_sequence = NULL; 1387 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1388 } 1389 1390 static inline void 1391 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1392 { 1393 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1394 struct nvme_bdev_channel *nbdev_ch; 1395 enum spdk_bdev_io_status io_status; 1396 1397 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1398 1399 switch (rc) { 1400 case 0: 1401 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1402 break; 1403 case -ENOMEM: 1404 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1405 break; 1406 case -ENXIO: 1407 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1408 1409 bdev_nvme_clear_current_io_path(nbdev_ch); 1410 bio->io_path = NULL; 1411 1412 if (any_io_path_may_become_available(nbdev_ch)) { 1413 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1414 return; 1415 } 1416 1417 /* fallthrough */ 1418 default: 1419 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1420 bdev_io->u.bdev.accel_sequence = NULL; 1421 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1422 break; 1423 } 1424 1425 bio->retry_count = 0; 1426 bio->submit_tsc = 0; 1427 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1428 } 1429 1430 static inline void 1431 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1432 { 1433 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1434 enum spdk_bdev_io_status io_status; 1435 1436 switch (rc) { 1437 case 0: 1438 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1439 break; 1440 case -ENOMEM: 1441 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1442 break; 1443 case -ENXIO: 1444 /* fallthrough */ 1445 default: 1446 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1447 break; 1448 } 1449 1450 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1451 } 1452 1453 static void 1454 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1455 { 1456 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1457 1458 pthread_mutex_lock(&nvme_ctrlr->mutex); 1459 1460 assert(nvme_ctrlr->io_path_cache_clearing == true); 1461 nvme_ctrlr->io_path_cache_clearing = false; 1462 1463 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1464 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1465 return; 1466 } 1467 1468 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1469 1470 nvme_ctrlr_unregister(nvme_ctrlr); 1471 } 1472 1473 static void 1474 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1475 { 1476 struct nvme_io_path *io_path; 1477 1478 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1479 if (io_path->nbdev_ch == NULL) { 1480 continue; 1481 } 1482 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1483 } 1484 } 1485 1486 static void 1487 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter 
static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);
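/*
 * Note on the disconnect callback below (descriptive comment only):
 * bdev_nvme_disconnected_qpair_cb() distinguishes three situations. If the
 * ctrlr_channel is in a full reset sequence (reset_iter != NULL), a pending
 * connect_poller means the qpair failed to connect and the reset sequence is
 * aborted; otherwise the disconnect simply lets the sequence continue to the
 * next channel. If there is no reset in progress, an unexpected disconnect
 * triggers bdev_nvme_failover_ctrlr(). If the ctrlr_channel is already gone,
 * only the nvme_qpair itself remains to be deleted.
 */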
static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, the ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}
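/*
 * Note on admin queue polling below (descriptive comment only):
 * bdev_nvme_poll_adminq() runs at nvme_adminq_poll_period_us. When
 * spdk_nvme_ctrlr_process_admin_completions() reports a failure, either the
 * registered disconnected_cb is invoked (restoring the normal poll period) or
 * a failover/reset of the controller is started. During an intentional
 * disconnect the period is temporarily dropped to 0 by
 * bdev_nvme_change_adminq_poll_period() so the disconnect completes quickly.
 */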
static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr, false);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}
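/*
 * Flow summary for bdev_nvme_create_qpair() above (descriptive comment only):
 * the qpair is allocated with create_only and async_mode set, added to the
 * poll group so bdev_nvme_poll() services its completions, and only then
 * connected with spdk_nvme_ctrlr_connect_io_qpair(). If any step fails, the
 * qpair is freed and nvme_qpair->qpair stays NULL, leaving the associated I/O
 * paths unavailable until a later reset or reconnect recreates it.
 */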
spdk_for_each_channel_continue(i, 0); 1790 } 1791 1792 /* This function marks the current trid as failed by storing the current ticks 1793 * and then sets the next trid to the active trid within a controller if exists. 1794 * 1795 * The purpose of the boolean return value is to request the caller to disconnect 1796 * the current trid now to try connecting the next trid. 1797 */ 1798 static bool 1799 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1800 { 1801 struct nvme_path_id *path_id, *next_path; 1802 int rc __attribute__((unused)); 1803 1804 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1805 assert(path_id); 1806 assert(path_id == nvme_ctrlr->active_path_id); 1807 next_path = TAILQ_NEXT(path_id, link); 1808 1809 /* Update the last failed time. It means the trid is failed if its last 1810 * failed time is non-zero. 1811 */ 1812 path_id->last_failed_tsc = spdk_get_ticks(); 1813 1814 if (next_path == NULL) { 1815 /* There is no alternate trid within a controller. */ 1816 return false; 1817 } 1818 1819 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1820 /* Connect is not retried in a controller reset sequence. Connecting 1821 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1822 */ 1823 return false; 1824 } 1825 1826 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1827 1828 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1829 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1830 1831 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1832 nvme_ctrlr->active_path_id = next_path; 1833 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1834 assert(rc == 0); 1835 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1836 if (!remove) { 1837 /** Shuffle the old trid to the end of the list and use the new one. 1838 * Allows for round robin through multiple connections. 1839 */ 1840 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1841 } else { 1842 free(path_id); 1843 } 1844 1845 if (start || next_path->last_failed_tsc == 0) { 1846 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1847 * or used yet. Try the next trid now. 1848 */ 1849 return true; 1850 } 1851 1852 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1853 nvme_ctrlr->opts.reconnect_delay_sec) { 1854 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1855 return true; 1856 } 1857 1858 /* The next trid will be tried after reconnect_delay_sec seconds. 
*/ 1859 return false; 1860 } 1861 1862 static bool 1863 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1864 { 1865 int32_t elapsed; 1866 1867 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1868 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1869 return false; 1870 } 1871 1872 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1873 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1874 return true; 1875 } else { 1876 return false; 1877 } 1878 } 1879 1880 static bool 1881 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1882 { 1883 uint32_t elapsed; 1884 1885 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1886 return false; 1887 } 1888 1889 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1890 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1891 return true; 1892 } else { 1893 return false; 1894 } 1895 } 1896 1897 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1898 1899 static void 1900 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1901 { 1902 int rc; 1903 1904 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1905 if (rc != 0) { 1906 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1907 * fail the reset sequence immediately. 1908 */ 1909 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1910 return; 1911 } 1912 1913 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1914 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1915 */ 1916 assert(nvme_ctrlr->disconnected_cb == NULL); 1917 nvme_ctrlr->disconnected_cb = cb_fn; 1918 1919 /* During disconnection, reduce the period to poll adminq more often. */ 1920 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1921 } 1922 1923 enum bdev_nvme_op_after_reset { 1924 OP_NONE, 1925 OP_COMPLETE_PENDING_DESTRUCT, 1926 OP_DESTRUCT, 1927 OP_DELAYED_RECONNECT, 1928 OP_FAILOVER, 1929 }; 1930 1931 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1932 1933 static _bdev_nvme_op_after_reset 1934 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1935 { 1936 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1937 /* Complete pending destruct after reset completes. 
*/ 1938 return OP_COMPLETE_PENDING_DESTRUCT; 1939 } else if (nvme_ctrlr->pending_failover) { 1940 nvme_ctrlr->pending_failover = false; 1941 nvme_ctrlr->reset_start_tsc = 0; 1942 return OP_FAILOVER; 1943 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1944 nvme_ctrlr->reset_start_tsc = 0; 1945 return OP_NONE; 1946 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1947 return OP_DESTRUCT; 1948 } else { 1949 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1950 nvme_ctrlr->fast_io_fail_timedout = true; 1951 } 1952 return OP_DELAYED_RECONNECT; 1953 } 1954 } 1955 1956 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1957 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1958 1959 static int 1960 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1961 { 1962 struct nvme_ctrlr *nvme_ctrlr = ctx; 1963 1964 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1965 pthread_mutex_lock(&nvme_ctrlr->mutex); 1966 1967 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1968 1969 if (!nvme_ctrlr->reconnect_is_delayed) { 1970 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1971 return SPDK_POLLER_BUSY; 1972 } 1973 1974 nvme_ctrlr->reconnect_is_delayed = false; 1975 1976 if (nvme_ctrlr->destruct) { 1977 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1978 return SPDK_POLLER_BUSY; 1979 } 1980 1981 assert(nvme_ctrlr->resetting == false); 1982 nvme_ctrlr->resetting = true; 1983 1984 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1985 1986 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1987 1988 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1989 return SPDK_POLLER_BUSY; 1990 } 1991 1992 static void 1993 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1994 { 1995 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1996 1997 assert(nvme_ctrlr->reconnect_is_delayed == false); 1998 nvme_ctrlr->reconnect_is_delayed = true; 1999 2000 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2001 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2002 nvme_ctrlr, 2003 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2004 } 2005 2006 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2007 2008 static void 2009 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2010 { 2011 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2012 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2013 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2014 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2015 enum bdev_nvme_op_after_reset op_after_reset; 2016 2017 assert(nvme_ctrlr->thread == spdk_get_thread()); 2018 2019 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2020 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2021 2022 if (!success) { 2023 SPDK_ERRLOG("Resetting controller failed.\n"); 2024 } else { 2025 SPDK_NOTICELOG("Resetting controller successful.\n"); 2026 } 2027 2028 pthread_mutex_lock(&nvme_ctrlr->mutex); 2029 nvme_ctrlr->resetting = false; 2030 nvme_ctrlr->dont_retry = false; 2031 nvme_ctrlr->in_failover = false; 2032 2033 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2034 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2035 2036 if (ctrlr_op_cb_fn) { 2037 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2038 } 2039 2040 switch (op_after_reset) { 2041 case OP_COMPLETE_PENDING_DESTRUCT: 2042 nvme_ctrlr_unregister(nvme_ctrlr); 2043 break; 2044 case OP_DESTRUCT: 2045 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2046 remove_discovery_entry(nvme_ctrlr); 2047 break; 2048 case OP_DELAYED_RECONNECT: 2049 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2050 break; 2051 case OP_FAILOVER: 2052 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 2053 break; 2054 default: 2055 break; 2056 } 2057 } 2058 2059 static void 2060 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2061 { 2062 pthread_mutex_lock(&nvme_ctrlr->mutex); 2063 if (!success) { 2064 /* Connecting the active trid failed. Set the next alternate trid to the 2065 * active trid if it exists. 2066 */ 2067 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2068 /* The next alternate trid exists and is ready to try. Try it now. */ 2069 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2070 2071 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2072 return; 2073 } 2074 2075 /* We came here if there is no alternate trid or if the next trid exists but 2076 * is not ready to try. We will try the active trid after reconnect_delay_sec 2077 * seconds if it is non-zero or at the next reset call otherwise. 2078 */ 2079 } else { 2080 /* Connecting the active trid succeeded. Clear the last failed time because it 2081 * means the trid is failed if its last failed time is non-zero. 2082 */ 2083 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2084 } 2085 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2086 2087 /* Make sure we clear any pending resets before returning. */ 2088 spdk_for_each_channel(nvme_ctrlr, 2089 bdev_nvme_complete_pending_resets, 2090 success ? NULL : (void *)0x1, 2091 _bdev_nvme_reset_ctrlr_complete); 2092 } 2093 2094 static void 2095 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2096 { 2097 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2098 2099 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2100 } 2101 2102 static void 2103 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2104 { 2105 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2106 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2107 struct nvme_qpair *nvme_qpair; 2108 2109 nvme_qpair = ctrlr_ch->qpair; 2110 assert(nvme_qpair != NULL); 2111 2112 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2113 2114 if (nvme_qpair->qpair != NULL) { 2115 if (nvme_qpair->ctrlr->dont_retry) { 2116 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2117 } 2118 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2119 2120 /* The current full reset sequence will move to the next 2121 * ctrlr_channel after the qpair is actually disconnected. 2122 */ 2123 assert(ctrlr_ch->reset_iter == NULL); 2124 ctrlr_ch->reset_iter = i; 2125 } else { 2126 spdk_for_each_channel_continue(i, 0); 2127 } 2128 } 2129 2130 static void 2131 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2132 { 2133 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2134 2135 if (status == 0) { 2136 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2137 } else { 2138 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
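 * bdev_nvme_reset_create_qpairs_failed() then finishes the reset sequence as failed.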
*/ 2139 spdk_for_each_channel(nvme_ctrlr, 2140 bdev_nvme_reset_destroy_qpair, 2141 NULL, 2142 bdev_nvme_reset_create_qpairs_failed); 2143 } 2144 } 2145 2146 static int 2147 bdev_nvme_reset_check_qpair_connected(void *ctx) 2148 { 2149 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2150 2151 if (ctrlr_ch->reset_iter == NULL) { 2152 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2153 assert(ctrlr_ch->connect_poller == NULL); 2154 assert(ctrlr_ch->qpair->qpair == NULL); 2155 return SPDK_POLLER_BUSY; 2156 } 2157 2158 assert(ctrlr_ch->qpair->qpair != NULL); 2159 2160 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2161 return SPDK_POLLER_BUSY; 2162 } 2163 2164 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2165 2166 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2167 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2168 ctrlr_ch->reset_iter = NULL; 2169 2170 return SPDK_POLLER_BUSY; 2171 } 2172 2173 static void 2174 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2175 { 2176 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2177 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2178 int rc; 2179 2180 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2181 if (rc == 0) { 2182 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2183 ctrlr_ch, 0); 2184 2185 /* The current full reset sequence will move to the next 2186 * ctrlr_channel after the qpair is actually connected. 2187 */ 2188 assert(ctrlr_ch->reset_iter == NULL); 2189 ctrlr_ch->reset_iter = i; 2190 } else { 2191 spdk_for_each_channel_continue(i, rc); 2192 } 2193 } 2194 2195 static int 2196 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2197 { 2198 struct nvme_ctrlr *nvme_ctrlr = arg; 2199 int rc = -ETIMEDOUT; 2200 2201 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2202 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2203 if (rc == -EAGAIN) { 2204 return SPDK_POLLER_BUSY; 2205 } 2206 } 2207 2208 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2209 if (rc == 0) { 2210 /* Recreate all of the I/O queue pairs */ 2211 spdk_for_each_channel(nvme_ctrlr, 2212 bdev_nvme_reset_create_qpair, 2213 NULL, 2214 bdev_nvme_reset_create_qpairs_done); 2215 } else { 2216 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2217 } 2218 return SPDK_POLLER_BUSY; 2219 } 2220 2221 static void 2222 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2223 { 2224 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2225 2226 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2227 assert(nvme_ctrlr->reset_detach_poller == NULL); 2228 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2229 nvme_ctrlr, 0); 2230 } 2231 2232 static void 2233 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2234 { 2235 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2236 2237 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2238 assert(status == 0); 2239 2240 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2241 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2242 } else { 2243 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2244 } 2245 } 2246 2247 static void 2248 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2249 { 2250 spdk_for_each_channel(nvme_ctrlr, 2251 bdev_nvme_reset_destroy_qpair, 2252 NULL, 2253 bdev_nvme_reset_destroy_qpair_done); 
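/* bdev_nvme_reset_destroy_qpair_done() reconnects directly for non-fabrics (PCIe)
 * controllers and disconnects the controller first for fabrics controllers.
 */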
2254 } 2255 2256 static void 2257 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2258 { 2259 struct nvme_ctrlr *nvme_ctrlr = ctx; 2260 2261 assert(nvme_ctrlr->resetting == true); 2262 assert(nvme_ctrlr->thread == spdk_get_thread()); 2263 2264 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2265 2266 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2267 2268 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2269 } 2270 2271 static void 2272 _bdev_nvme_reset_ctrlr(void *ctx) 2273 { 2274 struct nvme_ctrlr *nvme_ctrlr = ctx; 2275 2276 assert(nvme_ctrlr->resetting == true); 2277 assert(nvme_ctrlr->thread == spdk_get_thread()); 2278 2279 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2280 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2281 } else { 2282 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2283 } 2284 } 2285 2286 static int 2287 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2288 { 2289 spdk_msg_fn msg_fn; 2290 2291 pthread_mutex_lock(&nvme_ctrlr->mutex); 2292 if (nvme_ctrlr->destruct) { 2293 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2294 return -ENXIO; 2295 } 2296 2297 if (nvme_ctrlr->resetting) { 2298 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2299 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2300 return -EBUSY; 2301 } 2302 2303 if (nvme_ctrlr->disabled) { 2304 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2305 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2306 return -EALREADY; 2307 } 2308 2309 nvme_ctrlr->resetting = true; 2310 nvme_ctrlr->dont_retry = true; 2311 2312 if (nvme_ctrlr->reconnect_is_delayed) { 2313 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2314 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2315 nvme_ctrlr->reconnect_is_delayed = false; 2316 } else { 2317 msg_fn = _bdev_nvme_reset_ctrlr; 2318 assert(nvme_ctrlr->reset_start_tsc == 0); 2319 } 2320 2321 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2322 2323 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2324 2325 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2326 return 0; 2327 } 2328 2329 static int 2330 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2331 { 2332 pthread_mutex_lock(&nvme_ctrlr->mutex); 2333 if (nvme_ctrlr->destruct) { 2334 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2335 return -ENXIO; 2336 } 2337 2338 if (nvme_ctrlr->resetting) { 2339 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2340 return -EBUSY; 2341 } 2342 2343 if (!nvme_ctrlr->disabled) { 2344 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2345 return -EALREADY; 2346 } 2347 2348 nvme_ctrlr->disabled = false; 2349 nvme_ctrlr->resetting = true; 2350 2351 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2352 2353 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2354 2355 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2356 return 0; 2357 } 2358 2359 static void 2360 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2361 { 2362 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2363 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2364 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2365 enum bdev_nvme_op_after_reset op_after_disable; 2366 2367 assert(nvme_ctrlr->thread == spdk_get_thread()); 2368 2369 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2370 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2371 2372 pthread_mutex_lock(&nvme_ctrlr->mutex); 2373 2374 nvme_ctrlr->resetting = false; 2375 nvme_ctrlr->dont_retry = false; 2376 2377 op_after_disable = 
bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2378 2379 nvme_ctrlr->disabled = true; 2380 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2381 2382 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2383 2384 if (ctrlr_op_cb_fn) { 2385 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2386 } 2387 2388 switch (op_after_disable) { 2389 case OP_COMPLETE_PENDING_DESTRUCT: 2390 nvme_ctrlr_unregister(nvme_ctrlr); 2391 break; 2392 default: 2393 break; 2394 } 2395 2396 } 2397 2398 static void 2399 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2400 { 2401 /* Make sure we clear any pending resets before returning. */ 2402 spdk_for_each_channel(nvme_ctrlr, 2403 bdev_nvme_complete_pending_resets, 2404 NULL, 2405 _bdev_nvme_disable_ctrlr_complete); 2406 } 2407 2408 static void 2409 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2410 { 2411 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2412 2413 assert(status == 0); 2414 2415 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2416 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2417 } else { 2418 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2419 } 2420 } 2421 2422 static void 2423 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2424 { 2425 spdk_for_each_channel(nvme_ctrlr, 2426 bdev_nvme_reset_destroy_qpair, 2427 NULL, 2428 bdev_nvme_disable_destroy_qpairs_done); 2429 } 2430 2431 static void 2432 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2433 { 2434 struct nvme_ctrlr *nvme_ctrlr = ctx; 2435 2436 assert(nvme_ctrlr->resetting == true); 2437 assert(nvme_ctrlr->thread == spdk_get_thread()); 2438 2439 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2440 2441 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2442 } 2443 2444 static void 2445 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2446 { 2447 struct nvme_ctrlr *nvme_ctrlr = ctx; 2448 2449 assert(nvme_ctrlr->resetting == true); 2450 assert(nvme_ctrlr->thread == spdk_get_thread()); 2451 2452 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2453 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2454 } else { 2455 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2456 } 2457 } 2458 2459 static int 2460 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2461 { 2462 spdk_msg_fn msg_fn; 2463 2464 pthread_mutex_lock(&nvme_ctrlr->mutex); 2465 if (nvme_ctrlr->destruct) { 2466 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2467 return -ENXIO; 2468 } 2469 2470 if (nvme_ctrlr->resetting) { 2471 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2472 return -EBUSY; 2473 } 2474 2475 if (nvme_ctrlr->disabled) { 2476 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2477 return -EALREADY; 2478 } 2479 2480 nvme_ctrlr->resetting = true; 2481 nvme_ctrlr->dont_retry = true; 2482 2483 if (nvme_ctrlr->reconnect_is_delayed) { 2484 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2485 nvme_ctrlr->reconnect_is_delayed = false; 2486 } else { 2487 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2488 } 2489 2490 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2491 2492 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2493 2494 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2495 return 0; 2496 } 2497 2498 static int 2499 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2500 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2501 { 2502 int rc; 2503 2504 switch (op) { 2505 case NVME_CTRLR_OP_RESET: 2506 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2507 break; 2508 case 
NVME_CTRLR_OP_ENABLE: 2509 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2510 break; 2511 case NVME_CTRLR_OP_DISABLE: 2512 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2513 break; 2514 default: 2515 rc = -EINVAL; 2516 break; 2517 } 2518 2519 if (rc == 0) { 2520 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2521 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2522 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2523 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2524 } 2525 return rc; 2526 } 2527 2528 struct nvme_ctrlr_op_rpc_ctx { 2529 struct nvme_ctrlr *nvme_ctrlr; 2530 struct spdk_thread *orig_thread; 2531 enum nvme_ctrlr_op op; 2532 int rc; 2533 bdev_nvme_ctrlr_op_cb cb_fn; 2534 void *cb_arg; 2535 }; 2536 2537 static void 2538 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2539 { 2540 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2541 2542 assert(ctx != NULL); 2543 assert(ctx->cb_fn != NULL); 2544 2545 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2546 2547 free(ctx); 2548 } 2549 2550 static void 2551 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2552 { 2553 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2554 2555 ctx->rc = rc; 2556 2557 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2558 } 2559 2560 void 2561 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2562 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2563 { 2564 struct nvme_ctrlr_op_rpc_ctx *ctx; 2565 int rc; 2566 2567 assert(cb_fn != NULL); 2568 2569 ctx = calloc(1, sizeof(*ctx)); 2570 if (ctx == NULL) { 2571 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2572 cb_fn(cb_arg, -ENOMEM); 2573 return; 2574 } 2575 2576 ctx->orig_thread = spdk_get_thread(); 2577 ctx->cb_fn = cb_fn; 2578 ctx->cb_arg = cb_arg; 2579 2580 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2581 if (rc == 0) { 2582 return; 2583 } else if (rc == -EALREADY) { 2584 rc = 0; 2585 } 2586 2587 nvme_ctrlr_op_rpc_complete(ctx, rc); 2588 } 2589 2590 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2591 2592 static void 2593 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2594 { 2595 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2596 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2597 int rc; 2598 2599 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2600 ctx->nvme_ctrlr = NULL; 2601 2602 if (ctx->rc != 0) { 2603 goto complete; 2604 } 2605 2606 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2607 if (next_nvme_ctrlr == NULL) { 2608 goto complete; 2609 } 2610 2611 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2612 if (rc == 0) { 2613 ctx->nvme_ctrlr = next_nvme_ctrlr; 2614 return; 2615 } else if (rc == -EALREADY) { 2616 ctx->nvme_ctrlr = next_nvme_ctrlr; 2617 rc = 0; 2618 } 2619 2620 ctx->rc = rc; 2621 2622 complete: 2623 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2624 free(ctx); 2625 } 2626 2627 static void 2628 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2629 { 2630 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2631 2632 ctx->rc = rc; 2633 2634 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2635 } 2636 2637 void 2638 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2639 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2640 { 2641 struct nvme_ctrlr_op_rpc_ctx *ctx; 2642 struct nvme_ctrlr *nvme_ctrlr; 2643 int rc; 2644 2645 assert(cb_fn != NULL); 2646 2647 ctx = calloc(1, sizeof(*ctx)); 2648 if (ctx == NULL) { 2649 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2650 cb_fn(cb_arg, -ENOMEM); 2651 return; 2652 } 2653 2654 
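/* Remember the RPC caller's thread; each continuation and the final completion are
 * sent back to it via spdk_thread_send_msg() (see nvme_bdev_ctrlr_op_rpc_continue()).
 */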
ctx->orig_thread = spdk_get_thread(); 2655 ctx->op = op; 2656 ctx->cb_fn = cb_fn; 2657 ctx->cb_arg = cb_arg; 2658 2659 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2660 assert(nvme_ctrlr != NULL); 2661 2662 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2663 if (rc == 0) { 2664 ctx->nvme_ctrlr = nvme_ctrlr; 2665 return; 2666 } else if (rc == -EALREADY) { 2667 ctx->nvme_ctrlr = nvme_ctrlr; 2668 rc = 0; 2669 } 2670 2671 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2672 } 2673 2674 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2675 2676 static void 2677 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2678 { 2679 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2680 enum spdk_bdev_io_status io_status; 2681 2682 if (bio->cpl.cdw0 == 0) { 2683 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2684 } else { 2685 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2686 } 2687 2688 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2689 } 2690 2691 static void 2692 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2693 { 2694 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2695 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2696 2697 bdev_nvme_abort_retry_ios(nbdev_ch); 2698 2699 spdk_for_each_channel_continue(i, 0); 2700 } 2701 2702 static void 2703 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2704 { 2705 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2706 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2707 2708 /* Abort all queued I/Os for retry. */ 2709 spdk_for_each_channel(nbdev, 2710 bdev_nvme_abort_bdev_channel, 2711 bio, 2712 _bdev_nvme_reset_io_complete); 2713 } 2714 2715 static void 2716 _bdev_nvme_reset_io_continue(void *ctx) 2717 { 2718 struct nvme_bdev_io *bio = ctx; 2719 struct nvme_io_path *prev_io_path, *next_io_path; 2720 int rc; 2721 2722 prev_io_path = bio->io_path; 2723 bio->io_path = NULL; 2724 2725 if (bio->cpl.cdw0 != 0) { 2726 goto complete; 2727 } 2728 2729 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2730 if (next_io_path == NULL) { 2731 goto complete; 2732 } 2733 2734 rc = _bdev_nvme_reset_io(next_io_path, bio); 2735 if (rc == 0) { 2736 return; 2737 } 2738 2739 bio->cpl.cdw0 = 1; 2740 2741 complete: 2742 bdev_nvme_reset_io_complete(bio); 2743 } 2744 2745 static void 2746 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2747 { 2748 struct nvme_bdev_io *bio = cb_arg; 2749 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2750 2751 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2752 2753 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2754 } 2755 2756 static int 2757 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2758 { 2759 struct nvme_ctrlr_channel *ctrlr_ch; 2760 struct spdk_bdev_io *bdev_io; 2761 int rc; 2762 2763 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2764 bdev_nvme_reset_io_continue, bio); 2765 if (rc == 0) { 2766 assert(bio->io_path == NULL); 2767 bio->io_path = io_path; 2768 } else if (rc == -EBUSY) { 2769 ctrlr_ch = io_path->qpair->ctrlr_ch; 2770 assert(ctrlr_ch != NULL); 2771 /* 2772 * Reset call is queued only if it is from the app framework. This is on purpose so that 2773 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2774 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
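 * Queued resets are drained by bdev_nvme_complete_pending_resets() once the
 * in-progress reset sequence finishes (see bdev_nvme_reset_ctrlr_complete()).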
2775 */ 2776 bdev_io = spdk_bdev_io_from_ctx(bio); 2777 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2778 rc = 0; 2779 } 2780 2781 return rc; 2782 } 2783 2784 static void 2785 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2786 { 2787 struct nvme_io_path *io_path; 2788 int rc; 2789 2790 bio->cpl.cdw0 = 0; 2791 2792 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2793 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2794 assert(io_path != NULL); 2795 2796 rc = _bdev_nvme_reset_io(io_path, bio); 2797 if (rc != 0) { 2798 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2799 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2800 } 2801 } 2802 2803 static int 2804 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2805 { 2806 if (nvme_ctrlr->destruct) { 2807 /* Don't bother resetting if the controller is in the process of being destructed. */ 2808 return -ENXIO; 2809 } 2810 2811 if (nvme_ctrlr->resetting) { 2812 if (!nvme_ctrlr->in_failover) { 2813 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2814 2815 /* Defer failover until reset completes. */ 2816 nvme_ctrlr->pending_failover = true; 2817 return -EINPROGRESS; 2818 } else { 2819 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2820 return -EBUSY; 2821 } 2822 } 2823 2824 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2825 2826 if (nvme_ctrlr->reconnect_is_delayed) { 2827 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2828 2829 /* We rely on the next reconnect for the failover. */ 2830 return -EALREADY; 2831 } 2832 2833 if (nvme_ctrlr->disabled) { 2834 SPDK_NOTICELOG("Controller is disabled.\n"); 2835 2836 /* We rely on the enablement for the failover. 
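 * Note that bdev_nvme_failover_ctrlr() maps -EALREADY to success for its caller.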
*/ 2837 return -EALREADY; 2838 } 2839 2840 nvme_ctrlr->resetting = true; 2841 nvme_ctrlr->in_failover = true; 2842 2843 assert(nvme_ctrlr->reset_start_tsc == 0); 2844 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2845 2846 return 0; 2847 } 2848 2849 static int 2850 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2851 { 2852 int rc; 2853 2854 pthread_mutex_lock(&nvme_ctrlr->mutex); 2855 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, remove); 2856 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2857 2858 if (rc == 0) { 2859 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2860 } else if (rc == -EALREADY) { 2861 rc = 0; 2862 } 2863 2864 return rc; 2865 } 2866 2867 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2868 uint64_t num_blocks); 2869 2870 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2871 uint64_t num_blocks); 2872 2873 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2874 uint64_t src_offset_blocks, 2875 uint64_t num_blocks); 2876 2877 static void 2878 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2879 bool success) 2880 { 2881 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2882 struct spdk_bdev *bdev = bdev_io->bdev; 2883 int ret; 2884 2885 if (!success) { 2886 ret = -EINVAL; 2887 goto exit; 2888 } 2889 2890 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2891 ret = -ENXIO; 2892 goto exit; 2893 } 2894 2895 ret = bdev_nvme_readv(bio, 2896 bdev_io->u.bdev.iovs, 2897 bdev_io->u.bdev.iovcnt, 2898 bdev_io->u.bdev.md_buf, 2899 bdev_io->u.bdev.num_blocks, 2900 bdev_io->u.bdev.offset_blocks, 2901 bdev->dif_check_flags, 2902 bdev_io->u.bdev.memory_domain, 2903 bdev_io->u.bdev.memory_domain_ctx, 2904 bdev_io->u.bdev.accel_sequence); 2905 2906 exit: 2907 if (spdk_unlikely(ret != 0)) { 2908 bdev_nvme_io_complete(bio, ret); 2909 } 2910 } 2911 2912 static inline void 2913 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2914 { 2915 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2916 struct spdk_bdev *bdev = bdev_io->bdev; 2917 struct nvme_bdev_io *nbdev_io_to_abort; 2918 int rc = 0; 2919 2920 switch (bdev_io->type) { 2921 case SPDK_BDEV_IO_TYPE_READ: 2922 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2923 rc = bdev_nvme_readv(nbdev_io, 2924 bdev_io->u.bdev.iovs, 2925 bdev_io->u.bdev.iovcnt, 2926 bdev_io->u.bdev.md_buf, 2927 bdev_io->u.bdev.num_blocks, 2928 bdev_io->u.bdev.offset_blocks, 2929 bdev->dif_check_flags, 2930 bdev_io->u.bdev.memory_domain, 2931 bdev_io->u.bdev.memory_domain_ctx, 2932 bdev_io->u.bdev.accel_sequence); 2933 } else { 2934 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2935 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2936 rc = 0; 2937 } 2938 break; 2939 case SPDK_BDEV_IO_TYPE_WRITE: 2940 rc = bdev_nvme_writev(nbdev_io, 2941 bdev_io->u.bdev.iovs, 2942 bdev_io->u.bdev.iovcnt, 2943 bdev_io->u.bdev.md_buf, 2944 bdev_io->u.bdev.num_blocks, 2945 bdev_io->u.bdev.offset_blocks, 2946 bdev->dif_check_flags, 2947 bdev_io->u.bdev.memory_domain, 2948 bdev_io->u.bdev.memory_domain_ctx, 2949 bdev_io->u.bdev.accel_sequence); 2950 break; 2951 case SPDK_BDEV_IO_TYPE_COMPARE: 2952 rc = bdev_nvme_comparev(nbdev_io, 2953 bdev_io->u.bdev.iovs, 2954 bdev_io->u.bdev.iovcnt, 2955 bdev_io->u.bdev.md_buf, 2956 bdev_io->u.bdev.num_blocks, 2957 bdev_io->u.bdev.offset_blocks, 2958 
bdev->dif_check_flags); 2959 break; 2960 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2961 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2962 bdev_io->u.bdev.iovs, 2963 bdev_io->u.bdev.iovcnt, 2964 bdev_io->u.bdev.fused_iovs, 2965 bdev_io->u.bdev.fused_iovcnt, 2966 bdev_io->u.bdev.md_buf, 2967 bdev_io->u.bdev.num_blocks, 2968 bdev_io->u.bdev.offset_blocks, 2969 bdev->dif_check_flags); 2970 break; 2971 case SPDK_BDEV_IO_TYPE_UNMAP: 2972 rc = bdev_nvme_unmap(nbdev_io, 2973 bdev_io->u.bdev.offset_blocks, 2974 bdev_io->u.bdev.num_blocks); 2975 break; 2976 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2977 rc = bdev_nvme_write_zeroes(nbdev_io, 2978 bdev_io->u.bdev.offset_blocks, 2979 bdev_io->u.bdev.num_blocks); 2980 break; 2981 case SPDK_BDEV_IO_TYPE_RESET: 2982 nbdev_io->io_path = NULL; 2983 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2984 return; 2985 2986 case SPDK_BDEV_IO_TYPE_FLUSH: 2987 bdev_nvme_io_complete(nbdev_io, 0); 2988 return; 2989 2990 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2991 rc = bdev_nvme_zone_appendv(nbdev_io, 2992 bdev_io->u.bdev.iovs, 2993 bdev_io->u.bdev.iovcnt, 2994 bdev_io->u.bdev.md_buf, 2995 bdev_io->u.bdev.num_blocks, 2996 bdev_io->u.bdev.offset_blocks, 2997 bdev->dif_check_flags); 2998 break; 2999 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3000 rc = bdev_nvme_get_zone_info(nbdev_io, 3001 bdev_io->u.zone_mgmt.zone_id, 3002 bdev_io->u.zone_mgmt.num_zones, 3003 bdev_io->u.zone_mgmt.buf); 3004 break; 3005 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3006 rc = bdev_nvme_zone_management(nbdev_io, 3007 bdev_io->u.zone_mgmt.zone_id, 3008 bdev_io->u.zone_mgmt.zone_action); 3009 break; 3010 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3011 nbdev_io->io_path = NULL; 3012 bdev_nvme_admin_passthru(nbdev_ch, 3013 nbdev_io, 3014 &bdev_io->u.nvme_passthru.cmd, 3015 bdev_io->u.nvme_passthru.buf, 3016 bdev_io->u.nvme_passthru.nbytes); 3017 return; 3018 3019 case SPDK_BDEV_IO_TYPE_NVME_IO: 3020 rc = bdev_nvme_io_passthru(nbdev_io, 3021 &bdev_io->u.nvme_passthru.cmd, 3022 bdev_io->u.nvme_passthru.buf, 3023 bdev_io->u.nvme_passthru.nbytes); 3024 break; 3025 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3026 rc = bdev_nvme_io_passthru_md(nbdev_io, 3027 &bdev_io->u.nvme_passthru.cmd, 3028 bdev_io->u.nvme_passthru.buf, 3029 bdev_io->u.nvme_passthru.nbytes, 3030 bdev_io->u.nvme_passthru.md_buf, 3031 bdev_io->u.nvme_passthru.md_len); 3032 break; 3033 case SPDK_BDEV_IO_TYPE_ABORT: 3034 nbdev_io->io_path = NULL; 3035 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3036 bdev_nvme_abort(nbdev_ch, 3037 nbdev_io, 3038 nbdev_io_to_abort); 3039 return; 3040 3041 case SPDK_BDEV_IO_TYPE_COPY: 3042 rc = bdev_nvme_copy(nbdev_io, 3043 bdev_io->u.bdev.offset_blocks, 3044 bdev_io->u.bdev.copy.src_offset_blocks, 3045 bdev_io->u.bdev.num_blocks); 3046 break; 3047 default: 3048 rc = -EINVAL; 3049 break; 3050 } 3051 3052 if (spdk_unlikely(rc != 0)) { 3053 bdev_nvme_io_complete(nbdev_io, rc); 3054 } 3055 } 3056 3057 static void 3058 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3059 { 3060 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3061 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3062 3063 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3064 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3065 } else { 3066 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3067 * We need to update submit_tsc here. 
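 * Refresh it to the time of this resubmission so timing is measured from the
 * latest attempt rather than the original submission.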
3068 */ 3069 nbdev_io->submit_tsc = spdk_get_ticks(); 3070 } 3071 3072 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3073 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3074 if (spdk_unlikely(!nbdev_io->io_path)) { 3075 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3076 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3077 return; 3078 } 3079 3080 /* Admin commands do not use the optimal I/O path. 3081 * Simply fall through even if it is not found. 3082 */ 3083 } 3084 3085 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3086 } 3087 3088 static bool 3089 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3090 { 3091 struct nvme_bdev *nbdev = ctx; 3092 struct nvme_ns *nvme_ns; 3093 struct spdk_nvme_ns *ns; 3094 struct spdk_nvme_ctrlr *ctrlr; 3095 const struct spdk_nvme_ctrlr_data *cdata; 3096 3097 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3098 assert(nvme_ns != NULL); 3099 ns = nvme_ns->ns; 3100 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3101 3102 switch (io_type) { 3103 case SPDK_BDEV_IO_TYPE_READ: 3104 case SPDK_BDEV_IO_TYPE_WRITE: 3105 case SPDK_BDEV_IO_TYPE_RESET: 3106 case SPDK_BDEV_IO_TYPE_FLUSH: 3107 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3108 case SPDK_BDEV_IO_TYPE_NVME_IO: 3109 case SPDK_BDEV_IO_TYPE_ABORT: 3110 return true; 3111 3112 case SPDK_BDEV_IO_TYPE_COMPARE: 3113 return spdk_nvme_ns_supports_compare(ns); 3114 3115 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3116 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3117 3118 case SPDK_BDEV_IO_TYPE_UNMAP: 3119 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3120 return cdata->oncs.dsm; 3121 3122 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3123 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3124 return cdata->oncs.write_zeroes; 3125 3126 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3127 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3128 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3129 return true; 3130 } 3131 return false; 3132 3133 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3134 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3135 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3136 3137 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3138 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3139 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3140 3141 case SPDK_BDEV_IO_TYPE_COPY: 3142 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3143 return cdata->oncs.copy; 3144 3145 default: 3146 return false; 3147 } 3148 } 3149 3150 static int 3151 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3152 { 3153 struct nvme_qpair *nvme_qpair; 3154 struct spdk_io_channel *pg_ch; 3155 int rc; 3156 3157 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3158 if (!nvme_qpair) { 3159 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3160 return -1; 3161 } 3162 3163 TAILQ_INIT(&nvme_qpair->io_path_list); 3164 3165 nvme_qpair->ctrlr = nvme_ctrlr; 3166 nvme_qpair->ctrlr_ch = ctrlr_ch; 3167 3168 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3169 if (!pg_ch) { 3170 free(nvme_qpair); 3171 return -1; 3172 } 3173 3174 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3175 3176 #ifdef SPDK_CONFIG_VTUNE 3177 nvme_qpair->group->collect_spin_stat = true; 3178 #else 3179 nvme_qpair->group->collect_spin_stat = false; 3180 #endif 3181 3182 if (!nvme_ctrlr->disabled) { 3183 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3184 * be created when it's enabled. 
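 * (bdev_nvme_enable_ctrlr() kicks bdev_nvme_reconnect_ctrlr_now(), whose reconnect
 * poller recreates the I/O qpairs via bdev_nvme_reset_create_qpair().)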
3185 */ 3186 rc = bdev_nvme_create_qpair(nvme_qpair); 3187 if (rc != 0) { 3188 /* nvme_ctrlr can't create IO qpair if connection is down. 3189 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3190 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3191 * submitted IO will be queued until IO qpair is successfully created. 3192 * 3193 * Hence, if both are satisfied, ignore the failure. 3194 */ 3195 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3196 spdk_put_io_channel(pg_ch); 3197 free(nvme_qpair); 3198 return rc; 3199 } 3200 } 3201 } 3202 3203 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3204 3205 ctrlr_ch->qpair = nvme_qpair; 3206 3207 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3208 nvme_qpair->ctrlr->ref++; 3209 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3210 3211 return 0; 3212 } 3213 3214 static int 3215 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3216 { 3217 struct nvme_ctrlr *nvme_ctrlr = io_device; 3218 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3219 3220 TAILQ_INIT(&ctrlr_ch->pending_resets); 3221 3222 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3223 } 3224 3225 static void 3226 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3227 { 3228 struct nvme_io_path *io_path, *next; 3229 3230 assert(nvme_qpair->group != NULL); 3231 3232 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3233 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3234 nvme_io_path_free(io_path); 3235 } 3236 3237 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3238 3239 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3240 3241 nvme_ctrlr_release(nvme_qpair->ctrlr); 3242 3243 free(nvme_qpair); 3244 } 3245 3246 static void 3247 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3248 { 3249 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3250 struct nvme_qpair *nvme_qpair; 3251 3252 nvme_qpair = ctrlr_ch->qpair; 3253 assert(nvme_qpair != NULL); 3254 3255 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3256 3257 if (nvme_qpair->qpair != NULL) { 3258 if (ctrlr_ch->reset_iter == NULL) { 3259 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3260 } else { 3261 /* Skip current ctrlr_channel in a full reset sequence because 3262 * it is being deleted now. The qpair is already being disconnected. 3263 * We do not have to restart disconnecting it. 3264 */ 3265 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3266 } 3267 3268 /* We cannot release a reference to the poll group now. 3269 * The qpair may be disconnected asynchronously later. 3270 * We need to poll it until it is actually disconnected. 3271 * Just detach the qpair from the deleting ctrlr_channel. 
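 * The poll group channel reference and the nvme_qpair itself are released later by
 * nvme_qpair_delete() once the disconnect has actually completed.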
3272 */ 3273 nvme_qpair->ctrlr_ch = NULL; 3274 } else { 3275 assert(ctrlr_ch->reset_iter == NULL); 3276 3277 nvme_qpair_delete(nvme_qpair); 3278 } 3279 } 3280 3281 static inline struct spdk_io_channel * 3282 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3283 { 3284 if (spdk_unlikely(!group->accel_channel)) { 3285 group->accel_channel = spdk_accel_get_io_channel(); 3286 if (!group->accel_channel) { 3287 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3288 group); 3289 return NULL; 3290 } 3291 } 3292 3293 return group->accel_channel; 3294 } 3295 3296 static void 3297 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3298 uint32_t iov_cnt, uint32_t seed, 3299 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3300 { 3301 struct spdk_io_channel *accel_ch; 3302 struct nvme_poll_group *group = ctx; 3303 int rc; 3304 3305 assert(cb_fn != NULL); 3306 3307 accel_ch = bdev_nvme_get_accel_channel(group); 3308 if (spdk_unlikely(accel_ch == NULL)) { 3309 cb_fn(cb_arg, -ENOMEM); 3310 return; 3311 } 3312 3313 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3314 if (rc) { 3315 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3316 if (rc == -ENOMEM || rc == -EINVAL) { 3317 cb_fn(cb_arg, rc); 3318 } 3319 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3320 } 3321 } 3322 3323 static void 3324 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3325 { 3326 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3327 } 3328 3329 static void 3330 bdev_nvme_abort_sequence(void *seq) 3331 { 3332 spdk_accel_sequence_abort(seq); 3333 } 3334 3335 static void 3336 bdev_nvme_reverse_sequence(void *seq) 3337 { 3338 spdk_accel_sequence_reverse(seq); 3339 } 3340 3341 static int 3342 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3343 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3344 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3345 { 3346 struct spdk_io_channel *ch; 3347 struct nvme_poll_group *group = ctx; 3348 3349 ch = bdev_nvme_get_accel_channel(group); 3350 if (spdk_unlikely(ch == NULL)) { 3351 return -ENOMEM; 3352 } 3353 3354 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3355 domain, domain_ctx, seed, cb_fn, cb_arg); 3356 } 3357 3358 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3359 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3360 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3361 .append_crc32c = bdev_nvme_append_crc32c, 3362 .finish_sequence = bdev_nvme_finish_sequence, 3363 .reverse_sequence = bdev_nvme_reverse_sequence, 3364 .abort_sequence = bdev_nvme_abort_sequence, 3365 }; 3366 3367 static int 3368 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3369 { 3370 struct nvme_poll_group *group = ctx_buf; 3371 3372 TAILQ_INIT(&group->qpair_list); 3373 3374 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3375 if (group->group == NULL) { 3376 return -1; 3377 } 3378 3379 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3380 3381 if (group->poller == NULL) { 3382 spdk_nvme_poll_group_destroy(group->group); 3383 return -1; 3384 } 3385 3386 return 0; 3387 } 3388 3389 static void 3390 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3391 { 3392 struct 
nvme_poll_group *group = ctx_buf; 3393 3394 assert(TAILQ_EMPTY(&group->qpair_list)); 3395 3396 if (group->accel_channel) { 3397 spdk_put_io_channel(group->accel_channel); 3398 } 3399 3400 spdk_poller_unregister(&group->poller); 3401 if (spdk_nvme_poll_group_destroy(group->group)) { 3402 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3403 assert(false); 3404 } 3405 } 3406 3407 static struct spdk_io_channel * 3408 bdev_nvme_get_io_channel(void *ctx) 3409 { 3410 struct nvme_bdev *nvme_bdev = ctx; 3411 3412 return spdk_get_io_channel(nvme_bdev); 3413 } 3414 3415 static void * 3416 bdev_nvme_get_module_ctx(void *ctx) 3417 { 3418 struct nvme_bdev *nvme_bdev = ctx; 3419 struct nvme_ns *nvme_ns; 3420 3421 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3422 return NULL; 3423 } 3424 3425 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3426 if (!nvme_ns) { 3427 return NULL; 3428 } 3429 3430 return nvme_ns->ns; 3431 } 3432 3433 static const char * 3434 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3435 { 3436 switch (ana_state) { 3437 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3438 return "optimized"; 3439 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3440 return "non_optimized"; 3441 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3442 return "inaccessible"; 3443 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3444 return "persistent_loss"; 3445 case SPDK_NVME_ANA_CHANGE_STATE: 3446 return "change"; 3447 default: 3448 return NULL; 3449 } 3450 } 3451 3452 static int 3453 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3454 { 3455 struct spdk_memory_domain **_domains = NULL; 3456 struct nvme_bdev *nbdev = ctx; 3457 struct nvme_ns *nvme_ns; 3458 int i = 0, _array_size = array_size; 3459 int rc = 0; 3460 3461 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3462 if (domains && array_size >= i) { 3463 _domains = &domains[i]; 3464 } else { 3465 _domains = NULL; 3466 } 3467 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3468 if (rc > 0) { 3469 i += rc; 3470 if (_array_size >= rc) { 3471 _array_size -= rc; 3472 } else { 3473 _array_size = 0; 3474 } 3475 } else if (rc < 0) { 3476 return rc; 3477 } 3478 } 3479 3480 return i; 3481 } 3482 3483 static const char * 3484 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3485 { 3486 if (nvme_ctrlr->destruct) { 3487 return "deleting"; 3488 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3489 return "failed"; 3490 } else if (nvme_ctrlr->resetting) { 3491 return "resetting"; 3492 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3493 return "reconnect_is_delayed"; 3494 } else if (nvme_ctrlr->disabled) { 3495 return "disabled"; 3496 } else { 3497 return "enabled"; 3498 } 3499 } 3500 3501 void 3502 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3503 { 3504 struct spdk_nvme_transport_id *trid; 3505 const struct spdk_nvme_ctrlr_opts *opts; 3506 const struct spdk_nvme_ctrlr_data *cdata; 3507 struct nvme_path_id *path_id; 3508 3509 spdk_json_write_object_begin(w); 3510 3511 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3512 3513 #ifdef SPDK_CONFIG_NVME_CUSE 3514 size_t cuse_name_size = 128; 3515 char cuse_name[cuse_name_size]; 3516 3517 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3518 if (rc == 0) { 3519 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3520 } 3521 #endif 3522 trid = &nvme_ctrlr->active_path_id->trid; 3523 
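/* The active path is reported as "trid"; any remaining path IDs follow below in the
 * "alternate_trids" array.
 */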
spdk_json_write_named_object_begin(w, "trid"); 3524 nvme_bdev_dump_trid_json(trid, w); 3525 spdk_json_write_object_end(w); 3526 3527 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3528 if (path_id != NULL) { 3529 spdk_json_write_named_array_begin(w, "alternate_trids"); 3530 do { 3531 trid = &path_id->trid; 3532 spdk_json_write_object_begin(w); 3533 nvme_bdev_dump_trid_json(trid, w); 3534 spdk_json_write_object_end(w); 3535 3536 path_id = TAILQ_NEXT(path_id, link); 3537 } while (path_id != NULL); 3538 spdk_json_write_array_end(w); 3539 } 3540 3541 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3542 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3543 3544 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3545 spdk_json_write_named_object_begin(w, "host"); 3546 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3547 spdk_json_write_named_string(w, "addr", opts->src_addr); 3548 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3549 spdk_json_write_object_end(w); 3550 3551 spdk_json_write_object_end(w); 3552 } 3553 3554 static void 3555 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3556 struct nvme_ns *nvme_ns) 3557 { 3558 struct spdk_nvme_ns *ns; 3559 struct spdk_nvme_ctrlr *ctrlr; 3560 const struct spdk_nvme_ctrlr_data *cdata; 3561 const struct spdk_nvme_transport_id *trid; 3562 union spdk_nvme_vs_register vs; 3563 const struct spdk_nvme_ns_data *nsdata; 3564 char buf[128]; 3565 3566 ns = nvme_ns->ns; 3567 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3568 3569 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3570 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3571 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3572 3573 spdk_json_write_object_begin(w); 3574 3575 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3576 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3577 } 3578 3579 spdk_json_write_named_object_begin(w, "trid"); 3580 3581 nvme_bdev_dump_trid_json(trid, w); 3582 3583 spdk_json_write_object_end(w); 3584 3585 #ifdef SPDK_CONFIG_NVME_CUSE 3586 size_t cuse_name_size = 128; 3587 char cuse_name[cuse_name_size]; 3588 3589 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3590 cuse_name, &cuse_name_size); 3591 if (rc == 0) { 3592 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3593 } 3594 #endif 3595 3596 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3597 3598 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3599 3600 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3601 3602 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3603 spdk_str_trim(buf); 3604 spdk_json_write_named_string(w, "model_number", buf); 3605 3606 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3607 spdk_str_trim(buf); 3608 spdk_json_write_named_string(w, "serial_number", buf); 3609 3610 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3611 spdk_str_trim(buf); 3612 spdk_json_write_named_string(w, "firmware_revision", buf); 3613 3614 if (cdata->subnqn[0] != '\0') { 3615 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3616 } 3617 3618 spdk_json_write_named_object_begin(w, "oacs"); 3619 3620 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3621 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3622 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3623 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3624 3625 spdk_json_write_object_end(w); 3626 3627 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
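/* When ana_reporting is set, "ana_state" is also emitted in the "ns_data" object below. */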
3628 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3629 3630 spdk_json_write_object_end(w); 3631 3632 spdk_json_write_named_object_begin(w, "vs"); 3633 3634 spdk_json_write_name(w, "nvme_version"); 3635 if (vs.bits.ter) { 3636 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3637 } else { 3638 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3639 } 3640 3641 spdk_json_write_object_end(w); 3642 3643 nsdata = spdk_nvme_ns_get_data(ns); 3644 3645 spdk_json_write_named_object_begin(w, "ns_data"); 3646 3647 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3648 3649 if (cdata->cmic.ana_reporting) { 3650 spdk_json_write_named_string(w, "ana_state", 3651 _nvme_ana_state_str(nvme_ns->ana_state)); 3652 } 3653 3654 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3655 3656 spdk_json_write_object_end(w); 3657 3658 if (cdata->oacs.security) { 3659 spdk_json_write_named_object_begin(w, "security"); 3660 3661 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3662 3663 spdk_json_write_object_end(w); 3664 } 3665 3666 spdk_json_write_object_end(w); 3667 } 3668 3669 static const char * 3670 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3671 { 3672 switch (nbdev->mp_policy) { 3673 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3674 return "active_passive"; 3675 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3676 return "active_active"; 3677 default: 3678 assert(false); 3679 return "invalid"; 3680 } 3681 } 3682 3683 static int 3684 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3685 { 3686 struct nvme_bdev *nvme_bdev = ctx; 3687 struct nvme_ns *nvme_ns; 3688 3689 pthread_mutex_lock(&nvme_bdev->mutex); 3690 spdk_json_write_named_array_begin(w, "nvme"); 3691 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3692 nvme_namespace_info_json(w, nvme_ns); 3693 } 3694 spdk_json_write_array_end(w); 3695 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3696 pthread_mutex_unlock(&nvme_bdev->mutex); 3697 3698 return 0; 3699 } 3700 3701 static void 3702 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3703 { 3704 /* No config per bdev needed */ 3705 } 3706 3707 static uint64_t 3708 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3709 { 3710 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3711 struct nvme_io_path *io_path; 3712 struct nvme_poll_group *group; 3713 uint64_t spin_time = 0; 3714 3715 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3716 group = io_path->qpair->group; 3717 3718 if (!group || !group->collect_spin_stat) { 3719 continue; 3720 } 3721 3722 if (group->end_ticks != 0) { 3723 group->spin_ticks += (group->end_ticks - group->start_ticks); 3724 group->end_ticks = 0; 3725 } 3726 3727 spin_time += group->spin_ticks; 3728 group->start_ticks = 0; 3729 group->spin_ticks = 0; 3730 } 3731 3732 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3733 } 3734 3735 static void 3736 bdev_nvme_reset_device_stat(void *ctx) 3737 { 3738 struct nvme_bdev *nbdev = ctx; 3739 3740 if (nbdev->err_stat != NULL) { 3741 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3742 } 3743 } 3744 3745 /* JSON string should be lowercases and underscore delimited string. 
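 * For example, a status string such as "DATA TRANSFER ERROR" is emitted as the
 * JSON key "data_transfer_error".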
*/ 3746 static void 3747 bdev_nvme_format_nvme_status(char *dst, const char *src) 3748 { 3749 char tmp[256]; 3750 3751 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3752 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3753 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3754 spdk_strlwr(dst); 3755 } 3756 3757 static void 3758 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3759 { 3760 struct nvme_bdev *nbdev = ctx; 3761 struct spdk_nvme_status status = {}; 3762 uint16_t sct, sc; 3763 char status_json[256]; 3764 const char *status_str; 3765 3766 if (nbdev->err_stat == NULL) { 3767 return; 3768 } 3769 3770 spdk_json_write_named_object_begin(w, "nvme_error"); 3771 3772 spdk_json_write_named_object_begin(w, "status_type"); 3773 for (sct = 0; sct < 8; sct++) { 3774 if (nbdev->err_stat->status_type[sct] == 0) { 3775 continue; 3776 } 3777 status.sct = sct; 3778 3779 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3780 assert(status_str != NULL); 3781 bdev_nvme_format_nvme_status(status_json, status_str); 3782 3783 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3784 } 3785 spdk_json_write_object_end(w); 3786 3787 spdk_json_write_named_object_begin(w, "status_code"); 3788 for (sct = 0; sct < 4; sct++) { 3789 status.sct = sct; 3790 for (sc = 0; sc < 256; sc++) { 3791 if (nbdev->err_stat->status[sct][sc] == 0) { 3792 continue; 3793 } 3794 status.sc = sc; 3795 3796 status_str = spdk_nvme_cpl_get_status_string(&status); 3797 assert(status_str != NULL); 3798 bdev_nvme_format_nvme_status(status_json, status_str); 3799 3800 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3801 } 3802 } 3803 spdk_json_write_object_end(w); 3804 3805 spdk_json_write_object_end(w); 3806 } 3807 3808 static bool 3809 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3810 { 3811 struct nvme_bdev *nbdev = ctx; 3812 struct spdk_nvme_ctrlr *ctrlr; 3813 3814 switch (type) { 3815 case SPDK_BDEV_IO_TYPE_WRITE: 3816 case SPDK_BDEV_IO_TYPE_READ: 3817 break; 3818 default: 3819 return false; 3820 } 3821 3822 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3823 assert(ctrlr != NULL); 3824 3825 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3826 } 3827 3828 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3829 .destruct = bdev_nvme_destruct, 3830 .submit_request = bdev_nvme_submit_request, 3831 .io_type_supported = bdev_nvme_io_type_supported, 3832 .get_io_channel = bdev_nvme_get_io_channel, 3833 .dump_info_json = bdev_nvme_dump_info_json, 3834 .write_config_json = bdev_nvme_write_config_json, 3835 .get_spin_time = bdev_nvme_get_spin_time, 3836 .get_module_ctx = bdev_nvme_get_module_ctx, 3837 .get_memory_domains = bdev_nvme_get_memory_domains, 3838 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3839 .reset_device_stat = bdev_nvme_reset_device_stat, 3840 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3841 }; 3842 3843 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3844 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3845 3846 static int 3847 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3848 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3849 { 3850 struct spdk_nvme_ana_group_descriptor *copied_desc; 3851 uint8_t *orig_desc; 3852 uint32_t i, desc_size, copy_len; 3853 int rc = 0; 3854 3855 if (nvme_ctrlr->ana_log_page == NULL) { 3856 return -EINVAL; 3857 } 3858 3859 copied_desc = nvme_ctrlr->copied_ana_desc; 
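/* ANA group descriptors are variable length (a header plus num_of_nsid NSID entries),
 * so the loop below copies one descriptor at a time into copied_ana_desc and advances
 * the cursor by the computed size.
 */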
3860 3861 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3862 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3863 3864 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3865 memcpy(copied_desc, orig_desc, copy_len); 3866 3867 rc = cb_fn(copied_desc, cb_arg); 3868 if (rc != 0) { 3869 break; 3870 } 3871 3872 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3873 copied_desc->num_of_nsid * sizeof(uint32_t); 3874 orig_desc += desc_size; 3875 copy_len -= desc_size; 3876 } 3877 3878 return rc; 3879 } 3880 3881 static int 3882 nvme_ns_ana_transition_timedout(void *ctx) 3883 { 3884 struct nvme_ns *nvme_ns = ctx; 3885 3886 spdk_poller_unregister(&nvme_ns->anatt_timer); 3887 nvme_ns->ana_transition_timedout = true; 3888 3889 return SPDK_POLLER_BUSY; 3890 } 3891 3892 static void 3893 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3894 const struct spdk_nvme_ana_group_descriptor *desc) 3895 { 3896 const struct spdk_nvme_ctrlr_data *cdata; 3897 3898 nvme_ns->ana_group_id = desc->ana_group_id; 3899 nvme_ns->ana_state = desc->ana_state; 3900 nvme_ns->ana_state_updating = false; 3901 3902 switch (nvme_ns->ana_state) { 3903 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3904 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3905 nvme_ns->ana_transition_timedout = false; 3906 spdk_poller_unregister(&nvme_ns->anatt_timer); 3907 break; 3908 3909 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3910 case SPDK_NVME_ANA_CHANGE_STATE: 3911 if (nvme_ns->anatt_timer != NULL) { 3912 break; 3913 } 3914 3915 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3916 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3917 nvme_ns, 3918 cdata->anatt * SPDK_SEC_TO_USEC); 3919 break; 3920 default: 3921 break; 3922 } 3923 } 3924 3925 static int 3926 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3927 { 3928 struct nvme_ns *nvme_ns = cb_arg; 3929 uint32_t i; 3930 3931 for (i = 0; i < desc->num_of_nsid; i++) { 3932 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3933 continue; 3934 } 3935 3936 _nvme_ns_set_ana_state(nvme_ns, desc); 3937 return 1; 3938 } 3939 3940 return 0; 3941 } 3942 3943 static struct spdk_uuid 3944 nvme_generate_uuid(const char *sn, uint32_t nsid) 3945 { 3946 struct spdk_uuid new_uuid, namespace_uuid; 3947 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3948 /* This namespace UUID was generated using uuid_generate() method. 
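* It is only used as the fixed name space for the SHA-1 based UUID that nvme_generate_uuid() derives from the controller serial number and NSID, so a given (sn, nsid) pair always maps to the same generated bdev UUID.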
*/ 3949 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3950 int size; 3951 3952 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3953 3954 spdk_uuid_set_null(&new_uuid); 3955 spdk_uuid_set_null(&namespace_uuid); 3956 3957 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3958 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3959 3960 spdk_uuid_parse(&namespace_uuid, namespace_str); 3961 3962 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3963 3964 return new_uuid; 3965 } 3966 3967 static int 3968 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3969 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3970 uint32_t prchk_flags, void *ctx) 3971 { 3972 const struct spdk_uuid *uuid; 3973 const uint8_t *nguid; 3974 const struct spdk_nvme_ctrlr_data *cdata; 3975 const struct spdk_nvme_ns_data *nsdata; 3976 const struct spdk_nvme_ctrlr_opts *opts; 3977 enum spdk_nvme_csi csi; 3978 uint32_t atomic_bs, phys_bs, bs; 3979 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3980 3981 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3982 csi = spdk_nvme_ns_get_csi(ns); 3983 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3984 3985 switch (csi) { 3986 case SPDK_NVME_CSI_NVM: 3987 disk->product_name = "NVMe disk"; 3988 break; 3989 case SPDK_NVME_CSI_ZNS: 3990 disk->product_name = "NVMe ZNS disk"; 3991 disk->zoned = true; 3992 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3993 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3994 spdk_nvme_ns_get_extended_sector_size(ns); 3995 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 3996 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 3997 break; 3998 default: 3999 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4000 return -ENOTSUP; 4001 } 4002 4003 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4004 if (!disk->name) { 4005 return -ENOMEM; 4006 } 4007 4008 disk->write_cache = 0; 4009 if (cdata->vwc.present) { 4010 /* Enable if the Volatile Write Cache exists */ 4011 disk->write_cache = 1; 4012 } 4013 if (cdata->oncs.write_zeroes) { 4014 disk->max_write_zeroes = UINT16_MAX + 1; 4015 } 4016 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4017 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4018 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4019 /* NVMe driver will split one request into multiple requests 4020 * based on MDTS and stripe boundary, the bdev layer will use 4021 * max_segment_size and max_num_segments to split one big IO 4022 * into multiple requests, then small request can't run out 4023 * of NVMe internal requests data structure. 
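* For example (illustrative numbers only): with io_queue_requests set to 1024, the bdev layer caps each I/O at 512 segments, leaving the remaining request objects as headroom for the additional splitting done inside the NVMe driver.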
4024 */ 4025 if (opts && opts->io_queue_requests) { 4026 disk->max_num_segments = opts->io_queue_requests / 2; 4027 } 4028 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4029 4030 nguid = spdk_nvme_ns_get_nguid(ns); 4031 if (!nguid) { 4032 uuid = spdk_nvme_ns_get_uuid(ns); 4033 if (uuid) { 4034 disk->uuid = *uuid; 4035 } else if (g_opts.generate_uuids) { 4036 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4037 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4038 } 4039 } else { 4040 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4041 } 4042 4043 nsdata = spdk_nvme_ns_get_data(ns); 4044 bs = spdk_nvme_ns_get_sector_size(ns); 4045 atomic_bs = bs; 4046 phys_bs = bs; 4047 if (nsdata->nabo == 0) { 4048 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4049 atomic_bs = bs * (1 + nsdata->nawupf); 4050 } else { 4051 atomic_bs = bs * (1 + cdata->awupf); 4052 } 4053 } 4054 if (nsdata->nsfeat.optperf) { 4055 phys_bs = bs * (1 + nsdata->npwg); 4056 } 4057 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4058 4059 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4060 if (disk->md_len != 0) { 4061 disk->md_interleave = nsdata->flbas.extended; 4062 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4063 if (disk->dif_type != SPDK_DIF_DISABLE) { 4064 disk->dif_is_head_of_md = nsdata->dps.md_start; 4065 disk->dif_check_flags = prchk_flags; 4066 } 4067 } 4068 4069 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4070 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4071 disk->acwu = 0; 4072 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4073 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4074 } else { 4075 disk->acwu = cdata->acwu + 1; /* 0-based */ 4076 } 4077 4078 if (cdata->oncs.copy) { 4079 /* For now bdev interface allows only single segment copy */ 4080 disk->max_copy = nsdata->mssrl; 4081 } 4082 4083 disk->ctxt = ctx; 4084 disk->fn_table = &nvmelib_fn_table; 4085 disk->module = &nvme_if; 4086 4087 return 0; 4088 } 4089 4090 static struct nvme_bdev * 4091 nvme_bdev_alloc(void) 4092 { 4093 struct nvme_bdev *bdev; 4094 int rc; 4095 4096 bdev = calloc(1, sizeof(*bdev)); 4097 if (!bdev) { 4098 SPDK_ERRLOG("bdev calloc() failed\n"); 4099 return NULL; 4100 } 4101 4102 if (g_opts.nvme_error_stat) { 4103 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4104 if (!bdev->err_stat) { 4105 SPDK_ERRLOG("err_stat calloc() failed\n"); 4106 free(bdev); 4107 return NULL; 4108 } 4109 } 4110 4111 rc = pthread_mutex_init(&bdev->mutex, NULL); 4112 if (rc != 0) { 4113 free(bdev->err_stat); 4114 free(bdev); 4115 return NULL; 4116 } 4117 4118 bdev->ref = 1; 4119 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4120 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4121 bdev->rr_min_io = UINT32_MAX; 4122 TAILQ_INIT(&bdev->nvme_ns_list); 4123 4124 return bdev; 4125 } 4126 4127 static int 4128 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4129 { 4130 struct nvme_bdev *bdev; 4131 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4132 int rc; 4133 4134 bdev = nvme_bdev_alloc(); 4135 if (bdev == NULL) { 4136 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4137 return -ENOMEM; 4138 } 4139 4140 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4141 4142 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4143 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4144 if (rc != 0) { 4145 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4146 nvme_bdev_free(bdev); 4147 return rc; 4148 } 4149 
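/* Each nvme_bdev is registered as its own io_device so that a per-thread nvme_bdev_channel (which holds this bdev's list of nvme_io_paths) can be created on whichever thread opens an I/O channel for the bdev. */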
4150 spdk_io_device_register(bdev, 4151 bdev_nvme_create_bdev_channel_cb, 4152 bdev_nvme_destroy_bdev_channel_cb, 4153 sizeof(struct nvme_bdev_channel), 4154 bdev->disk.name); 4155 4156 nvme_ns->bdev = bdev; 4157 bdev->nsid = nvme_ns->id; 4158 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4159 4160 bdev->nbdev_ctrlr = nbdev_ctrlr; 4161 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4162 4163 rc = spdk_bdev_register(&bdev->disk); 4164 if (rc != 0) { 4165 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4166 spdk_io_device_unregister(bdev, NULL); 4167 nvme_ns->bdev = NULL; 4168 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4169 nvme_bdev_free(bdev); 4170 return rc; 4171 } 4172 4173 return 0; 4174 } 4175 4176 static bool 4177 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4178 { 4179 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4180 const struct spdk_uuid *uuid1, *uuid2; 4181 4182 nsdata1 = spdk_nvme_ns_get_data(ns1); 4183 nsdata2 = spdk_nvme_ns_get_data(ns2); 4184 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4185 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4186 4187 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4188 nsdata1->eui64 == nsdata2->eui64 && 4189 ((uuid1 == NULL && uuid2 == NULL) || 4190 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4191 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4192 } 4193 4194 static bool 4195 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4196 struct spdk_nvme_ctrlr_opts *opts) 4197 { 4198 struct nvme_probe_skip_entry *entry; 4199 4200 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4201 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4202 return false; 4203 } 4204 } 4205 4206 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4207 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4208 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4209 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4210 opts->disable_read_ana_log_page = true; 4211 4212 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4213 4214 return true; 4215 } 4216 4217 static void 4218 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4219 { 4220 struct nvme_ctrlr *nvme_ctrlr = ctx; 4221 4222 if (spdk_nvme_cpl_is_error(cpl)) { 4223 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4224 cpl->status.sct); 4225 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4226 } else if (cpl->cdw0 & 0x1) { 4227 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4228 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4229 } 4230 } 4231 4232 static void 4233 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4234 struct spdk_nvme_qpair *qpair, uint16_t cid) 4235 { 4236 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4237 union spdk_nvme_csts_register csts; 4238 int rc; 4239 4240 assert(nvme_ctrlr->ctrlr == ctrlr); 4241 4242 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4243 4244 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4245 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4246 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4247 * completion recursively. 
4248 */ 4249 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4250 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4251 if (csts.bits.cfs) { 4252 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4253 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4254 return; 4255 } 4256 } 4257 4258 switch (g_opts.action_on_timeout) { 4259 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4260 if (qpair) { 4261 /* Don't send abort to ctrlr when ctrlr is not available. */ 4262 pthread_mutex_lock(&nvme_ctrlr->mutex); 4263 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4264 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4265 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4266 return; 4267 } 4268 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4269 4270 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4271 nvme_abort_cpl, nvme_ctrlr); 4272 if (rc == 0) { 4273 return; 4274 } 4275 4276 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4277 } 4278 4279 /* FALLTHROUGH */ 4280 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4281 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4282 break; 4283 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4284 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4285 break; 4286 default: 4287 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4288 break; 4289 } 4290 } 4291 4292 static struct nvme_ns * 4293 nvme_ns_alloc(void) 4294 { 4295 struct nvme_ns *nvme_ns; 4296 4297 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4298 if (nvme_ns == NULL) { 4299 return NULL; 4300 } 4301 4302 if (g_opts.io_path_stat) { 4303 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4304 if (nvme_ns->stat == NULL) { 4305 free(nvme_ns); 4306 return NULL; 4307 } 4308 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4309 } 4310 4311 return nvme_ns; 4312 } 4313 4314 static void 4315 nvme_ns_free(struct nvme_ns *nvme_ns) 4316 { 4317 free(nvme_ns->stat); 4318 free(nvme_ns); 4319 } 4320 4321 static void 4322 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4323 { 4324 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4325 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4326 4327 if (rc == 0) { 4328 nvme_ns->probe_ctx = NULL; 4329 pthread_mutex_lock(&nvme_ctrlr->mutex); 4330 nvme_ctrlr->ref++; 4331 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4332 } else { 4333 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4334 nvme_ns_free(nvme_ns); 4335 } 4336 4337 if (ctx) { 4338 ctx->populates_in_progress--; 4339 if (ctx->populates_in_progress == 0) { 4340 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4341 } 4342 } 4343 } 4344 4345 static void 4346 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4347 { 4348 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4349 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4350 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4351 int rc; 4352 4353 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4354 if (rc != 0) { 4355 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4356 } 4357 4358 spdk_for_each_channel_continue(i, rc); 4359 } 4360 4361 static void 4362 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4363 { 4364 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4365 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4366 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4367 struct nvme_io_path *io_path; 4368 4369 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4370 if (io_path != NULL) { 4371 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4372 } 4373 4374 spdk_for_each_channel_continue(i, 0); 4375 } 4376 4377 static void 4378 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4379 { 4380 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4381 4382 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4383 } 4384 4385 static void 4386 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4387 { 4388 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4389 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4390 4391 if (status == 0) { 4392 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4393 } else { 4394 /* Delete the added io_paths and fail populating the namespace. */ 4395 spdk_for_each_channel(bdev, 4396 bdev_nvme_delete_io_path, 4397 nvme_ns, 4398 bdev_nvme_add_io_path_failed); 4399 } 4400 } 4401 4402 static int 4403 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4404 { 4405 struct nvme_ns *tmp_ns; 4406 const struct spdk_nvme_ns_data *nsdata; 4407 4408 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4409 if (!nsdata->nmic.can_share) { 4410 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4411 return -EINVAL; 4412 } 4413 4414 pthread_mutex_lock(&bdev->mutex); 4415 4416 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4417 assert(tmp_ns != NULL); 4418 4419 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4420 pthread_mutex_unlock(&bdev->mutex); 4421 SPDK_ERRLOG("Namespaces are not identical.\n"); 4422 return -EINVAL; 4423 } 4424 4425 bdev->ref++; 4426 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4427 nvme_ns->bdev = bdev; 4428 4429 pthread_mutex_unlock(&bdev->mutex); 4430 4431 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
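* spdk_for_each_channel() visits every existing nvme_bdev_channel on its owning thread; bdev_nvme_add_io_path_done() then either completes the namespace populate or, on failure, removes the io_paths that were just added.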
*/ 4432 spdk_for_each_channel(bdev, 4433 bdev_nvme_add_io_path, 4434 nvme_ns, 4435 bdev_nvme_add_io_path_done); 4436 4437 return 0; 4438 } 4439 4440 static void 4441 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4442 { 4443 struct spdk_nvme_ns *ns; 4444 struct nvme_bdev *bdev; 4445 int rc = 0; 4446 4447 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4448 if (!ns) { 4449 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4450 rc = -EINVAL; 4451 goto done; 4452 } 4453 4454 nvme_ns->ns = ns; 4455 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4456 4457 if (nvme_ctrlr->ana_log_page != NULL) { 4458 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4459 } 4460 4461 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4462 if (bdev == NULL) { 4463 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4464 } else { 4465 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4466 if (rc == 0) { 4467 return; 4468 } 4469 } 4470 done: 4471 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4472 } 4473 4474 static void 4475 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4476 { 4477 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4478 4479 assert(nvme_ctrlr != NULL); 4480 4481 pthread_mutex_lock(&nvme_ctrlr->mutex); 4482 4483 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4484 4485 if (nvme_ns->bdev != NULL) { 4486 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4487 return; 4488 } 4489 4490 nvme_ns_free(nvme_ns); 4491 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4492 4493 nvme_ctrlr_release(nvme_ctrlr); 4494 } 4495 4496 static void 4497 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4498 { 4499 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4500 4501 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4502 } 4503 4504 static void 4505 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4506 { 4507 struct nvme_bdev *bdev; 4508 4509 spdk_poller_unregister(&nvme_ns->anatt_timer); 4510 4511 bdev = nvme_ns->bdev; 4512 if (bdev != NULL) { 4513 pthread_mutex_lock(&bdev->mutex); 4514 4515 assert(bdev->ref > 0); 4516 bdev->ref--; 4517 if (bdev->ref == 0) { 4518 pthread_mutex_unlock(&bdev->mutex); 4519 4520 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4521 } else { 4522 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4523 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4524 * and clear nvme_ns->bdev here. 4525 */ 4526 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4527 nvme_ns->bdev = NULL; 4528 4529 pthread_mutex_unlock(&bdev->mutex); 4530 4531 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4532 * we call depopulate_namespace_done() to avoid use-after-free. 4533 */ 4534 spdk_for_each_channel(bdev, 4535 bdev_nvme_delete_io_path, 4536 nvme_ns, 4537 bdev_nvme_delete_io_path_done); 4538 return; 4539 } 4540 } 4541 4542 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4543 } 4544 4545 static void 4546 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4547 struct nvme_async_probe_ctx *ctx) 4548 { 4549 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4550 struct nvme_ns *nvme_ns, *next; 4551 struct spdk_nvme_ns *ns; 4552 struct nvme_bdev *bdev; 4553 uint32_t nsid; 4554 int rc; 4555 uint64_t num_sectors; 4556 4557 if (ctx) { 4558 /* Initialize this count to 1 to handle the populate functions 4559 * calling nvme_ctrlr_populate_namespace_done() immediately. 
4560 */ 4561 ctx->populates_in_progress = 1; 4562 } 4563 4564 /* First loop over our existing namespaces and see if they have been 4565 * removed. */ 4566 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4567 while (nvme_ns != NULL) { 4568 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4569 4570 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4571 /* NS is still there but attributes may have changed */ 4572 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4573 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4574 bdev = nvme_ns->bdev; 4575 assert(bdev != NULL); 4576 if (bdev->disk.blockcnt != num_sectors) { 4577 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4578 nvme_ns->id, 4579 bdev->disk.name, 4580 bdev->disk.blockcnt, 4581 num_sectors); 4582 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4583 if (rc != 0) { 4584 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4585 bdev->disk.name, rc); 4586 } 4587 } 4588 } else { 4589 /* Namespace was removed */ 4590 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4591 } 4592 4593 nvme_ns = next; 4594 } 4595 4596 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4597 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4598 while (nsid != 0) { 4599 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4600 4601 if (nvme_ns == NULL) { 4602 /* Found a new one */ 4603 nvme_ns = nvme_ns_alloc(); 4604 if (nvme_ns == NULL) { 4605 SPDK_ERRLOG("Failed to allocate namespace\n"); 4606 /* This just fails to attach the namespace. It may work on a future attempt. Advance to the next active NSID before continuing so that a persistent allocation failure cannot leave this loop spinning on the same NSID. */ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4607 continue; 4608 } 4609 4610 nvme_ns->id = nsid; 4611 nvme_ns->ctrlr = nvme_ctrlr; 4612 4613 nvme_ns->bdev = NULL; 4614 4615 if (ctx) { 4616 ctx->populates_in_progress++; 4617 } 4618 nvme_ns->probe_ctx = ctx; 4619 4620 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4621 4622 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4623 } 4624 4625 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4626 } 4627 4628 if (ctx) { 4629 /* Decrement this count now that the loop is over to account 4630 * for the one we started with. If the count is then 0, we 4631 * know any populate_namespace functions completed immediately, 4632 * so we'll kick the callback here.
4633 */ 4634 ctx->populates_in_progress--; 4635 if (ctx->populates_in_progress == 0) { 4636 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4637 } 4638 } 4639 4640 } 4641 4642 static void 4643 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4644 { 4645 struct nvme_ns *nvme_ns, *tmp; 4646 4647 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4648 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4649 } 4650 } 4651 4652 static uint32_t 4653 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4654 { 4655 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4656 const struct spdk_nvme_ctrlr_data *cdata; 4657 uint32_t nsid, ns_count = 0; 4658 4659 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4660 4661 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4662 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4663 ns_count++; 4664 } 4665 4666 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4667 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4668 sizeof(uint32_t); 4669 } 4670 4671 static int 4672 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4673 void *cb_arg) 4674 { 4675 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4676 struct nvme_ns *nvme_ns; 4677 uint32_t i, nsid; 4678 4679 for (i = 0; i < desc->num_of_nsid; i++) { 4680 nsid = desc->nsid[i]; 4681 if (nsid == 0) { 4682 continue; 4683 } 4684 4685 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4686 4687 assert(nvme_ns != NULL); 4688 if (nvme_ns == NULL) { 4689 /* Target told us that an inactive namespace had an ANA change */ 4690 continue; 4691 } 4692 4693 _nvme_ns_set_ana_state(nvme_ns, desc); 4694 } 4695 4696 return 0; 4697 } 4698 4699 static void 4700 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4701 { 4702 struct nvme_ns *nvme_ns; 4703 4704 spdk_free(nvme_ctrlr->ana_log_page); 4705 nvme_ctrlr->ana_log_page = NULL; 4706 4707 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4708 nvme_ns != NULL; 4709 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4710 nvme_ns->ana_state_updating = false; 4711 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4712 } 4713 } 4714 4715 static void 4716 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4717 { 4718 struct nvme_ctrlr *nvme_ctrlr = ctx; 4719 4720 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4721 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4722 nvme_ctrlr); 4723 } else { 4724 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4725 } 4726 4727 pthread_mutex_lock(&nvme_ctrlr->mutex); 4728 4729 assert(nvme_ctrlr->ana_log_page_updating == true); 4730 nvme_ctrlr->ana_log_page_updating = false; 4731 4732 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4733 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4734 4735 nvme_ctrlr_unregister(nvme_ctrlr); 4736 } else { 4737 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4738 4739 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4740 } 4741 } 4742 4743 static int 4744 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4745 { 4746 uint32_t ana_log_page_size; 4747 int rc; 4748 4749 if (nvme_ctrlr->ana_log_page == NULL) { 4750 return -EINVAL; 4751 } 4752 4753 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4754 4755 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4756 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4757 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4758 
return -EINVAL; 4759 } 4760 4761 pthread_mutex_lock(&nvme_ctrlr->mutex); 4762 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4763 nvme_ctrlr->ana_log_page_updating) { 4764 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4765 return -EBUSY; 4766 } 4767 4768 nvme_ctrlr->ana_log_page_updating = true; 4769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4770 4771 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4772 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4773 SPDK_NVME_GLOBAL_NS_TAG, 4774 nvme_ctrlr->ana_log_page, 4775 ana_log_page_size, 0, 4776 nvme_ctrlr_read_ana_log_page_done, 4777 nvme_ctrlr); 4778 if (rc != 0) { 4779 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4780 } 4781 4782 return rc; 4783 } 4784 4785 static void 4786 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4787 { 4788 } 4789 4790 struct bdev_nvme_set_preferred_path_ctx { 4791 struct spdk_bdev_desc *desc; 4792 struct nvme_ns *nvme_ns; 4793 bdev_nvme_set_preferred_path_cb cb_fn; 4794 void *cb_arg; 4795 }; 4796 4797 static void 4798 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4799 { 4800 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4801 4802 assert(ctx != NULL); 4803 assert(ctx->desc != NULL); 4804 assert(ctx->cb_fn != NULL); 4805 4806 spdk_bdev_close(ctx->desc); 4807 4808 ctx->cb_fn(ctx->cb_arg, status); 4809 4810 free(ctx); 4811 } 4812 4813 static void 4814 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4815 { 4816 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4817 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4818 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4819 struct nvme_io_path *io_path, *prev; 4820 4821 prev = NULL; 4822 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4823 if (io_path->nvme_ns == ctx->nvme_ns) { 4824 break; 4825 } 4826 prev = io_path; 4827 } 4828 4829 if (io_path != NULL) { 4830 if (prev != NULL) { 4831 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4832 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4833 } 4834 4835 /* We can set io_path to nbdev_ch->current_io_path directly here. 4836 * However, it needs to be conditional. To simplify the code, 4837 * just clear nbdev_ch->current_io_path and let find_io_path() 4838 * fill it. 4839 * 4840 * Automatic failback may be disabled. Hence even if the io_path is 4841 * already at the head, clear nbdev_ch->current_io_path. 4842 */ 4843 bdev_nvme_clear_current_io_path(nbdev_ch); 4844 } 4845 4846 spdk_for_each_channel_continue(i, 0); 4847 } 4848 4849 static struct nvme_ns * 4850 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4851 { 4852 struct nvme_ns *nvme_ns, *prev; 4853 const struct spdk_nvme_ctrlr_data *cdata; 4854 4855 prev = NULL; 4856 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4857 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4858 4859 if (cdata->cntlid == cntlid) { 4860 break; 4861 } 4862 prev = nvme_ns; 4863 } 4864 4865 if (nvme_ns != NULL && prev != NULL) { 4866 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4867 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4868 } 4869 4870 return nvme_ns; 4871 } 4872 4873 /* This function supports only multipath mode. There is only a single I/O path 4874 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4875 * head of the I/O path list for each NVMe bdev channel. 
4876 * 4877 * NVMe bdev channel may be acquired after completing this function. move the 4878 * matched namespace to the head of the namespace list for the NVMe bdev too. 4879 */ 4880 void 4881 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4882 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4883 { 4884 struct bdev_nvme_set_preferred_path_ctx *ctx; 4885 struct spdk_bdev *bdev; 4886 struct nvme_bdev *nbdev; 4887 int rc = 0; 4888 4889 assert(cb_fn != NULL); 4890 4891 ctx = calloc(1, sizeof(*ctx)); 4892 if (ctx == NULL) { 4893 SPDK_ERRLOG("Failed to alloc context.\n"); 4894 rc = -ENOMEM; 4895 goto err_alloc; 4896 } 4897 4898 ctx->cb_fn = cb_fn; 4899 ctx->cb_arg = cb_arg; 4900 4901 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4902 if (rc != 0) { 4903 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4904 goto err_open; 4905 } 4906 4907 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4908 4909 if (bdev->module != &nvme_if) { 4910 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4911 rc = -ENODEV; 4912 goto err_bdev; 4913 } 4914 4915 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4916 4917 pthread_mutex_lock(&nbdev->mutex); 4918 4919 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4920 if (ctx->nvme_ns == NULL) { 4921 pthread_mutex_unlock(&nbdev->mutex); 4922 4923 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4924 rc = -ENODEV; 4925 goto err_bdev; 4926 } 4927 4928 pthread_mutex_unlock(&nbdev->mutex); 4929 4930 spdk_for_each_channel(nbdev, 4931 _bdev_nvme_set_preferred_path, 4932 ctx, 4933 bdev_nvme_set_preferred_path_done); 4934 return; 4935 4936 err_bdev: 4937 spdk_bdev_close(ctx->desc); 4938 err_open: 4939 free(ctx); 4940 err_alloc: 4941 cb_fn(cb_arg, rc); 4942 } 4943 4944 struct bdev_nvme_set_multipath_policy_ctx { 4945 struct spdk_bdev_desc *desc; 4946 bdev_nvme_set_multipath_policy_cb cb_fn; 4947 void *cb_arg; 4948 }; 4949 4950 static void 4951 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4952 { 4953 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4954 4955 assert(ctx != NULL); 4956 assert(ctx->desc != NULL); 4957 assert(ctx->cb_fn != NULL); 4958 4959 spdk_bdev_close(ctx->desc); 4960 4961 ctx->cb_fn(ctx->cb_arg, status); 4962 4963 free(ctx); 4964 } 4965 4966 static void 4967 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4968 { 4969 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4970 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4971 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4972 4973 nbdev_ch->mp_policy = nbdev->mp_policy; 4974 nbdev_ch->mp_selector = nbdev->mp_selector; 4975 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4976 bdev_nvme_clear_current_io_path(nbdev_ch); 4977 4978 spdk_for_each_channel_continue(i, 0); 4979 } 4980 4981 void 4982 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4983 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4984 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4985 { 4986 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4987 struct spdk_bdev *bdev; 4988 struct nvme_bdev *nbdev; 4989 int rc; 4990 4991 assert(cb_fn != NULL); 4992 4993 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4994 if (rr_min_io == UINT32_MAX) { 4995 rr_min_io = 1; 4996 } else if (rr_min_io == 0) { 4997 rc = -EINVAL; 
4998 goto exit; 4999 } 5000 } else if (rr_min_io != UINT32_MAX) { 5001 rc = -EINVAL; 5002 goto exit; 5003 } 5004 5005 ctx = calloc(1, sizeof(*ctx)); 5006 if (ctx == NULL) { 5007 SPDK_ERRLOG("Failed to alloc context.\n"); 5008 rc = -ENOMEM; 5009 goto exit; 5010 } 5011 5012 ctx->cb_fn = cb_fn; 5013 ctx->cb_arg = cb_arg; 5014 5015 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5016 if (rc != 0) { 5017 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5018 rc = -ENODEV; 5019 goto err_open; 5020 } 5021 5022 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5023 if (bdev->module != &nvme_if) { 5024 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5025 rc = -ENODEV; 5026 goto err_module; 5027 } 5028 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5029 5030 pthread_mutex_lock(&nbdev->mutex); 5031 nbdev->mp_policy = policy; 5032 nbdev->mp_selector = selector; 5033 nbdev->rr_min_io = rr_min_io; 5034 pthread_mutex_unlock(&nbdev->mutex); 5035 5036 spdk_for_each_channel(nbdev, 5037 _bdev_nvme_set_multipath_policy, 5038 ctx, 5039 bdev_nvme_set_multipath_policy_done); 5040 return; 5041 5042 err_module: 5043 spdk_bdev_close(ctx->desc); 5044 err_open: 5045 free(ctx); 5046 exit: 5047 cb_fn(cb_arg, rc); 5048 } 5049 5050 static void 5051 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5052 { 5053 struct nvme_ctrlr *nvme_ctrlr = arg; 5054 union spdk_nvme_async_event_completion event; 5055 5056 if (spdk_nvme_cpl_is_error(cpl)) { 5057 SPDK_WARNLOG("AER request execute failed\n"); 5058 return; 5059 } 5060 5061 event.raw = cpl->cdw0; 5062 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5063 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5064 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5065 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5066 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5067 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5068 } 5069 } 5070 5071 static void 5072 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5073 { 5074 if (ctx->cb_fn) { 5075 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5076 } 5077 5078 ctx->namespaces_populated = true; 5079 if (ctx->probe_done) { 5080 /* The probe was already completed, so we need to free the context 5081 * here. This can happen for cases like OCSSD, where we need to 5082 * send additional commands to the SSD after attach. 
5083 */ 5084 free(ctx); 5085 } 5086 } 5087 5088 static void 5089 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5090 struct nvme_async_probe_ctx *ctx) 5091 { 5092 spdk_io_device_register(nvme_ctrlr, 5093 bdev_nvme_create_ctrlr_channel_cb, 5094 bdev_nvme_destroy_ctrlr_channel_cb, 5095 sizeof(struct nvme_ctrlr_channel), 5096 nvme_ctrlr->nbdev_ctrlr->name); 5097 5098 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5099 } 5100 5101 static void 5102 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5103 { 5104 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5105 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5106 5107 nvme_ctrlr->probe_ctx = NULL; 5108 5109 if (spdk_nvme_cpl_is_error(cpl)) { 5110 nvme_ctrlr_delete(nvme_ctrlr); 5111 5112 if (ctx != NULL) { 5113 ctx->reported_bdevs = 0; 5114 populate_namespaces_cb(ctx, -1); 5115 } 5116 return; 5117 } 5118 5119 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5120 } 5121 5122 static int 5123 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5124 struct nvme_async_probe_ctx *ctx) 5125 { 5126 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5127 const struct spdk_nvme_ctrlr_data *cdata; 5128 uint32_t ana_log_page_size; 5129 5130 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5131 5132 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5133 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5134 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5135 sizeof(uint32_t); 5136 5137 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5138 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5139 if (nvme_ctrlr->ana_log_page == NULL) { 5140 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5141 return -ENXIO; 5142 } 5143 5144 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5145 * Hence copy each descriptor to a temporary area when parsing it. 5146 * 5147 * Allocate a buffer whose size is as large as ANA log page buffer because 5148 * we do not know the size of a descriptor until actually reading it. 5149 */ 5150 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5151 if (nvme_ctrlr->copied_ana_desc == NULL) { 5152 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5153 return -ENOMEM; 5154 } 5155 5156 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5157 5158 nvme_ctrlr->probe_ctx = ctx; 5159 5160 /* Then, set the read size only to include the current active namespaces. */ 5161 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5162 5163 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5164 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5165 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5166 return -EINVAL; 5167 } 5168 5169 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5170 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5171 SPDK_NVME_GLOBAL_NS_TAG, 5172 nvme_ctrlr->ana_log_page, 5173 ana_log_page_size, 0, 5174 nvme_ctrlr_init_ana_log_page_done, 5175 nvme_ctrlr); 5176 } 5177 5178 /* hostnqn and subnqn were already verified before attaching a controller. 5179 * Hence check only the multipath capability and cntlid here. 
5180 */ 5181 static bool 5182 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5183 { 5184 struct nvme_ctrlr *tmp; 5185 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5186 5187 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5188 5189 if (!cdata->cmic.multi_ctrlr) { 5190 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5191 return false; 5192 } 5193 5194 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5195 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5196 5197 if (!tmp_cdata->cmic.multi_ctrlr) { 5198 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5199 return false; 5200 } 5201 if (cdata->cntlid == tmp_cdata->cntlid) { 5202 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 5203 return false; 5204 } 5205 } 5206 5207 return true; 5208 } 5209 5210 static int 5211 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5212 { 5213 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5214 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5215 int rc = 0; 5216 5217 pthread_mutex_lock(&g_bdev_nvme_mutex); 5218 5219 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5220 if (nbdev_ctrlr != NULL) { 5221 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5222 rc = -EINVAL; 5223 goto exit; 5224 } 5225 } else { 5226 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5227 if (nbdev_ctrlr == NULL) { 5228 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5229 rc = -ENOMEM; 5230 goto exit; 5231 } 5232 nbdev_ctrlr->name = strdup(name); 5233 if (nbdev_ctrlr->name == NULL) { 5234 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5235 free(nbdev_ctrlr); rc = -ENOMEM; 5236 goto exit; 5237 } 5238 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5239 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5240 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5241 } 5242 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5243 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5244 exit: 5245 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5246 return rc; 5247 } 5248 5249 static int 5250 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5251 const char *name, 5252 const struct spdk_nvme_transport_id *trid, 5253 struct nvme_async_probe_ctx *ctx) 5254 { 5255 struct nvme_ctrlr *nvme_ctrlr; 5256 struct nvme_path_id *path_id; 5257 const struct spdk_nvme_ctrlr_data *cdata; 5258 int rc; 5259 5260 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5261 if (nvme_ctrlr == NULL) { 5262 SPDK_ERRLOG("Failed to allocate device struct\n"); 5263 return -ENOMEM; 5264 } 5265 5266 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5267 if (rc != 0) { 5268 free(nvme_ctrlr); 5269 return rc; 5270 } 5271 5272 TAILQ_INIT(&nvme_ctrlr->trids); 5273 5274 RB_INIT(&nvme_ctrlr->namespaces); 5275 5276 path_id = calloc(1, sizeof(*path_id)); 5277 if (path_id == NULL) { 5278 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5279 rc = -ENOMEM; 5280 goto err; 5281 } 5282 5283 path_id->trid = *trid; 5284 if (ctx != NULL) { 5285 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5286 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5287 } 5288 nvme_ctrlr->active_path_id = path_id; 5289 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5290 5291 nvme_ctrlr->thread = spdk_get_thread(); 5292 nvme_ctrlr->ctrlr = ctrlr; 5293 nvme_ctrlr->ref = 1; 5294 5295 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5296 SPDK_ERRLOG("OCSSDs are not supported\n"); 5297 rc = -ENOTSUP; 5298 goto err; 5299 } 5300 5301
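/* The per-controller bdev options come from the async probe context when this create was initiated through bdev_nvme_create(); hotplug attaches pass no context and fall back to the module defaults. */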
if (ctx != NULL) { 5302 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5303 } else { 5304 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5305 } 5306 5307 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5308 g_opts.nvme_adminq_poll_period_us); 5309 5310 if (g_opts.timeout_us > 0) { 5311 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5312 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5313 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5314 g_opts.timeout_us : g_opts.timeout_admin_us; 5315 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5316 adm_timeout_us, timeout_cb, nvme_ctrlr); 5317 } 5318 5319 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5320 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5321 5322 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5323 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5324 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5325 } 5326 5327 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5328 if (rc != 0) { 5329 goto err; 5330 } 5331 5332 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5333 5334 if (cdata->cmic.ana_reporting) { 5335 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5336 if (rc == 0) { 5337 return 0; 5338 } 5339 } else { 5340 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5341 return 0; 5342 } 5343 5344 err: 5345 nvme_ctrlr_delete(nvme_ctrlr); 5346 return rc; 5347 } 5348 5349 void 5350 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5351 { 5352 opts->prchk_flags = 0; 5353 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5354 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5355 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5356 } 5357 5358 static void 5359 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5360 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5361 { 5362 char *name; 5363 5364 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5365 if (!name) { 5366 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5367 return; 5368 } 5369 5370 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5371 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5372 } else { 5373 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5374 } 5375 5376 free(name); 5377 } 5378 5379 static void 5380 _nvme_ctrlr_destruct(void *ctx) 5381 { 5382 struct nvme_ctrlr *nvme_ctrlr = ctx; 5383 5384 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5385 nvme_ctrlr_release(nvme_ctrlr); 5386 } 5387 5388 static int 5389 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5390 { 5391 struct nvme_probe_skip_entry *entry; 5392 5393 /* The controller's destruction was already started */ 5394 if (nvme_ctrlr->destruct) { 5395 return -EALREADY; 5396 } 5397 5398 if (!hotplug && 5399 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5400 entry = calloc(1, sizeof(*entry)); 5401 if (!entry) { 5402 return -ENOMEM; 5403 } 5404 entry->trid = nvme_ctrlr->active_path_id->trid; 5405 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5406 } 5407 5408 nvme_ctrlr->destruct = true; 5409 return 0; 5410 } 5411 5412 static int 5413 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5414 { 5415 int rc; 5416 5417 pthread_mutex_lock(&nvme_ctrlr->mutex); 5418 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5419 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5420 5421 if (rc == 0) { 5422 _nvme_ctrlr_destruct(nvme_ctrlr); 5423 } else if (rc == -EALREADY) { 5424 rc = 0; 5425 } 5426 5427 return rc; 5428 } 5429 5430 static void 5431 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5432 { 5433 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5434 5435 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5436 } 5437 5438 static int 5439 bdev_nvme_hotplug_probe(void *arg) 5440 { 5441 if (g_hotplug_probe_ctx == NULL) { 5442 spdk_poller_unregister(&g_hotplug_probe_poller); 5443 return SPDK_POLLER_IDLE; 5444 } 5445 5446 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5447 g_hotplug_probe_ctx = NULL; 5448 spdk_poller_unregister(&g_hotplug_probe_poller); 5449 } 5450 5451 return SPDK_POLLER_BUSY; 5452 } 5453 5454 static int 5455 bdev_nvme_hotplug(void *arg) 5456 { 5457 struct spdk_nvme_transport_id trid_pcie; 5458 5459 if (g_hotplug_probe_ctx) { 5460 return SPDK_POLLER_BUSY; 5461 } 5462 5463 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5464 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5465 5466 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5467 hotplug_probe_cb, attach_cb, NULL); 5468 5469 if (g_hotplug_probe_ctx) { 5470 assert(g_hotplug_probe_poller == NULL); 5471 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5472 } 5473 5474 return SPDK_POLLER_BUSY; 5475 } 5476 5477 void 5478 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5479 { 5480 *opts = g_opts; 5481 } 5482 5483 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5484 uint32_t reconnect_delay_sec, 5485 uint32_t fast_io_fail_timeout_sec); 5486 5487 static int 5488 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5489 { 5490 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5491 /* Can't set timeout_admin_us without also setting timeout_us */ 5492 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5493 return -EINVAL; 5494 } 5495 5496 if (opts->bdev_retry_count < -1) { 5497 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5498 return -EINVAL; 5499 } 5500 5501 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5502 opts->reconnect_delay_sec, 5503 opts->fast_io_fail_timeout_sec)) { 5504 return -EINVAL; 5505 } 5506 5507 return 0; 5508 } 5509 5510 int 5511 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5512 { 5513 int ret; 5514 5515 ret = bdev_nvme_validate_opts(opts); 5516 if (ret) { 5517 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5518 return ret; 5519 } 5520 5521 if (g_bdev_nvme_init_thread != NULL) { 5522 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5523 return -EPERM; 5524 } 5525 } 5526 5527 if (opts->rdma_srq_size != 0) { 5528 struct spdk_nvme_transport_opts drv_opts; 5529 5530 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5531 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5532 5533 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5534 if (ret) { 5535 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5536 return ret; 5537 } 5538 } 5539 5540 g_opts = *opts; 5541 5542 return 0; 5543 } 5544 5545 struct set_nvme_hotplug_ctx { 5546 uint64_t period_us; 5547 bool enabled; 5548 spdk_msg_fn fn; 5549 void *fn_ctx; 5550 }; 5551 5552 static void 5553 set_nvme_hotplug_period_cb(void *_ctx) 5554 { 5555 struct set_nvme_hotplug_ctx *ctx 
= _ctx; 5556 5557 spdk_poller_unregister(&g_hotplug_poller); 5558 if (ctx->enabled) { 5559 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5560 } 5561 5562 g_nvme_hotplug_poll_period_us = ctx->period_us; 5563 g_nvme_hotplug_enabled = ctx->enabled; 5564 if (ctx->fn) { 5565 ctx->fn(ctx->fn_ctx); 5566 } 5567 5568 free(ctx); 5569 } 5570 5571 int 5572 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5573 { 5574 struct set_nvme_hotplug_ctx *ctx; 5575 5576 if (enabled == true && !spdk_process_is_primary()) { 5577 return -EPERM; 5578 } 5579 5580 ctx = calloc(1, sizeof(*ctx)); 5581 if (ctx == NULL) { 5582 return -ENOMEM; 5583 } 5584 5585 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5586 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5587 ctx->enabled = enabled; 5588 ctx->fn = cb; 5589 ctx->fn_ctx = cb_ctx; 5590 5591 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5592 return 0; 5593 } 5594 5595 static void 5596 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5597 struct nvme_async_probe_ctx *ctx) 5598 { 5599 struct nvme_ns *nvme_ns; 5600 struct nvme_bdev *nvme_bdev; 5601 size_t j; 5602 5603 assert(nvme_ctrlr != NULL); 5604 5605 if (ctx->names == NULL) { 5606 ctx->reported_bdevs = 0; 5607 populate_namespaces_cb(ctx, 0); 5608 return; 5609 } 5610 5611 /* 5612 * Report the new bdevs that were created in this call. 5613 * There can be more than one bdev per NVMe controller. 5614 */ 5615 j = 0; 5616 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5617 while (nvme_ns != NULL) { 5618 nvme_bdev = nvme_ns->bdev; 5619 if (j < ctx->max_bdevs) { 5620 ctx->names[j] = nvme_bdev->disk.name; 5621 j++; 5622 } else { 5623 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n", 5624 ctx->max_bdevs); 5625 ctx->reported_bdevs = 0; 5626 populate_namespaces_cb(ctx, -ERANGE); 5627 return; 5628 } 5629 5630 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5631 } 5632 5633 ctx->reported_bdevs = j; 5634 populate_namespaces_cb(ctx, 0); 5635 } 5636 5637 static int 5638 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5639 struct spdk_nvme_ctrlr *new_ctrlr, 5640 struct spdk_nvme_transport_id *trid) 5641 { 5642 struct nvme_path_id *tmp_trid; 5643 5644 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5645 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5646 return -ENOTSUP; 5647 } 5648 5649 /* Currently we only support failover to the same transport type. */ 5650 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5651 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5652 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5653 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5654 return -EINVAL; 5655 } 5656 5657 5658 /* Currently we only support failover to the same NQN. */ 5659 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5660 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5661 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5662 return -EINVAL; 5663 } 5664 5665 /* Skip all the other checks if we've already registered this path.
*/ 5666 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5667 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5668 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5669 trid->subnqn); 5670 return -EEXIST; 5671 } 5672 } 5673 5674 return 0; 5675 } 5676 5677 static int 5678 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5679 struct spdk_nvme_ctrlr *new_ctrlr) 5680 { 5681 struct nvme_ns *nvme_ns; 5682 struct spdk_nvme_ns *new_ns; 5683 5684 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5685 while (nvme_ns != NULL) { 5686 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5687 assert(new_ns != NULL); 5688 5689 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5690 return -EINVAL; 5691 } 5692 5693 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5694 } 5695 5696 return 0; 5697 } 5698 5699 static int 5700 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5701 struct spdk_nvme_transport_id *trid) 5702 { 5703 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5704 5705 new_trid = calloc(1, sizeof(*new_trid)); 5706 if (new_trid == NULL) { 5707 return -ENOMEM; 5708 } 5709 new_trid->trid = *trid; 5710 5711 active_id = nvme_ctrlr->active_path_id; 5712 assert(active_id != NULL); 5713 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5714 5715 /* Skip the active trid so that it is not replaced until it has failed. */ 5716 tmp_trid = TAILQ_NEXT(active_id, link); 5717 if (tmp_trid == NULL) { 5718 goto add_tail; 5719 } 5720 5721 /* A trid has failed if its last failed time is non-zero. 5722 * Insert the new alternate trid before any failed trid. 5723 */ 5724 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5725 if (tmp_trid->last_failed_tsc != 0) { 5726 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5727 return 0; 5728 } 5729 } 5730 5731 add_tail: 5732 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5733 return 0; 5734 } 5735 5736 /* This is the case where a secondary path is added to an existing 5737 * nvme_ctrlr for failover. After checking if it can access the same 5738 * namespaces as the primary path, it is disconnected until failover occurs.
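* Only the transport ID of the secondary path is stored; the temporary controller handle is always detached again with spdk_nvme_detach() at the end of bdev_nvme_add_secondary_trid(), whether or not these checks succeed.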
5739 */ 5740 static int 5741 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5742 struct spdk_nvme_ctrlr *new_ctrlr, 5743 struct spdk_nvme_transport_id *trid) 5744 { 5745 int rc; 5746 5747 assert(nvme_ctrlr != NULL); 5748 5749 pthread_mutex_lock(&nvme_ctrlr->mutex); 5750 5751 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5752 if (rc != 0) { 5753 goto exit; 5754 } 5755 5756 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5757 if (rc != 0) { 5758 goto exit; 5759 } 5760 5761 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5762 5763 exit: 5764 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5765 5766 spdk_nvme_detach(new_ctrlr); 5767 5768 return rc; 5769 } 5770 5771 static void 5772 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5773 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5774 { 5775 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5776 struct nvme_async_probe_ctx *ctx; 5777 int rc; 5778 5779 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5780 ctx->ctrlr_attached = true; 5781 5782 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5783 if (rc != 0) { 5784 ctx->reported_bdevs = 0; 5785 populate_namespaces_cb(ctx, rc); 5786 } 5787 } 5788 5789 static void 5790 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5791 struct spdk_nvme_ctrlr *ctrlr, 5792 const struct spdk_nvme_ctrlr_opts *opts) 5793 { 5794 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5795 struct nvme_ctrlr *nvme_ctrlr; 5796 struct nvme_async_probe_ctx *ctx; 5797 int rc; 5798 5799 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5800 ctx->ctrlr_attached = true; 5801 5802 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5803 if (nvme_ctrlr) { 5804 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5805 } else { 5806 rc = -ENODEV; 5807 } 5808 5809 ctx->reported_bdevs = 0; 5810 populate_namespaces_cb(ctx, rc); 5811 } 5812 5813 static int 5814 bdev_nvme_async_poll(void *arg) 5815 { 5816 struct nvme_async_probe_ctx *ctx = arg; 5817 int rc; 5818 5819 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5820 if (spdk_unlikely(rc != -EAGAIN)) { 5821 ctx->probe_done = true; 5822 spdk_poller_unregister(&ctx->poller); 5823 if (!ctx->ctrlr_attached) { 5824 /* The probe is done, but no controller was attached. 5825 * That means we had a failure, so report -EIO back to 5826 * the caller (usually the RPC). populate_namespaces_cb() 5827 * will take care of freeing the nvme_async_probe_ctx. 5828 */ 5829 ctx->reported_bdevs = 0; 5830 populate_namespaces_cb(ctx, -EIO); 5831 } else if (ctx->namespaces_populated) { 5832 /* The namespaces for the attached controller were all 5833 * populated and the response was already sent to the 5834 * caller (usually the RPC). So free the context here. 
5835 */ 5836 free(ctx); 5837 } 5838 } 5839 5840 return SPDK_POLLER_BUSY; 5841 } 5842 5843 static bool 5844 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5845 uint32_t reconnect_delay_sec, 5846 uint32_t fast_io_fail_timeout_sec) 5847 { 5848 if (ctrlr_loss_timeout_sec < -1) { 5849 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5850 return false; 5851 } else if (ctrlr_loss_timeout_sec == -1) { 5852 if (reconnect_delay_sec == 0) { 5853 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5854 return false; 5855 } else if (fast_io_fail_timeout_sec != 0 && 5856 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5857 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5858 return false; 5859 } 5860 } else if (ctrlr_loss_timeout_sec != 0) { 5861 if (reconnect_delay_sec == 0) { 5862 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5863 return false; 5864 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5865 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5866 return false; 5867 } else if (fast_io_fail_timeout_sec != 0) { 5868 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5869 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5870 return false; 5871 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5872 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5873 return false; 5874 } 5875 } 5876 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5877 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5878 return false; 5879 } 5880 5881 return true; 5882 } 5883 5884 int 5885 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5886 const char *base_name, 5887 const char **names, 5888 uint32_t count, 5889 spdk_bdev_create_nvme_fn cb_fn, 5890 void *cb_ctx, 5891 struct spdk_nvme_ctrlr_opts *drv_opts, 5892 struct nvme_ctrlr_opts *bdev_opts, 5893 bool multipath) 5894 { 5895 struct nvme_probe_skip_entry *entry, *tmp; 5896 struct nvme_async_probe_ctx *ctx; 5897 spdk_nvme_attach_cb attach_cb; 5898 5899 /* TODO expand this check to include both the host and target TRIDs. 5900 * Only if both are the same should we fail.
5901 */ 5902 if (nvme_ctrlr_get(trid) != NULL) { 5903 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5904 return -EEXIST; 5905 } 5906 5907 if (bdev_opts != NULL && 5908 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5909 bdev_opts->reconnect_delay_sec, 5910 bdev_opts->fast_io_fail_timeout_sec)) { 5911 return -EINVAL; 5912 } 5913 5914 ctx = calloc(1, sizeof(*ctx)); 5915 if (!ctx) { 5916 return -ENOMEM; 5917 } 5918 ctx->base_name = base_name; 5919 ctx->names = names; 5920 ctx->max_bdevs = count; 5921 ctx->cb_fn = cb_fn; 5922 ctx->cb_ctx = cb_ctx; 5923 ctx->trid = *trid; 5924 5925 if (bdev_opts) { 5926 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5927 } else { 5928 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5929 } 5930 5931 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5932 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5933 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5934 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5935 free(entry); 5936 break; 5937 } 5938 } 5939 } 5940 5941 if (drv_opts) { 5942 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5943 } else { 5944 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5945 } 5946 5947 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5948 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5949 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5950 ctx->drv_opts.disable_read_ana_log_page = true; 5951 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5952 5953 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5954 attach_cb = connect_attach_cb; 5955 } else { 5956 attach_cb = connect_set_failover_cb; 5957 } 5958 5959 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5960 if (ctx->probe_ctx == NULL) { 5961 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5962 free(ctx); 5963 return -ENODEV; 5964 } 5965 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5966 5967 return 0; 5968 } 5969 5970 static bool 5971 nvme_path_should_delete(struct nvme_path_id *p, const struct nvme_path_id *path_id) 5972 { 5973 if (path_id->trid.trtype != 0) { 5974 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5975 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5976 return false; 5977 } 5978 } else { 5979 if (path_id->trid.trtype != p->trid.trtype) { 5980 return false; 5981 } 5982 } 5983 } 5984 5985 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5986 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5987 return false; 5988 } 5989 } 5990 5991 if (path_id->trid.adrfam != 0) { 5992 if (path_id->trid.adrfam != p->trid.adrfam) { 5993 return false; 5994 } 5995 } 5996 5997 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 5998 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 5999 return false; 6000 } 6001 } 6002 6003 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6004 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6005 return false; 6006 } 6007 } 6008 6009 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6010 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6011 return false; 6012 } 6013 } 6014 6015 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, 
sizeof(path_id->hostid.hostsvcid))) { 6016 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6017 return false; 6018 } 6019 } 6020 6021 return true; 6022 } 6023 6024 static int 6025 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6026 { 6027 struct nvme_path_id *p, *t; 6028 spdk_msg_fn msg_fn; 6029 int rc = -ENXIO; 6030 6031 pthread_mutex_lock(&nvme_ctrlr->mutex); 6032 6033 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6034 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6035 break; 6036 } 6037 6038 if (!nvme_path_should_delete(p, path_id)) { 6039 continue; 6040 } 6041 6042 /* We are not using the specified path. */ 6043 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6044 free(p); 6045 rc = 0; 6046 } 6047 6048 if (p == NULL || !nvme_path_should_delete(p, path_id)) { 6049 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6050 return rc; 6051 } 6052 6053 /* If we made it here, then this path is a match! Now we need to remove it. */ 6054 6055 /* This is the active path in use right now. The active path is always the first in the list. */ 6056 assert(p == nvme_ctrlr->active_path_id); 6057 6058 if (!TAILQ_NEXT(p, link)) { 6059 /* The current path is the only path. */ 6060 msg_fn = _nvme_ctrlr_destruct; 6061 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6062 } else { 6063 /* There is an alternative path. */ 6064 msg_fn = _bdev_nvme_reset_ctrlr; 6065 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6066 } 6067 6068 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6069 6070 if (rc == 0) { 6071 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6072 } else if (rc == -EALREADY) { 6073 rc = 0; 6074 } 6075 6076 return rc; 6077 } 6078 6079 int 6080 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 6081 { 6082 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6083 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6084 int rc = -ENXIO, _rc; 6085 6086 if (name == NULL || path_id == NULL) { 6087 return -EINVAL; 6088 } 6089 6090 pthread_mutex_lock(&g_bdev_nvme_mutex); 6091 6092 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6093 if (nbdev_ctrlr == NULL) { 6094 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6095 6096 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6097 return -ENODEV; 6098 } 6099 6100 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6101 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6102 if (_rc < 0 && _rc != -ENXIO) { 6103 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6104 6105 return _rc; 6106 } else if (_rc == 0) { 6107 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6108 * was deleted successfully. To remember the successful deletion, 6109 * overwrite rc only if _rc is zero. 6110 */ 6111 rc = 0; 6112 } 6113 } 6114 6115 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6116 6117 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 6118 return rc; 6119 } 6120 6121 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6122 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6123 6124 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6125 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6126 6127 struct discovery_entry_ctx { 6128 char name[128]; 6129 struct spdk_nvme_transport_id trid; 6130 struct spdk_nvme_ctrlr_opts drv_opts; 6131 struct spdk_nvmf_discovery_log_page_entry entry; 6132 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6133 struct discovery_ctx *ctx; 6134 }; 6135 6136 struct discovery_ctx { 6137 char *name; 6138 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6139 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6140 void *cb_ctx; 6141 struct spdk_nvme_probe_ctx *probe_ctx; 6142 struct spdk_nvme_detach_ctx *detach_ctx; 6143 struct spdk_nvme_ctrlr *ctrlr; 6144 struct spdk_nvme_transport_id trid; 6145 struct discovery_entry_ctx *entry_ctx_in_use; 6146 struct spdk_poller *poller; 6147 struct spdk_nvme_ctrlr_opts drv_opts; 6148 struct nvme_ctrlr_opts bdev_opts; 6149 struct spdk_nvmf_discovery_log_page *log_page; 6150 TAILQ_ENTRY(discovery_ctx) tailq; 6151 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6152 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6153 int rc; 6154 bool wait_for_attach; 6155 uint64_t timeout_ticks; 6156 /* Denotes that the discovery service is being started. We're waiting 6157 * for the initial connection to the discovery controller to be 6158 * established and attach discovered NVM ctrlrs. 6159 */ 6160 bool initializing; 6161 /* Denotes if a discovery is currently in progress for this context. 6162 * That includes connecting to newly discovered subsystems. Used to 6163 * ensure we do not start a new discovery until an existing one is 6164 * complete. 6165 */ 6166 bool in_progress; 6167 6168 /* Denotes if another discovery is needed after the one in progress 6169 * completes. Set when we receive an AER completion while a discovery 6170 * is already in progress. 6171 */ 6172 bool pending; 6173 6174 /* Signal to the discovery context poller that it should stop the 6175 * discovery service, including detaching from the current discovery 6176 * controller. 6177 */ 6178 bool stop; 6179 6180 struct spdk_thread *calling_thread; 6181 uint32_t index; 6182 uint32_t attach_in_progress; 6183 char *hostnqn; 6184 6185 /* Denotes if the discovery service was started by the mdns discovery. 
6186 */ 6187 bool from_mdns_discovery_service; 6188 }; 6189 6190 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6191 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6192 6193 static void get_discovery_log_page(struct discovery_ctx *ctx); 6194 6195 static void 6196 free_discovery_ctx(struct discovery_ctx *ctx) 6197 { 6198 free(ctx->log_page); 6199 free(ctx->hostnqn); 6200 free(ctx->name); 6201 free(ctx); 6202 } 6203 6204 static void 6205 discovery_complete(struct discovery_ctx *ctx) 6206 { 6207 ctx->initializing = false; 6208 ctx->in_progress = false; 6209 if (ctx->pending) { 6210 ctx->pending = false; 6211 get_discovery_log_page(ctx); 6212 } 6213 } 6214 6215 static void 6216 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6217 struct spdk_nvmf_discovery_log_page_entry *entry) 6218 { 6219 char *space; 6220 6221 trid->trtype = entry->trtype; 6222 trid->adrfam = entry->adrfam; 6223 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6224 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6225 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6226 * before call to this function trid->subnqn is zeroed out, we need 6227 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6228 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6229 */ 6230 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6231 6232 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6233 * But the log page entries typically pad them with spaces, not zeroes. 6234 * So add a NULL terminator to each of these fields at the appropriate 6235 * location. 6236 */ 6237 space = strchr(trid->traddr, ' '); 6238 if (space) { 6239 *space = 0; 6240 } 6241 space = strchr(trid->trsvcid, ' '); 6242 if (space) { 6243 *space = 0; 6244 } 6245 space = strchr(trid->subnqn, ' '); 6246 if (space) { 6247 *space = 0; 6248 } 6249 } 6250 6251 static void 6252 _stop_discovery(void *_ctx) 6253 { 6254 struct discovery_ctx *ctx = _ctx; 6255 6256 if (ctx->attach_in_progress > 0) { 6257 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6258 return; 6259 } 6260 6261 ctx->stop = true; 6262 6263 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6264 struct discovery_entry_ctx *entry_ctx; 6265 struct nvme_path_id path = {}; 6266 6267 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6268 path.trid = entry_ctx->trid; 6269 bdev_nvme_delete(entry_ctx->name, &path); 6270 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6271 free(entry_ctx); 6272 } 6273 6274 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6275 struct discovery_entry_ctx *entry_ctx; 6276 6277 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6278 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6279 free(entry_ctx); 6280 } 6281 6282 free(ctx->entry_ctx_in_use); 6283 ctx->entry_ctx_in_use = NULL; 6284 } 6285 6286 static void 6287 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6288 { 6289 ctx->stop_cb_fn = cb_fn; 6290 ctx->cb_ctx = cb_ctx; 6291 6292 if (ctx->attach_in_progress > 0) { 6293 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6294 ctx->attach_in_progress); 6295 } 6296 6297 _stop_discovery(ctx); 6298 } 6299 6300 static void 6301 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6302 { 6303 struct discovery_ctx *d_ctx; 6304 struct nvme_path_id *path_id; 6305 struct spdk_nvme_transport_id trid = {}; 
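/* Scan every active discovery context for an NVM entry whose trid matches
 * this ctrlr's primary (first) path. A matching entry is removed below and
 * the owning discovery ctrlr is failed to force a reattach attempt.
 */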
6306 struct discovery_entry_ctx *entry_ctx, *tmp; 6307 6308 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6309 6310 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6311 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6312 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6313 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6314 continue; 6315 } 6316 6317 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6318 free(entry_ctx); 6319 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6320 trid.subnqn, trid.traddr, trid.trsvcid); 6321 6322 /* Fail discovery ctrlr to force reattach attempt */ 6323 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6324 } 6325 } 6326 } 6327 6328 static void 6329 discovery_remove_controllers(struct discovery_ctx *ctx) 6330 { 6331 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6332 struct discovery_entry_ctx *entry_ctx, *tmp; 6333 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6334 struct spdk_nvme_transport_id old_trid = {}; 6335 uint64_t numrec, i; 6336 bool found; 6337 6338 numrec = from_le64(&log_page->numrec); 6339 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6340 found = false; 6341 old_entry = &entry_ctx->entry; 6342 build_trid_from_log_page_entry(&old_trid, old_entry); 6343 for (i = 0; i < numrec; i++) { 6344 new_entry = &log_page->entries[i]; 6345 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6346 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6347 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6348 found = true; 6349 break; 6350 } 6351 } 6352 if (!found) { 6353 struct nvme_path_id path = {}; 6354 6355 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6356 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6357 6358 path.trid = entry_ctx->trid; 6359 bdev_nvme_delete(entry_ctx->name, &path); 6360 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6361 free(entry_ctx); 6362 } 6363 } 6364 free(log_page); 6365 ctx->log_page = NULL; 6366 discovery_complete(ctx); 6367 } 6368 6369 static void 6370 complete_discovery_start(struct discovery_ctx *ctx, int status) 6371 { 6372 ctx->timeout_ticks = 0; 6373 ctx->rc = status; 6374 if (ctx->start_cb_fn) { 6375 ctx->start_cb_fn(ctx->cb_ctx, status); 6376 ctx->start_cb_fn = NULL; 6377 ctx->cb_ctx = NULL; 6378 } 6379 } 6380 6381 static void 6382 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6383 { 6384 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6385 struct discovery_ctx *ctx = entry_ctx->ctx; 6386 6387 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6388 ctx->attach_in_progress--; 6389 if (ctx->attach_in_progress == 0) { 6390 complete_discovery_start(ctx, ctx->rc); 6391 if (ctx->initializing && ctx->rc != 0) { 6392 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6393 stop_discovery(ctx, NULL, ctx->cb_ctx); 6394 } else { 6395 discovery_remove_controllers(ctx); 6396 } 6397 } 6398 } 6399 6400 static struct discovery_entry_ctx * 6401 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6402 { 6403 struct discovery_entry_ctx *new_ctx; 6404 6405 new_ctx = calloc(1, sizeof(*new_ctx)); 6406 if (new_ctx == NULL) { 6407 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6408 return NULL; 6409 } 6410 6411 new_ctx->ctx = ctx; 6412 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6413 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6414 
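/* Copy the discovery context's hostnqn into the new entry's ctrlr opts so
 * that connections made through this entry use the same host identity.
 */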
snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6415 return new_ctx; 6416 } 6417 6418 static void 6419 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6420 struct spdk_nvmf_discovery_log_page *log_page) 6421 { 6422 struct discovery_ctx *ctx = cb_arg; 6423 struct discovery_entry_ctx *entry_ctx, *tmp; 6424 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6425 uint64_t numrec, i; 6426 bool found; 6427 6428 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6429 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6430 return; 6431 } 6432 6433 ctx->log_page = log_page; 6434 assert(ctx->attach_in_progress == 0); 6435 numrec = from_le64(&log_page->numrec); 6436 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6437 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6438 free(entry_ctx); 6439 } 6440 for (i = 0; i < numrec; i++) { 6441 found = false; 6442 new_entry = &log_page->entries[i]; 6443 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6444 struct discovery_entry_ctx *new_ctx; 6445 struct spdk_nvme_transport_id trid = {}; 6446 6447 build_trid_from_log_page_entry(&trid, new_entry); 6448 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6449 if (new_ctx == NULL) { 6450 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6451 break; 6452 } 6453 6454 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6455 continue; 6456 } 6457 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6458 old_entry = &entry_ctx->entry; 6459 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6460 found = true; 6461 break; 6462 } 6463 } 6464 if (!found) { 6465 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6466 struct discovery_ctx *d_ctx; 6467 6468 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6469 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6470 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6471 sizeof(new_entry->subnqn))) { 6472 break; 6473 } 6474 } 6475 if (subnqn_ctx) { 6476 break; 6477 } 6478 } 6479 6480 new_ctx = calloc(1, sizeof(*new_ctx)); 6481 if (new_ctx == NULL) { 6482 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6483 break; 6484 } 6485 6486 new_ctx->ctx = ctx; 6487 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6488 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6489 if (subnqn_ctx) { 6490 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6491 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6492 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6493 new_ctx->name); 6494 } else { 6495 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6496 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6497 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6498 new_ctx->name); 6499 } 6500 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6501 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6502 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6503 discovery_attach_controller_done, new_ctx, 6504 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6505 if (rc == 0) { 6506 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6507 ctx->attach_in_progress++; 6508 } else { 6509 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 6510 } 6511 } 6512 } 6513 6514 if 
(ctx->attach_in_progress == 0) { 6515 discovery_remove_controllers(ctx); 6516 } 6517 } 6518 6519 static void 6520 get_discovery_log_page(struct discovery_ctx *ctx) 6521 { 6522 int rc; 6523 6524 assert(ctx->in_progress == false); 6525 ctx->in_progress = true; 6526 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6527 if (rc != 0) { 6528 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6529 } 6530 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6531 } 6532 6533 static void 6534 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6535 { 6536 struct discovery_ctx *ctx = arg; 6537 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6538 6539 if (spdk_nvme_cpl_is_error(cpl)) { 6540 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6541 return; 6542 } 6543 6544 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6545 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6546 return; 6547 } 6548 6549 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6550 if (ctx->in_progress) { 6551 ctx->pending = true; 6552 return; 6553 } 6554 6555 get_discovery_log_page(ctx); 6556 } 6557 6558 static void 6559 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6560 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6561 { 6562 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6563 struct discovery_ctx *ctx; 6564 6565 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6566 6567 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6568 ctx->probe_ctx = NULL; 6569 ctx->ctrlr = ctrlr; 6570 6571 if (ctx->rc != 0) { 6572 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6573 ctx->rc); 6574 return; 6575 } 6576 6577 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6578 } 6579 6580 static int 6581 discovery_poller(void *arg) 6582 { 6583 struct discovery_ctx *ctx = arg; 6584 struct spdk_nvme_transport_id *trid; 6585 int rc; 6586 6587 if (ctx->detach_ctx) { 6588 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6589 if (rc != -EAGAIN) { 6590 ctx->detach_ctx = NULL; 6591 ctx->ctrlr = NULL; 6592 } 6593 } else if (ctx->stop) { 6594 if (ctx->ctrlr != NULL) { 6595 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6596 if (rc == 0) { 6597 return SPDK_POLLER_BUSY; 6598 } 6599 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6600 } 6601 spdk_poller_unregister(&ctx->poller); 6602 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6603 assert(ctx->start_cb_fn == NULL); 6604 if (ctx->stop_cb_fn != NULL) { 6605 ctx->stop_cb_fn(ctx->cb_ctx); 6606 } 6607 free_discovery_ctx(ctx); 6608 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6609 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6610 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6611 assert(ctx->initializing); 6612 spdk_poller_unregister(&ctx->poller); 6613 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6614 complete_discovery_start(ctx, -ETIMEDOUT); 6615 stop_discovery(ctx, NULL, NULL); 6616 free_discovery_ctx(ctx); 6617 return SPDK_POLLER_BUSY; 6618 } 6619 6620 assert(ctx->entry_ctx_in_use == NULL); 6621 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6622 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6623 trid = &ctx->entry_ctx_in_use->trid; 6624 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6625 if (ctx->probe_ctx) { 6626 spdk_poller_unregister(&ctx->poller); 
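/* The async connect to the next discovery entry has started; re-register
 * this poller with a 1 ms period so spdk_nvme_probe_poll_async() is driven
 * frequently while the discovery ctrlr attaches.
 */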
6627 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6628 } else { 6629 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6630 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6631 ctx->entry_ctx_in_use = NULL; 6632 } 6633 } else if (ctx->probe_ctx) { 6634 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6635 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6636 complete_discovery_start(ctx, -ETIMEDOUT); 6637 return SPDK_POLLER_BUSY; 6638 } 6639 6640 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6641 if (rc != -EAGAIN) { 6642 if (ctx->rc != 0) { 6643 assert(ctx->initializing); 6644 stop_discovery(ctx, NULL, ctx->cb_ctx); 6645 } else { 6646 assert(rc == 0); 6647 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6648 ctx->rc = rc; 6649 get_discovery_log_page(ctx); 6650 } 6651 } 6652 } else { 6653 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6654 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6655 complete_discovery_start(ctx, -ETIMEDOUT); 6656 /* We need to wait until all NVM ctrlrs are attached before we stop the 6657 * discovery service to make sure we don't detach a ctrlr that is still 6658 * being attached. 6659 */ 6660 if (ctx->attach_in_progress == 0) { 6661 stop_discovery(ctx, NULL, ctx->cb_ctx); 6662 return SPDK_POLLER_BUSY; 6663 } 6664 } 6665 6666 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6667 if (rc < 0) { 6668 spdk_poller_unregister(&ctx->poller); 6669 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6670 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6671 ctx->entry_ctx_in_use = NULL; 6672 6673 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6674 if (rc != 0) { 6675 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6676 ctx->ctrlr = NULL; 6677 } 6678 } 6679 } 6680 6681 return SPDK_POLLER_BUSY; 6682 } 6683 6684 static void 6685 start_discovery_poller(void *arg) 6686 { 6687 struct discovery_ctx *ctx = arg; 6688 6689 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6690 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6691 } 6692 6693 int 6694 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6695 const char *base_name, 6696 struct spdk_nvme_ctrlr_opts *drv_opts, 6697 struct nvme_ctrlr_opts *bdev_opts, 6698 uint64_t attach_timeout, 6699 bool from_mdns, 6700 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6701 { 6702 struct discovery_ctx *ctx; 6703 struct discovery_entry_ctx *discovery_entry_ctx; 6704 6705 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6706 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6707 if (strcmp(ctx->name, base_name) == 0) { 6708 return -EEXIST; 6709 } 6710 6711 if (ctx->entry_ctx_in_use != NULL) { 6712 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6713 return -EEXIST; 6714 } 6715 } 6716 6717 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6718 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6719 return -EEXIST; 6720 } 6721 } 6722 } 6723 6724 ctx = calloc(1, sizeof(*ctx)); 6725 if (ctx == NULL) { 6726 return -ENOMEM; 6727 } 6728 6729 ctx->name = strdup(base_name); 6730 if (ctx->name == NULL) { 6731 free_discovery_ctx(ctx); 6732 return -ENOMEM; 6733 } 6734 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6735 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 
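/* Record how this discovery context was created and which thread and
 * callback to notify once the initial attach completes.
 */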
6736 ctx->from_mdns_discovery_service = from_mdns; 6737 ctx->bdev_opts.from_discovery_service = true; 6738 ctx->calling_thread = spdk_get_thread(); 6739 ctx->start_cb_fn = cb_fn; 6740 ctx->cb_ctx = cb_ctx; 6741 ctx->initializing = true; 6742 if (ctx->start_cb_fn) { 6743 /* We can use this when dumping json to denote if this RPC parameter 6744 * was specified or not. 6745 */ 6746 ctx->wait_for_attach = true; 6747 } 6748 if (attach_timeout != 0) { 6749 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6750 spdk_get_ticks_hz() / 1000ull; 6751 } 6752 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6753 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6754 memcpy(&ctx->trid, trid, sizeof(*trid)); 6755 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6756 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6757 if (ctx->hostnqn == NULL) { 6758 free_discovery_ctx(ctx); 6759 return -ENOMEM; 6760 } 6761 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6762 if (discovery_entry_ctx == NULL) { 6763 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6764 free_discovery_ctx(ctx); 6765 return -ENOMEM; 6766 } 6767 6768 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6769 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6770 return 0; 6771 } 6772 6773 int 6774 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6775 { 6776 struct discovery_ctx *ctx; 6777 6778 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6779 if (strcmp(name, ctx->name) == 0) { 6780 if (ctx->stop) { 6781 return -EALREADY; 6782 } 6783 /* If we're still starting the discovery service and ->rc is non-zero, we're 6784 * going to stop it as soon as we can 6785 */ 6786 if (ctx->initializing && ctx->rc != 0) { 6787 return -EALREADY; 6788 } 6789 stop_discovery(ctx, cb_fn, cb_ctx); 6790 return 0; 6791 } 6792 } 6793 6794 return -ENOENT; 6795 } 6796 6797 static int 6798 bdev_nvme_library_init(void) 6799 { 6800 g_bdev_nvme_init_thread = spdk_get_thread(); 6801 6802 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6803 bdev_nvme_destroy_poll_group_cb, 6804 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6805 6806 return 0; 6807 } 6808 6809 static void 6810 bdev_nvme_fini_destruct_ctrlrs(void) 6811 { 6812 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6813 struct nvme_ctrlr *nvme_ctrlr; 6814 6815 pthread_mutex_lock(&g_bdev_nvme_mutex); 6816 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6817 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6818 pthread_mutex_lock(&nvme_ctrlr->mutex); 6819 if (nvme_ctrlr->destruct) { 6820 /* This controller's destruction was already started 6821 * before the application started shutting down 6822 */ 6823 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6824 continue; 6825 } 6826 nvme_ctrlr->destruct = true; 6827 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6828 6829 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6830 nvme_ctrlr); 6831 } 6832 } 6833 6834 g_bdev_nvme_module_finish = true; 6835 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6836 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6837 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6838 spdk_bdev_module_fini_done(); 6839 return; 6840 } 6841 6842 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6843 } 6844 6845 static void 6846 check_discovery_fini(void *arg) 6847 { 6848 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6849 bdev_nvme_fini_destruct_ctrlrs(); 6850 } 6851 } 6852 6853 static void 6854 
bdev_nvme_library_fini(void) 6855 { 6856 struct nvme_probe_skip_entry *entry, *entry_tmp; 6857 struct discovery_ctx *ctx; 6858 6859 spdk_poller_unregister(&g_hotplug_poller); 6860 free(g_hotplug_probe_ctx); 6861 g_hotplug_probe_ctx = NULL; 6862 6863 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6864 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6865 free(entry); 6866 } 6867 6868 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6869 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6870 bdev_nvme_fini_destruct_ctrlrs(); 6871 } else { 6872 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6873 stop_discovery(ctx, check_discovery_fini, NULL); 6874 } 6875 } 6876 } 6877 6878 static void 6879 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6880 { 6881 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6882 struct spdk_bdev *bdev = bdev_io->bdev; 6883 struct spdk_dif_ctx dif_ctx; 6884 struct spdk_dif_error err_blk = {}; 6885 int rc; 6886 struct spdk_dif_ctx_init_ext_opts dif_opts; 6887 6888 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 6889 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 6890 rc = spdk_dif_ctx_init(&dif_ctx, 6891 bdev->blocklen, bdev->md_len, bdev->md_interleave, 6892 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 6893 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 6894 if (rc != 0) { 6895 SPDK_ERRLOG("Initialization of DIF context failed\n"); 6896 return; 6897 } 6898 6899 if (bdev->md_interleave) { 6900 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6901 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6902 } else { 6903 struct iovec md_iov = { 6904 .iov_base = bdev_io->u.bdev.md_buf, 6905 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 6906 }; 6907 6908 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6909 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6910 } 6911 6912 if (rc != 0) { 6913 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 6914 err_blk.err_type, err_blk.err_offset); 6915 } else { 6916 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 6917 } 6918 } 6919 6920 static void 6921 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6922 { 6923 struct nvme_bdev_io *bio = ref; 6924 6925 if (spdk_nvme_cpl_is_success(cpl)) { 6926 /* Run PI verification for read data buffer. */ 6927 bdev_nvme_verify_pi_error(bio); 6928 } 6929 6930 /* Return original completion status */ 6931 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6932 } 6933 6934 static void 6935 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6936 { 6937 struct nvme_bdev_io *bio = ref; 6938 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6939 int ret; 6940 6941 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 6942 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 6943 cpl->status.sct, cpl->status.sc); 6944 6945 /* Save completion status to use after verifying PI error. */ 6946 bio->cpl = *cpl; 6947 6948 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 6949 /* Read without PI checking to verify PI error. 
*/ 6950 ret = bdev_nvme_no_pi_readv(bio, 6951 bdev_io->u.bdev.iovs, 6952 bdev_io->u.bdev.iovcnt, 6953 bdev_io->u.bdev.md_buf, 6954 bdev_io->u.bdev.num_blocks, 6955 bdev_io->u.bdev.offset_blocks); 6956 if (ret == 0) { 6957 return; 6958 } 6959 } 6960 } 6961 6962 bdev_nvme_io_complete_nvme_status(bio, cpl); 6963 } 6964 6965 static void 6966 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6967 { 6968 struct nvme_bdev_io *bio = ref; 6969 6970 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6971 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 6972 cpl->status.sct, cpl->status.sc); 6973 /* Run PI verification for write data buffer if PI error is detected. */ 6974 bdev_nvme_verify_pi_error(bio); 6975 } 6976 6977 bdev_nvme_io_complete_nvme_status(bio, cpl); 6978 } 6979 6980 static void 6981 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6982 { 6983 struct nvme_bdev_io *bio = ref; 6984 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6985 6986 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 6987 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 6988 */ 6989 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 6990 6991 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6992 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 6993 cpl->status.sct, cpl->status.sc); 6994 /* Run PI verification for zone append data buffer if PI error is detected. */ 6995 bdev_nvme_verify_pi_error(bio); 6996 } 6997 6998 bdev_nvme_io_complete_nvme_status(bio, cpl); 6999 } 7000 7001 static void 7002 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7003 { 7004 struct nvme_bdev_io *bio = ref; 7005 7006 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7007 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7008 cpl->status.sct, cpl->status.sc); 7009 /* Run PI verification for compare data buffer if PI error is detected. */ 7010 bdev_nvme_verify_pi_error(bio); 7011 } 7012 7013 bdev_nvme_io_complete_nvme_status(bio, cpl); 7014 } 7015 7016 static void 7017 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7018 { 7019 struct nvme_bdev_io *bio = ref; 7020 7021 /* Compare operation completion */ 7022 if (!bio->first_fused_completed) { 7023 /* Save compare result for write callback */ 7024 bio->cpl = *cpl; 7025 bio->first_fused_completed = true; 7026 return; 7027 } 7028 7029 /* Write operation completion */ 7030 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7031 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7032 * complete the IO with the compare operation's status. 
7033 */ 7034 if (!spdk_nvme_cpl_is_error(cpl)) { 7035 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7036 } 7037 7038 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7039 } else { 7040 bdev_nvme_io_complete_nvme_status(bio, cpl); 7041 } 7042 } 7043 7044 static void 7045 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7046 { 7047 struct nvme_bdev_io *bio = ref; 7048 7049 bdev_nvme_io_complete_nvme_status(bio, cpl); 7050 } 7051 7052 static int 7053 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7054 { 7055 switch (desc->zt) { 7056 case SPDK_NVME_ZONE_TYPE_SEQWR: 7057 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7058 break; 7059 default: 7060 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7061 return -EIO; 7062 } 7063 7064 switch (desc->zs) { 7065 case SPDK_NVME_ZONE_STATE_EMPTY: 7066 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7067 break; 7068 case SPDK_NVME_ZONE_STATE_IOPEN: 7069 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7070 break; 7071 case SPDK_NVME_ZONE_STATE_EOPEN: 7072 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7073 break; 7074 case SPDK_NVME_ZONE_STATE_CLOSED: 7075 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7076 break; 7077 case SPDK_NVME_ZONE_STATE_RONLY: 7078 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7079 break; 7080 case SPDK_NVME_ZONE_STATE_FULL: 7081 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7082 break; 7083 case SPDK_NVME_ZONE_STATE_OFFLINE: 7084 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7085 break; 7086 default: 7087 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7088 return -EIO; 7089 } 7090 7091 info->zone_id = desc->zslba; 7092 info->write_pointer = desc->wp; 7093 info->capacity = desc->zcap; 7094 7095 return 0; 7096 } 7097 7098 static void 7099 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7100 { 7101 struct nvme_bdev_io *bio = ref; 7102 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7103 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7104 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7105 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7106 uint64_t max_zones_per_buf, i; 7107 uint32_t zone_report_bufsize; 7108 struct spdk_nvme_ns *ns; 7109 struct spdk_nvme_qpair *qpair; 7110 int ret; 7111 7112 if (spdk_nvme_cpl_is_error(cpl)) { 7113 goto out_complete_io_nvme_cpl; 7114 } 7115 7116 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7117 ret = -ENXIO; 7118 goto out_complete_io_ret; 7119 } 7120 7121 ns = bio->io_path->nvme_ns->ns; 7122 qpair = bio->io_path->qpair->qpair; 7123 7124 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7125 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7126 sizeof(bio->zone_report_buf->descs[0]); 7127 7128 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7129 ret = -EINVAL; 7130 goto out_complete_io_ret; 7131 } 7132 7133 if (!bio->zone_report_buf->nr_zones) { 7134 ret = -EINVAL; 7135 goto out_complete_io_ret; 7136 } 7137 7138 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7139 ret = fill_zone_from_report(&info[bio->handled_zones], 7140 &bio->zone_report_buf->descs[i]); 7141 if (ret) { 7142 goto out_complete_io_ret; 7143 } 7144 bio->handled_zones++; 7145 } 7146 7147 if (bio->handled_zones < zones_to_copy) { 7148 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7149 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7150 
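/* Not all requested zones fit in a single report: clear the buffer and issue
 * another zone report starting at the first zone that has not been copied yet.
 */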
7151 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7152 ret = spdk_nvme_zns_report_zones(ns, qpair, 7153 bio->zone_report_buf, zone_report_bufsize, 7154 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7155 bdev_nvme_get_zone_info_done, bio); 7156 if (!ret) { 7157 return; 7158 } else { 7159 goto out_complete_io_ret; 7160 } 7161 } 7162 7163 out_complete_io_nvme_cpl: 7164 free(bio->zone_report_buf); 7165 bio->zone_report_buf = NULL; 7166 bdev_nvme_io_complete_nvme_status(bio, cpl); 7167 return; 7168 7169 out_complete_io_ret: 7170 free(bio->zone_report_buf); 7171 bio->zone_report_buf = NULL; 7172 bdev_nvme_io_complete(bio, ret); 7173 } 7174 7175 static void 7176 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7177 { 7178 struct nvme_bdev_io *bio = ref; 7179 7180 bdev_nvme_io_complete_nvme_status(bio, cpl); 7181 } 7182 7183 static void 7184 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7185 { 7186 struct nvme_bdev_io *bio = ctx; 7187 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7188 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7189 7190 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7191 7192 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7193 } 7194 7195 static void 7196 bdev_nvme_abort_complete(void *ctx) 7197 { 7198 struct nvme_bdev_io *bio = ctx; 7199 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7200 7201 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7202 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7203 } else { 7204 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7205 } 7206 } 7207 7208 static void 7209 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7210 { 7211 struct nvme_bdev_io *bio = ref; 7212 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7213 7214 bio->cpl = *cpl; 7215 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7216 } 7217 7218 static void 7219 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7220 { 7221 struct nvme_bdev_io *bio = ref; 7222 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7223 7224 bio->cpl = *cpl; 7225 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7226 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7227 } 7228 7229 static void 7230 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7231 { 7232 struct nvme_bdev_io *bio = ref; 7233 struct iovec *iov; 7234 7235 bio->iov_offset = sgl_offset; 7236 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7237 iov = &bio->iovs[bio->iovpos]; 7238 if (bio->iov_offset < iov->iov_len) { 7239 break; 7240 } 7241 7242 bio->iov_offset -= iov->iov_len; 7243 } 7244 } 7245 7246 static int 7247 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7248 { 7249 struct nvme_bdev_io *bio = ref; 7250 struct iovec *iov; 7251 7252 assert(bio->iovpos < bio->iovcnt); 7253 7254 iov = &bio->iovs[bio->iovpos]; 7255 7256 *address = iov->iov_base; 7257 *length = iov->iov_len; 7258 7259 if (bio->iov_offset) { 7260 assert(bio->iov_offset <= iov->iov_len); 7261 *address += bio->iov_offset; 7262 *length -= bio->iov_offset; 7263 } 7264 7265 bio->iov_offset += *length; 7266 if (bio->iov_offset == iov->iov_len) { 7267 bio->iovpos++; 7268 bio->iov_offset = 0; 7269 } 7270 7271 return 0; 7272 } 7273 7274 static void 7275 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7276 { 7277 struct nvme_bdev_io *bio = ref; 7278 struct iovec *iov; 7279 7280 bio->fused_iov_offset = sgl_offset; 
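/* Walk the fused (write) iovec array until the requested SGL offset falls
 * inside the current iovec.
 */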
7281 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7282 iov = &bio->fused_iovs[bio->fused_iovpos]; 7283 if (bio->fused_iov_offset < iov->iov_len) { 7284 break; 7285 } 7286 7287 bio->fused_iov_offset -= iov->iov_len; 7288 } 7289 } 7290 7291 static int 7292 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7293 { 7294 struct nvme_bdev_io *bio = ref; 7295 struct iovec *iov; 7296 7297 assert(bio->fused_iovpos < bio->fused_iovcnt); 7298 7299 iov = &bio->fused_iovs[bio->fused_iovpos]; 7300 7301 *address = iov->iov_base; 7302 *length = iov->iov_len; 7303 7304 if (bio->fused_iov_offset) { 7305 assert(bio->fused_iov_offset <= iov->iov_len); 7306 *address += bio->fused_iov_offset; 7307 *length -= bio->fused_iov_offset; 7308 } 7309 7310 bio->fused_iov_offset += *length; 7311 if (bio->fused_iov_offset == iov->iov_len) { 7312 bio->fused_iovpos++; 7313 bio->fused_iov_offset = 0; 7314 } 7315 7316 return 0; 7317 } 7318 7319 static int 7320 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7321 void *md, uint64_t lba_count, uint64_t lba) 7322 { 7323 int rc; 7324 7325 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7326 lba_count, lba); 7327 7328 bio->iovs = iov; 7329 bio->iovcnt = iovcnt; 7330 bio->iovpos = 0; 7331 bio->iov_offset = 0; 7332 7333 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7334 bio->io_path->qpair->qpair, 7335 lba, lba_count, 7336 bdev_nvme_no_pi_readv_done, bio, 0, 7337 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7338 md, 0, 0); 7339 7340 if (rc != 0 && rc != -ENOMEM) { 7341 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7342 } 7343 return rc; 7344 } 7345 7346 static int 7347 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7348 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7349 struct spdk_memory_domain *domain, void *domain_ctx, 7350 struct spdk_accel_sequence *seq) 7351 { 7352 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7353 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7354 int rc; 7355 7356 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7357 lba_count, lba); 7358 7359 bio->iovs = iov; 7360 bio->iovcnt = iovcnt; 7361 bio->iovpos = 0; 7362 bio->iov_offset = 0; 7363 7364 if (domain != NULL || seq != NULL) { 7365 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7366 bio->ext_opts.memory_domain = domain; 7367 bio->ext_opts.memory_domain_ctx = domain_ctx; 7368 bio->ext_opts.io_flags = flags; 7369 bio->ext_opts.metadata = md; 7370 bio->ext_opts.accel_sequence = seq; 7371 7372 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7373 bdev_nvme_readv_done, bio, 7374 bdev_nvme_queued_reset_sgl, 7375 bdev_nvme_queued_next_sge, 7376 &bio->ext_opts); 7377 } else if (iovcnt == 1) { 7378 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7379 md, lba, lba_count, bdev_nvme_readv_done, 7380 bio, flags, 0, 0); 7381 } else { 7382 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7383 bdev_nvme_readv_done, bio, flags, 7384 bdev_nvme_queued_reset_sgl, 7385 bdev_nvme_queued_next_sge, md, 0, 0); 7386 } 7387 7388 if (rc != 0 && rc != -ENOMEM) { 7389 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7390 } 7391 return rc; 7392 } 7393 7394 static int 7395 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7396 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7397 
struct spdk_memory_domain *domain, void *domain_ctx, 7398 struct spdk_accel_sequence *seq) 7399 { 7400 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7401 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7402 int rc; 7403 7404 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7405 lba_count, lba); 7406 7407 bio->iovs = iov; 7408 bio->iovcnt = iovcnt; 7409 bio->iovpos = 0; 7410 bio->iov_offset = 0; 7411 7412 if (domain != NULL || seq != NULL) { 7413 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7414 bio->ext_opts.memory_domain = domain; 7415 bio->ext_opts.memory_domain_ctx = domain_ctx; 7416 bio->ext_opts.io_flags = flags; 7417 bio->ext_opts.metadata = md; 7418 bio->ext_opts.accel_sequence = seq; 7419 7420 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7421 bdev_nvme_writev_done, bio, 7422 bdev_nvme_queued_reset_sgl, 7423 bdev_nvme_queued_next_sge, 7424 &bio->ext_opts); 7425 } else if (iovcnt == 1) { 7426 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7427 md, lba, lba_count, bdev_nvme_writev_done, 7428 bio, flags, 0, 0); 7429 } else { 7430 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7431 bdev_nvme_writev_done, bio, flags, 7432 bdev_nvme_queued_reset_sgl, 7433 bdev_nvme_queued_next_sge, md, 0, 0); 7434 } 7435 7436 if (rc != 0 && rc != -ENOMEM) { 7437 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7438 } 7439 return rc; 7440 } 7441 7442 static int 7443 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7444 void *md, uint64_t lba_count, uint64_t zslba, 7445 uint32_t flags) 7446 { 7447 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7448 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7449 int rc; 7450 7451 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7452 lba_count, zslba); 7453 7454 bio->iovs = iov; 7455 bio->iovcnt = iovcnt; 7456 bio->iovpos = 0; 7457 bio->iov_offset = 0; 7458 7459 if (iovcnt == 1) { 7460 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7461 lba_count, 7462 bdev_nvme_zone_appendv_done, bio, 7463 flags, 7464 0, 0); 7465 } else { 7466 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7467 bdev_nvme_zone_appendv_done, bio, flags, 7468 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7469 md, 0, 0); 7470 } 7471 7472 if (rc != 0 && rc != -ENOMEM) { 7473 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7474 } 7475 return rc; 7476 } 7477 7478 static int 7479 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7480 void *md, uint64_t lba_count, uint64_t lba, 7481 uint32_t flags) 7482 { 7483 int rc; 7484 7485 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7486 lba_count, lba); 7487 7488 bio->iovs = iov; 7489 bio->iovcnt = iovcnt; 7490 bio->iovpos = 0; 7491 bio->iov_offset = 0; 7492 7493 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7494 bio->io_path->qpair->qpair, 7495 lba, lba_count, 7496 bdev_nvme_comparev_done, bio, flags, 7497 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7498 md, 0, 0); 7499 7500 if (rc != 0 && rc != -ENOMEM) { 7501 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7502 } 7503 return rc; 7504 } 7505 7506 static int 7507 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7508 struct iovec *write_iov, int write_iovcnt, 7509 void *md, uint64_t lba_count, uint64_t lba, uint32_t 
flags) 7510 { 7511 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7512 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7513 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7514 int rc; 7515 7516 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7517 lba_count, lba); 7518 7519 bio->iovs = cmp_iov; 7520 bio->iovcnt = cmp_iovcnt; 7521 bio->iovpos = 0; 7522 bio->iov_offset = 0; 7523 bio->fused_iovs = write_iov; 7524 bio->fused_iovcnt = write_iovcnt; 7525 bio->fused_iovpos = 0; 7526 bio->fused_iov_offset = 0; 7527 7528 if (bdev_io->num_retries == 0) { 7529 bio->first_fused_submitted = false; 7530 bio->first_fused_completed = false; 7531 } 7532 7533 if (!bio->first_fused_submitted) { 7534 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7535 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7536 7537 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7538 bdev_nvme_comparev_and_writev_done, bio, flags, 7539 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7540 if (rc == 0) { 7541 bio->first_fused_submitted = true; 7542 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7543 } else { 7544 if (rc != -ENOMEM) { 7545 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7546 } 7547 return rc; 7548 } 7549 } 7550 7551 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7552 7553 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7554 bdev_nvme_comparev_and_writev_done, bio, flags, 7555 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7556 if (rc != 0 && rc != -ENOMEM) { 7557 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7558 rc = 0; 7559 } 7560 7561 return rc; 7562 } 7563 7564 static int 7565 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7566 { 7567 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7568 struct spdk_nvme_dsm_range *range; 7569 uint64_t offset, remaining; 7570 uint64_t num_ranges_u64; 7571 uint16_t num_ranges; 7572 int rc; 7573 7574 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7575 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7576 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7577 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7578 return -EINVAL; 7579 } 7580 num_ranges = (uint16_t)num_ranges_u64; 7581 7582 offset = offset_blocks; 7583 remaining = num_blocks; 7584 range = &dsm_ranges[0]; 7585 7586 /* Fill max-size ranges until the remaining blocks fit into one range */ 7587 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7588 range->attributes.raw = 0; 7589 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7590 range->starting_lba = offset; 7591 7592 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7593 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7594 range++; 7595 } 7596 7597 /* Final range describes the remaining blocks */ 7598 range->attributes.raw = 0; 7599 range->length = remaining; 7600 range->starting_lba = offset; 7601 7602 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7603 bio->io_path->qpair->qpair, 7604 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7605 dsm_ranges, num_ranges, 7606 bdev_nvme_queued_done, bio); 7607 7608 return rc; 7609 } 7610 7611 static int 7612 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7613 { 7614 if (num_blocks > UINT16_MAX + 1) { 7615 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 
7616 return -EINVAL; 7617 } 7618 7619 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7620 bio->io_path->qpair->qpair, 7621 offset_blocks, num_blocks, 7622 bdev_nvme_queued_done, bio, 7623 0); 7624 } 7625 7626 static int 7627 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7628 struct spdk_bdev_zone_info *info) 7629 { 7630 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7631 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7632 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7633 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7634 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7635 7636 if (zone_id % zone_size != 0) { 7637 return -EINVAL; 7638 } 7639 7640 if (num_zones > total_zones || !num_zones) { 7641 return -EINVAL; 7642 } 7643 7644 assert(!bio->zone_report_buf); 7645 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7646 if (!bio->zone_report_buf) { 7647 return -ENOMEM; 7648 } 7649 7650 bio->handled_zones = 0; 7651 7652 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7653 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7654 bdev_nvme_get_zone_info_done, bio); 7655 } 7656 7657 static int 7658 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 7659 enum spdk_bdev_zone_action action) 7660 { 7661 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7662 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7663 7664 switch (action) { 7665 case SPDK_BDEV_ZONE_CLOSE: 7666 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 7667 bdev_nvme_zone_management_done, bio); 7668 case SPDK_BDEV_ZONE_FINISH: 7669 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 7670 bdev_nvme_zone_management_done, bio); 7671 case SPDK_BDEV_ZONE_OPEN: 7672 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 7673 bdev_nvme_zone_management_done, bio); 7674 case SPDK_BDEV_ZONE_RESET: 7675 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 7676 bdev_nvme_zone_management_done, bio); 7677 case SPDK_BDEV_ZONE_OFFLINE: 7678 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 7679 bdev_nvme_zone_management_done, bio); 7680 default: 7681 return -EINVAL; 7682 } 7683 } 7684 7685 static void 7686 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7687 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 7688 { 7689 struct nvme_io_path *io_path; 7690 struct nvme_ctrlr *nvme_ctrlr; 7691 uint32_t max_xfer_size; 7692 int rc = -ENXIO; 7693 7694 /* Choose the first ctrlr which is not failed. */ 7695 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7696 nvme_ctrlr = io_path->qpair->ctrlr; 7697 7698 /* We should skip any unavailable nvme_ctrlr rather than checking 7699 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first ctrlr which is not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
						  (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}
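
/* Abort first tries to cancel the target I/O while it is still queued for retry in
 * the bdev layer. Only if that fails is an NVMe Abort command issued: on the qpair
 * of the I/O path the target was submitted on, or, for admin passthrough commands
 * that have no I/O path, on each controller in the channel until one recognizes it.
 */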
static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}

static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1	/* NLB is a 0-based block count */
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
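
/* Emit a bdev_nvme_start_discovery call that re-creates this discovery service on
 * config load. The subnqn is cleared from the copied trid before dumping so that
 * only the transport address portion is emitted.
 */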
static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);
	if (nvme_ctrlr->opts.psk_path[0] != '\0') {
		spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk_path);
	}

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
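
/* Top-level config_json hook for the module: dump the global options first, then one
 * bdev_nvme_attach_controller entry per statically attached controller, then any
 * discovery and mdns discovery services, and finally the hotplug settings.
 */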
static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump this as the last parameter to give all NVMe bdevs a chance to be
	 * constructed before enabling the hotplug poller.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
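
/* Register the bdev_nvme trace points and relate them to the lower-level NVMe
 * transport submit/complete trace points, so a single I/O can be followed across
 * both layers in a trace capture.
 */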
SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}