/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};
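/* Note: struct nvme_bdev_io is the per-I/O driver context that the bdev layer
 * reserves for this module (bdev_nvme_get_ctx_size() below returns its size).
 * Code in this file recovers it from a generic bdev_io with, for example:
 *
 *   struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
 *
 * The iovs/iovpos/iov_offset triplet tracks SGL progress for requests built
 * from an iovec array, the fused_* fields track the second half of a fused
 * compare-and-write pair, and retry_ticks/retry_count/submit_tsc support the
 * per-channel retry queue and the I/O path statistics implemented further below.
 */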
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id		trid;
	TAILQ_ENTRY(nvme_probe_skip_entry)	tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX			10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT		100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
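/* The defaults in g_opts above apply to every controller attached by this
 * module (timeouts, retry counts, reconnect/failover behavior, statistics
 * toggles, etc.). They are typically adjusted at runtime through the
 * bdev_nvme_set_options RPC before controllers are attached, and the current
 * values are written back out by bdev_nvme_config_json() when the running
 * configuration is saved.
 */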
static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 278 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 279 if (cdata->cntlid == cntlid) { 280 break; 281 } 282 } 283 284 return nvme_ctrlr; 285 } 286 287 static struct nvme_bdev * 288 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 289 { 290 struct nvme_bdev *bdev; 291 292 pthread_mutex_lock(&g_bdev_nvme_mutex); 293 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 294 if (bdev->nsid == nsid) { 295 break; 296 } 297 } 298 pthread_mutex_unlock(&g_bdev_nvme_mutex); 299 300 return bdev; 301 } 302 303 struct nvme_ns * 304 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 305 { 306 struct nvme_ns ns; 307 308 assert(nsid > 0); 309 310 ns.id = nsid; 311 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 312 } 313 314 struct nvme_ns * 315 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 316 { 317 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 318 } 319 320 struct nvme_ns * 321 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 322 { 323 if (ns == NULL) { 324 return NULL; 325 } 326 327 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 328 } 329 330 static struct nvme_ctrlr * 331 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 332 { 333 struct nvme_bdev_ctrlr *nbdev_ctrlr; 334 struct nvme_ctrlr *nvme_ctrlr = NULL; 335 336 pthread_mutex_lock(&g_bdev_nvme_mutex); 337 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 338 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 339 if (nvme_ctrlr != NULL) { 340 break; 341 } 342 } 343 pthread_mutex_unlock(&g_bdev_nvme_mutex); 344 345 return nvme_ctrlr; 346 } 347 348 struct nvme_ctrlr * 349 nvme_ctrlr_get_by_name(const char *name) 350 { 351 struct nvme_bdev_ctrlr *nbdev_ctrlr; 352 struct nvme_ctrlr *nvme_ctrlr = NULL; 353 354 if (name == NULL) { 355 return NULL; 356 } 357 358 pthread_mutex_lock(&g_bdev_nvme_mutex); 359 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 360 if (nbdev_ctrlr != NULL) { 361 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 362 } 363 pthread_mutex_unlock(&g_bdev_nvme_mutex); 364 365 return nvme_ctrlr; 366 } 367 368 void 369 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 370 { 371 struct nvme_bdev_ctrlr *nbdev_ctrlr; 372 373 pthread_mutex_lock(&g_bdev_nvme_mutex); 374 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 375 fn(nbdev_ctrlr, ctx); 376 } 377 pthread_mutex_unlock(&g_bdev_nvme_mutex); 378 } 379 380 void 381 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 382 { 383 const char *trtype_str; 384 const char *adrfam_str; 385 386 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 387 if (trtype_str) { 388 spdk_json_write_named_string(w, "trtype", trtype_str); 389 } 390 391 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 392 if (adrfam_str) { 393 spdk_json_write_named_string(w, "adrfam", adrfam_str); 394 } 395 396 if (trid->traddr[0] != '\0') { 397 spdk_json_write_named_string(w, "traddr", trid->traddr); 398 } 399 400 if (trid->trsvcid[0] != '\0') { 401 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 402 } 403 404 if (trid->subnqn[0] != '\0') { 405 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 406 } 407 } 408 409 static void 410 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 411 struct nvme_ctrlr *nvme_ctrlr) 412 { 413 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 414 
pthread_mutex_lock(&g_bdev_nvme_mutex); 415 416 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 417 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 418 pthread_mutex_unlock(&g_bdev_nvme_mutex); 419 420 return; 421 } 422 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 423 424 pthread_mutex_unlock(&g_bdev_nvme_mutex); 425 426 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 427 428 free(nbdev_ctrlr->name); 429 free(nbdev_ctrlr); 430 } 431 432 static void 433 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 434 { 435 struct nvme_path_id *path_id, *tmp_path; 436 struct nvme_ns *ns, *tmp_ns; 437 438 free(nvme_ctrlr->copied_ana_desc); 439 spdk_free(nvme_ctrlr->ana_log_page); 440 441 if (nvme_ctrlr->opal_dev) { 442 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 443 nvme_ctrlr->opal_dev = NULL; 444 } 445 446 if (nvme_ctrlr->nbdev_ctrlr) { 447 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 448 } 449 450 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 451 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 452 nvme_ns_free(ns); 453 } 454 455 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 456 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 457 free(path_id); 458 } 459 460 pthread_mutex_destroy(&nvme_ctrlr->mutex); 461 462 free(nvme_ctrlr); 463 464 pthread_mutex_lock(&g_bdev_nvme_mutex); 465 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 466 pthread_mutex_unlock(&g_bdev_nvme_mutex); 467 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 468 spdk_bdev_module_fini_done(); 469 return; 470 } 471 pthread_mutex_unlock(&g_bdev_nvme_mutex); 472 } 473 474 static int 475 nvme_detach_poller(void *arg) 476 { 477 struct nvme_ctrlr *nvme_ctrlr = arg; 478 int rc; 479 480 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 481 if (rc != -EAGAIN) { 482 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 483 _nvme_ctrlr_delete(nvme_ctrlr); 484 } 485 486 return SPDK_POLLER_BUSY; 487 } 488 489 static void 490 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 491 { 492 int rc; 493 494 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 495 496 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 497 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 498 499 /* If we got here, the reset/detach poller cannot be active */ 500 assert(nvme_ctrlr->reset_detach_poller == NULL); 501 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 502 nvme_ctrlr, 1000); 503 if (nvme_ctrlr->reset_detach_poller == NULL) { 504 SPDK_ERRLOG("Failed to register detach poller\n"); 505 goto error; 506 } 507 508 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 509 if (rc != 0) { 510 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 511 goto error; 512 } 513 514 return; 515 error: 516 /* We don't have a good way to handle errors here, so just do what we can and delete the 517 * controller without detaching the underlying NVMe device. 
518 */ 519 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 520 _nvme_ctrlr_delete(nvme_ctrlr); 521 } 522 523 static void 524 nvme_ctrlr_unregister_cb(void *io_device) 525 { 526 struct nvme_ctrlr *nvme_ctrlr = io_device; 527 528 nvme_ctrlr_delete(nvme_ctrlr); 529 } 530 531 static void 532 nvme_ctrlr_unregister(void *ctx) 533 { 534 struct nvme_ctrlr *nvme_ctrlr = ctx; 535 536 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 537 } 538 539 static bool 540 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 541 { 542 if (!nvme_ctrlr->destruct) { 543 return false; 544 } 545 546 if (nvme_ctrlr->ref > 0) { 547 return false; 548 } 549 550 if (nvme_ctrlr->resetting) { 551 return false; 552 } 553 554 if (nvme_ctrlr->ana_log_page_updating) { 555 return false; 556 } 557 558 if (nvme_ctrlr->io_path_cache_clearing) { 559 return false; 560 } 561 562 return true; 563 } 564 565 static void 566 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 567 { 568 pthread_mutex_lock(&nvme_ctrlr->mutex); 569 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 570 571 assert(nvme_ctrlr->ref > 0); 572 nvme_ctrlr->ref--; 573 574 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 575 pthread_mutex_unlock(&nvme_ctrlr->mutex); 576 return; 577 } 578 579 pthread_mutex_unlock(&nvme_ctrlr->mutex); 580 581 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 582 } 583 584 static void 585 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 586 { 587 nbdev_ch->current_io_path = NULL; 588 nbdev_ch->rr_counter = 0; 589 } 590 591 static struct nvme_io_path * 592 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 593 { 594 struct nvme_io_path *io_path; 595 596 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 597 if (io_path->nvme_ns == nvme_ns) { 598 break; 599 } 600 } 601 602 return io_path; 603 } 604 605 static struct nvme_io_path * 606 nvme_io_path_alloc(void) 607 { 608 struct nvme_io_path *io_path; 609 610 io_path = calloc(1, sizeof(*io_path)); 611 if (io_path == NULL) { 612 SPDK_ERRLOG("Failed to alloc io_path.\n"); 613 return NULL; 614 } 615 616 if (g_opts.io_path_stat) { 617 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 618 if (io_path->stat == NULL) { 619 free(io_path); 620 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 621 return NULL; 622 } 623 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 624 } 625 626 return io_path; 627 } 628 629 static void 630 nvme_io_path_free(struct nvme_io_path *io_path) 631 { 632 free(io_path->stat); 633 free(io_path); 634 } 635 636 static int 637 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 638 { 639 struct nvme_io_path *io_path; 640 struct spdk_io_channel *ch; 641 struct nvme_ctrlr_channel *ctrlr_ch; 642 struct nvme_qpair *nvme_qpair; 643 644 io_path = nvme_io_path_alloc(); 645 if (io_path == NULL) { 646 return -ENOMEM; 647 } 648 649 io_path->nvme_ns = nvme_ns; 650 651 ch = spdk_get_io_channel(nvme_ns->ctrlr); 652 if (ch == NULL) { 653 nvme_io_path_free(io_path); 654 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 655 return -ENOMEM; 656 } 657 658 ctrlr_ch = spdk_io_channel_get_ctx(ch); 659 660 nvme_qpair = ctrlr_ch->qpair; 661 assert(nvme_qpair != NULL); 662 663 io_path->qpair = nvme_qpair; 664 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 665 666 io_path->nbdev_ch = nbdev_ch; 667 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 668 669 
bdev_nvme_clear_current_io_path(nbdev_ch); 670 671 return 0; 672 } 673 674 static void 675 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 676 struct nvme_io_path *io_path) 677 { 678 struct spdk_bdev_io *bdev_io; 679 struct nvme_bdev_io *bio; 680 681 TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) { 682 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 683 if (bio->io_path == io_path) { 684 bio->io_path = NULL; 685 } 686 } 687 } 688 689 static void 690 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 691 { 692 struct spdk_io_channel *ch; 693 struct nvme_qpair *nvme_qpair; 694 struct nvme_ctrlr_channel *ctrlr_ch; 695 struct nvme_bdev *nbdev; 696 697 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 698 699 /* Add the statistics to nvme_ns before this path is destroyed. */ 700 pthread_mutex_lock(&nbdev->mutex); 701 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 702 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 703 } 704 pthread_mutex_unlock(&nbdev->mutex); 705 706 bdev_nvme_clear_current_io_path(nbdev_ch); 707 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 708 709 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 710 io_path->nbdev_ch = NULL; 711 712 nvme_qpair = io_path->qpair; 713 assert(nvme_qpair != NULL); 714 715 ctrlr_ch = nvme_qpair->ctrlr_ch; 716 assert(ctrlr_ch != NULL); 717 718 ch = spdk_io_channel_from_ctx(ctrlr_ch); 719 spdk_put_io_channel(ch); 720 721 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 722 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 723 * io_path here but free the io_path when the associated qpair is freed. It is ensured 724 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 725 */ 726 } 727 728 static void 729 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 730 { 731 struct nvme_io_path *io_path, *tmp_io_path; 732 733 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 734 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 735 } 736 } 737 738 static int 739 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 740 { 741 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 742 struct nvme_bdev *nbdev = io_device; 743 struct nvme_ns *nvme_ns; 744 int rc; 745 746 STAILQ_INIT(&nbdev_ch->io_path_list); 747 TAILQ_INIT(&nbdev_ch->retry_io_list); 748 749 pthread_mutex_lock(&nbdev->mutex); 750 751 nbdev_ch->mp_policy = nbdev->mp_policy; 752 nbdev_ch->mp_selector = nbdev->mp_selector; 753 nbdev_ch->rr_min_io = nbdev->rr_min_io; 754 755 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 756 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 757 if (rc != 0) { 758 pthread_mutex_unlock(&nbdev->mutex); 759 760 _bdev_nvme_delete_io_paths(nbdev_ch); 761 return rc; 762 } 763 } 764 pthread_mutex_unlock(&nbdev->mutex); 765 766 return 0; 767 } 768 769 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 770 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

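/* I/O path selection overview: an io_path is usable only when its qpair is
 * connected (nvme_qpair_is_connected()) and its namespace reports an ANA state
 * of optimized or non-optimized (nvme_ns_is_accessible()). The helpers below
 * implement the two selector flavors: the circular round-robin walk in
 * _bdev_nvme_find_io_path() and the queue-depth based pick in
 * _bdev_nvme_find_io_path_min_qd(). bdev_nvme_find_io_path() chooses between
 * them based on the channel's mp_policy/mp_selector and may cache the result
 * in nbdev_ch->current_io_path to avoid re-walking the list on every I/O.
 */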
/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

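/* Queue-depth selector: walk every io_path once and remember the connected,
 * ANA-accessible path with the fewest outstanding requests, preferring an
 * optimized path over a non-optimized one. The result is intentionally not
 * cached in nbdev_ch->current_io_path; the outstanding request count changes
 * with every submission and completion, so a cached choice would go stale
 * immediately.
 */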
static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

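/* Typical submission-side usage of the selector above (a sketch, mirroring the
 * pattern used by the submit paths later in this file):
 *
 *   io_path = bdev_nvme_find_io_path(nbdev_ch);
 *   if (spdk_unlikely(io_path == NULL)) {
 *           // No usable path right now: queue the I/O for retry or fail it,
 *           // depending on any_io_path_may_become_available().
 *   }
 *
 * The helper below supplies that second decision: whether a channel that has
 * no usable path at the moment is still worth waiting on.
 */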
/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

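/* Retry queue bookkeeping: retry_io_list is kept sorted by ascending
 * bio->retry_ticks and retry_io_poller is (re)armed only for the deadline of
 * the current head. bdev_nvme_queue_retry_io() below scans from the tail to
 * find the insertion slot, and bdev_nvme_retry_ios() above drains every entry
 * whose deadline has passed before re-arming the poller. The delay itself is
 * chosen by the completion path: transient path errors retry immediately,
 * while a non-zero CRD in the completion maps to the controller's CRDT entry
 * (in 100 ms units) via bdev_nvme_check_retry_io().
 */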
static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io_to_abort;

	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			__bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
stat->max_unmap_latency_ticks = tsc_diff; 1256 } 1257 if (stat->min_unmap_latency_ticks > tsc_diff) { 1258 stat->min_unmap_latency_ticks = tsc_diff; 1259 } 1260 break; 1261 case SPDK_BDEV_IO_TYPE_ZCOPY: 1262 /* Track the data in the start phase only */ 1263 if (!bdev_io->u.bdev.zcopy.start) { 1264 break; 1265 } 1266 if (bdev_io->u.bdev.zcopy.populate) { 1267 stat->bytes_read += num_blocks * blocklen; 1268 stat->num_read_ops++; 1269 stat->read_latency_ticks += tsc_diff; 1270 if (stat->max_read_latency_ticks < tsc_diff) { 1271 stat->max_read_latency_ticks = tsc_diff; 1272 } 1273 if (stat->min_read_latency_ticks > tsc_diff) { 1274 stat->min_read_latency_ticks = tsc_diff; 1275 } 1276 } else { 1277 stat->bytes_written += num_blocks * blocklen; 1278 stat->num_write_ops++; 1279 stat->write_latency_ticks += tsc_diff; 1280 if (stat->max_write_latency_ticks < tsc_diff) { 1281 stat->max_write_latency_ticks = tsc_diff; 1282 } 1283 if (stat->min_write_latency_ticks > tsc_diff) { 1284 stat->min_write_latency_ticks = tsc_diff; 1285 } 1286 } 1287 break; 1288 case SPDK_BDEV_IO_TYPE_COPY: 1289 stat->bytes_copied += num_blocks * blocklen; 1290 stat->num_copy_ops++; 1291 stat->copy_latency_ticks += tsc_diff; 1292 if (stat->max_copy_latency_ticks < tsc_diff) { 1293 stat->max_copy_latency_ticks = tsc_diff; 1294 } 1295 if (stat->min_copy_latency_ticks > tsc_diff) { 1296 stat->min_copy_latency_ticks = tsc_diff; 1297 } 1298 break; 1299 default: 1300 break; 1301 } 1302 } 1303 1304 static bool 1305 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1306 const struct spdk_nvme_cpl *cpl, 1307 struct nvme_bdev_channel *nbdev_ch, 1308 uint64_t *_delay_ms) 1309 { 1310 struct nvme_io_path *io_path = bio->io_path; 1311 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1312 const struct spdk_nvme_ctrlr_data *cdata; 1313 1314 if (spdk_nvme_cpl_is_path_error(cpl) || 1315 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1316 !nvme_io_path_is_available(io_path) || 1317 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1318 bdev_nvme_clear_current_io_path(nbdev_ch); 1319 bio->io_path = NULL; 1320 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1321 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1322 io_path->nvme_ns->ana_state_updating = true; 1323 } 1324 } 1325 if (!any_io_path_may_become_available(nbdev_ch)) { 1326 return false; 1327 } 1328 *_delay_ms = 0; 1329 } else { 1330 bio->retry_count++; 1331 1332 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1333 1334 if (cpl->status.crd != 0) { 1335 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1336 } else { 1337 *_delay_ms = 0; 1338 } 1339 } 1340 1341 return true; 1342 } 1343 1344 static inline void 1345 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1346 const struct spdk_nvme_cpl *cpl) 1347 { 1348 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1349 struct nvme_bdev_channel *nbdev_ch; 1350 uint64_t delay_ms; 1351 1352 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1353 1354 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1355 bdev_nvme_update_io_path_stat(bio); 1356 goto complete; 1357 } 1358 1359 /* Update error counts before deciding if retry is needed. 1360 * Hence, error counts may be more than the number of I/O errors. 
1361 */ 1362 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1363 1364 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1365 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1366 goto complete; 1367 } 1368 1369 /* At this point we don't know whether the sequence was successfully executed or not, so we 1370 * cannot retry the IO */ 1371 if (bdev_io->u.bdev.accel_sequence != NULL) { 1372 goto complete; 1373 } 1374 1375 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1376 1377 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1378 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1379 return; 1380 } 1381 1382 complete: 1383 bio->retry_count = 0; 1384 bio->submit_tsc = 0; 1385 bdev_io->u.bdev.accel_sequence = NULL; 1386 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1387 } 1388 1389 static inline void 1390 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1391 { 1392 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1393 struct nvme_bdev_channel *nbdev_ch; 1394 enum spdk_bdev_io_status io_status; 1395 1396 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1397 1398 switch (rc) { 1399 case 0: 1400 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1401 break; 1402 case -ENOMEM: 1403 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1404 break; 1405 case -ENXIO: 1406 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1407 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1408 1409 bdev_nvme_clear_current_io_path(nbdev_ch); 1410 bio->io_path = NULL; 1411 1412 if (any_io_path_may_become_available(nbdev_ch)) { 1413 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1414 return; 1415 } 1416 } 1417 1418 /* fallthrough */ 1419 default: 1420 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1421 bdev_io->u.bdev.accel_sequence = NULL; 1422 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1423 break; 1424 } 1425 1426 bio->retry_count = 0; 1427 bio->submit_tsc = 0; 1428 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1429 } 1430 1431 static inline void 1432 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1433 { 1434 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1435 enum spdk_bdev_io_status io_status; 1436 1437 switch (rc) { 1438 case 0: 1439 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1440 break; 1441 case -ENOMEM: 1442 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1443 break; 1444 case -ENXIO: 1445 /* fallthrough */ 1446 default: 1447 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1448 break; 1449 } 1450 1451 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1452 } 1453 1454 static void 1455 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1456 { 1457 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1458 1459 pthread_mutex_lock(&nvme_ctrlr->mutex); 1460 1461 assert(nvme_ctrlr->io_path_cache_clearing == true); 1462 nvme_ctrlr->io_path_cache_clearing = false; 1463 1464 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1465 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1466 return; 1467 } 1468 1469 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1470 1471 nvme_ctrlr_unregister(nvme_ctrlr); 1472 } 1473 1474 static void 1475 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1476 { 1477 struct nvme_io_path *io_path; 1478 1479 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1480 if (io_path->nbdev_ch == NULL) { 1481 continue; 1482 } 1483 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1484 
} 1485 } 1486 1487 static void 1488 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1489 { 1490 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1491 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1492 1493 assert(ctrlr_ch->qpair != NULL); 1494 1495 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1496 1497 spdk_for_each_channel_continue(i, 0); 1498 } 1499 1500 static void 1501 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1502 { 1503 pthread_mutex_lock(&nvme_ctrlr->mutex); 1504 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1505 nvme_ctrlr->io_path_cache_clearing) { 1506 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1507 return; 1508 } 1509 1510 nvme_ctrlr->io_path_cache_clearing = true; 1511 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1512 1513 spdk_for_each_channel(nvme_ctrlr, 1514 bdev_nvme_clear_io_path_cache, 1515 NULL, 1516 bdev_nvme_clear_io_path_caches_done); 1517 } 1518 1519 static struct nvme_qpair * 1520 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1521 { 1522 struct nvme_qpair *nvme_qpair; 1523 1524 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1525 if (nvme_qpair->qpair == qpair) { 1526 break; 1527 } 1528 } 1529 1530 return nvme_qpair; 1531 } 1532 1533 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1534 1535 static void 1536 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1537 { 1538 struct nvme_poll_group *group = poll_group_ctx; 1539 struct nvme_qpair *nvme_qpair; 1540 struct nvme_ctrlr_channel *ctrlr_ch; 1541 int status; 1542 1543 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1544 if (nvme_qpair == NULL) { 1545 return; 1546 } 1547 1548 if (nvme_qpair->qpair != NULL) { 1549 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1550 nvme_qpair->qpair = NULL; 1551 } 1552 1553 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1554 1555 ctrlr_ch = nvme_qpair->ctrlr_ch; 1556 1557 if (ctrlr_ch != NULL) { 1558 if (ctrlr_ch->reset_iter != NULL) { 1559 /* We are in a full reset sequence. */ 1560 if (ctrlr_ch->connect_poller != NULL) { 1561 /* qpair was failed to connect. Abort the reset sequence. */ 1562 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1563 qpair); 1564 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1565 status = -1; 1566 } else { 1567 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1568 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1569 qpair); 1570 status = 0; 1571 } 1572 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1573 ctrlr_ch->reset_iter = NULL; 1574 } else { 1575 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1576 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1577 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1578 } 1579 } else { 1580 /* In this case, ctrlr_channel is already deleted. */ 1581 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1582 nvme_qpair_delete(nvme_qpair); 1583 } 1584 } 1585 1586 static void 1587 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1588 { 1589 struct nvme_qpair *nvme_qpair; 1590 1591 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1592 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1593 continue; 1594 } 1595 1596 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1597 SPDK_NVME_QPAIR_FAILURE_NONE) { 1598 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1599 } 1600 } 1601 } 1602 1603 static int 1604 bdev_nvme_poll(void *arg) 1605 { 1606 struct nvme_poll_group *group = arg; 1607 int64_t num_completions; 1608 1609 if (group->collect_spin_stat && group->start_ticks == 0) { 1610 group->start_ticks = spdk_get_ticks(); 1611 } 1612 1613 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1614 bdev_nvme_disconnected_qpair_cb); 1615 if (group->collect_spin_stat) { 1616 if (num_completions > 0) { 1617 if (group->end_ticks != 0) { 1618 group->spin_ticks += (group->end_ticks - group->start_ticks); 1619 group->end_ticks = 0; 1620 } 1621 group->start_ticks = 0; 1622 } else { 1623 group->end_ticks = spdk_get_ticks(); 1624 } 1625 } 1626 1627 if (spdk_unlikely(num_completions < 0)) { 1628 bdev_nvme_check_io_qpairs(group); 1629 } 1630 1631 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1632 } 1633 1634 static int bdev_nvme_poll_adminq(void *arg); 1635 1636 static void 1637 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1638 { 1639 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1640 1641 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1642 nvme_ctrlr, new_period_us); 1643 } 1644 1645 static int 1646 bdev_nvme_poll_adminq(void *arg) 1647 { 1648 int32_t rc; 1649 struct nvme_ctrlr *nvme_ctrlr = arg; 1650 nvme_ctrlr_disconnected_cb disconnected_cb; 1651 1652 assert(nvme_ctrlr != NULL); 1653 1654 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1655 if (rc < 0) { 1656 disconnected_cb = nvme_ctrlr->disconnected_cb; 1657 nvme_ctrlr->disconnected_cb = NULL; 1658 1659 if (disconnected_cb != NULL) { 1660 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1661 g_opts.nvme_adminq_poll_period_us); 1662 disconnected_cb(nvme_ctrlr); 1663 } else { 1664 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1665 } 1666 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1667 SPDK_NVME_QPAIR_FAILURE_NONE) { 1668 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1669 } 1670 1671 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1672 } 1673 1674 static void 1675 nvme_bdev_free(void *io_device) 1676 { 1677 struct nvme_bdev *nvme_disk = io_device; 1678 1679 pthread_mutex_destroy(&nvme_disk->mutex); 1680 free(nvme_disk->disk.name); 1681 free(nvme_disk->err_stat); 1682 free(nvme_disk); 1683 } 1684 1685 static int 1686 bdev_nvme_destruct(void *ctx) 1687 { 1688 struct nvme_bdev *nvme_disk = ctx; 1689 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1690 1691 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1692 1693 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1694 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1695 1696 nvme_ns->bdev = NULL; 1697 1698 assert(nvme_ns->id > 0); 1699 1700 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1701 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1702 1703 nvme_ctrlr_release(nvme_ns->ctrlr); 1704 nvme_ns_free(nvme_ns); 1705 } else { 1706 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1707 } 1708 } 1709 1710 pthread_mutex_lock(&g_bdev_nvme_mutex); 1711 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1712 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1713 1714 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1715 1716 return 0; 1717 } 1718 1719 static int 1720 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1721 { 1722 struct nvme_ctrlr *nvme_ctrlr; 1723 struct spdk_nvme_io_qpair_opts opts; 1724 struct spdk_nvme_qpair *qpair; 1725 int rc; 1726 1727 nvme_ctrlr = nvme_qpair->ctrlr; 1728 1729 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1730 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1731 opts.create_only = true; 1732 opts.async_mode = true; 1733 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1734 g_opts.io_queue_requests = opts.io_queue_requests; 1735 1736 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1737 if (qpair == NULL) { 1738 return -1; 1739 } 1740 1741 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1742 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1743 1744 assert(nvme_qpair->group != NULL); 1745 1746 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1747 if (rc != 0) { 1748 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1749 goto err; 1750 } 1751 1752 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1753 if (rc != 0) { 1754 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1755 goto err; 1756 } 1757 1758 nvme_qpair->qpair = qpair; 1759 1760 if (!g_opts.disable_auto_failback) { 1761 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1762 } 1763 1764 return 0; 1765 1766 err: 1767 spdk_nvme_ctrlr_free_io_qpair(qpair); 1768 1769 return rc; 1770 } 1771 1772 static void 1773 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1774 { 1775 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1776 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1777 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1778 struct spdk_bdev_io *bdev_io; 1779 1780 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1781 status = SPDK_BDEV_IO_STATUS_FAILED; 1782 } 1783 1784 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1785 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1786 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1787 __bdev_nvme_io_complete(bdev_io, status, NULL); 1788 } 1789 1790 
spdk_for_each_channel_continue(i, 0); 1791 } 1792 1793 /* This function marks the current trid as failed by storing the current ticks 1794 * and then sets the next trid to the active trid within a controller if exists. 1795 * 1796 * The purpose of the boolean return value is to request the caller to disconnect 1797 * the current trid now to try connecting the next trid. 1798 */ 1799 static bool 1800 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1801 { 1802 struct nvme_path_id *path_id, *next_path; 1803 int rc __attribute__((unused)); 1804 1805 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1806 assert(path_id); 1807 assert(path_id == nvme_ctrlr->active_path_id); 1808 next_path = TAILQ_NEXT(path_id, link); 1809 1810 /* Update the last failed time. It means the trid is failed if its last 1811 * failed time is non-zero. 1812 */ 1813 path_id->last_failed_tsc = spdk_get_ticks(); 1814 1815 if (next_path == NULL) { 1816 /* There is no alternate trid within a controller. */ 1817 return false; 1818 } 1819 1820 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1821 /* Connect is not retried in a controller reset sequence. Connecting 1822 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1823 */ 1824 return false; 1825 } 1826 1827 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1828 1829 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1830 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1831 1832 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1833 nvme_ctrlr->active_path_id = next_path; 1834 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1835 assert(rc == 0); 1836 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1837 if (!remove) { 1838 /** Shuffle the old trid to the end of the list and use the new one. 1839 * Allows for round robin through multiple connections. 1840 */ 1841 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1842 } else { 1843 free(path_id); 1844 } 1845 1846 if (start || next_path->last_failed_tsc == 0) { 1847 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1848 * or used yet. Try the next trid now. 1849 */ 1850 return true; 1851 } 1852 1853 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1854 nvme_ctrlr->opts.reconnect_delay_sec) { 1855 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1856 return true; 1857 } 1858 1859 /* The next trid will be tried after reconnect_delay_sec seconds. 
*/ 1860 return false; 1861 } 1862 1863 static bool 1864 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1865 { 1866 int32_t elapsed; 1867 1868 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1869 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1870 return false; 1871 } 1872 1873 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1874 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1875 return true; 1876 } else { 1877 return false; 1878 } 1879 } 1880 1881 static bool 1882 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1883 { 1884 uint32_t elapsed; 1885 1886 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1887 return false; 1888 } 1889 1890 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1891 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1892 return true; 1893 } else { 1894 return false; 1895 } 1896 } 1897 1898 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1899 1900 static void 1901 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1902 { 1903 int rc; 1904 1905 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1906 if (rc != 0) { 1907 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1908 * fail the reset sequence immediately. 1909 */ 1910 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1911 return; 1912 } 1913 1914 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1915 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1916 */ 1917 assert(nvme_ctrlr->disconnected_cb == NULL); 1918 nvme_ctrlr->disconnected_cb = cb_fn; 1919 1920 /* During disconnection, reduce the period to poll adminq more often. */ 1921 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1922 } 1923 1924 enum bdev_nvme_op_after_reset { 1925 OP_NONE, 1926 OP_COMPLETE_PENDING_DESTRUCT, 1927 OP_DESTRUCT, 1928 OP_DELAYED_RECONNECT, 1929 OP_FAILOVER, 1930 }; 1931 1932 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1933 1934 static _bdev_nvme_op_after_reset 1935 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1936 { 1937 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1938 /* Complete pending destruct after reset completes. 
*/ 1939 return OP_COMPLETE_PENDING_DESTRUCT; 1940 } else if (nvme_ctrlr->pending_failover) { 1941 nvme_ctrlr->pending_failover = false; 1942 nvme_ctrlr->reset_start_tsc = 0; 1943 return OP_FAILOVER; 1944 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1945 nvme_ctrlr->reset_start_tsc = 0; 1946 return OP_NONE; 1947 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1948 return OP_DESTRUCT; 1949 } else { 1950 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1951 nvme_ctrlr->fast_io_fail_timedout = true; 1952 } 1953 return OP_DELAYED_RECONNECT; 1954 } 1955 } 1956 1957 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1958 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1959 1960 static int 1961 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1962 { 1963 struct nvme_ctrlr *nvme_ctrlr = ctx; 1964 1965 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1966 pthread_mutex_lock(&nvme_ctrlr->mutex); 1967 1968 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1969 1970 if (!nvme_ctrlr->reconnect_is_delayed) { 1971 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1972 return SPDK_POLLER_BUSY; 1973 } 1974 1975 nvme_ctrlr->reconnect_is_delayed = false; 1976 1977 if (nvme_ctrlr->destruct) { 1978 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1979 return SPDK_POLLER_BUSY; 1980 } 1981 1982 assert(nvme_ctrlr->resetting == false); 1983 nvme_ctrlr->resetting = true; 1984 1985 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1986 1987 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1988 1989 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1990 return SPDK_POLLER_BUSY; 1991 } 1992 1993 static void 1994 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1995 { 1996 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1997 1998 assert(nvme_ctrlr->reconnect_is_delayed == false); 1999 nvme_ctrlr->reconnect_is_delayed = true; 2000 2001 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2002 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2003 nvme_ctrlr, 2004 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2005 } 2006 2007 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2008 2009 static void 2010 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2011 { 2012 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2013 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2014 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2015 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2016 enum bdev_nvme_op_after_reset op_after_reset; 2017 2018 assert(nvme_ctrlr->thread == spdk_get_thread()); 2019 2020 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2021 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2022 2023 if (!success) { 2024 SPDK_ERRLOG("Resetting controller failed.\n"); 2025 } else { 2026 SPDK_NOTICELOG("Resetting controller successful.\n"); 2027 } 2028 2029 pthread_mutex_lock(&nvme_ctrlr->mutex); 2030 nvme_ctrlr->resetting = false; 2031 nvme_ctrlr->dont_retry = false; 2032 nvme_ctrlr->in_failover = false; 2033 2034 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2035 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2036 2037 if (ctrlr_op_cb_fn) { 2038 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2039 } 2040 2041 switch (op_after_reset) { 2042 case OP_COMPLETE_PENDING_DESTRUCT: 2043 nvme_ctrlr_unregister(nvme_ctrlr); 2044 break; 2045 case OP_DESTRUCT: 2046 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2047 remove_discovery_entry(nvme_ctrlr); 2048 break; 2049 case OP_DELAYED_RECONNECT: 2050 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2051 break; 2052 case OP_FAILOVER: 2053 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2054 break; 2055 default: 2056 break; 2057 } 2058 } 2059 2060 static void 2061 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2062 { 2063 pthread_mutex_lock(&nvme_ctrlr->mutex); 2064 if (!success) { 2065 /* Connecting the active trid failed. Set the next alternate trid to the 2066 * active trid if it exists. 2067 */ 2068 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2069 /* The next alternate trid exists and is ready to try. Try it now. */ 2070 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2071 2072 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2073 return; 2074 } 2075 2076 /* We came here if there is no alternate trid or if the next trid exists but 2077 * is not ready to try. We will try the active trid after reconnect_delay_sec 2078 * seconds if it is non-zero or at the next reset call otherwise. 2079 */ 2080 } else { 2081 /* Connecting the active trid succeeded. Clear the last failed time because it 2082 * means the trid is failed if its last failed time is non-zero. 2083 */ 2084 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2085 } 2086 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2087 2088 /* Make sure we clear any pending resets before returning. */ 2089 spdk_for_each_channel(nvme_ctrlr, 2090 bdev_nvme_complete_pending_resets, 2091 success ? NULL : (void *)0x1, 2092 _bdev_nvme_reset_ctrlr_complete); 2093 } 2094 2095 static void 2096 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2097 { 2098 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2099 2100 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2101 } 2102 2103 static void 2104 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2105 { 2106 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2107 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2108 struct nvme_qpair *nvme_qpair; 2109 2110 nvme_qpair = ctrlr_ch->qpair; 2111 assert(nvme_qpair != NULL); 2112 2113 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2114 2115 if (nvme_qpair->qpair != NULL) { 2116 if (nvme_qpair->ctrlr->dont_retry) { 2117 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2118 } 2119 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2120 2121 /* The current full reset sequence will move to the next 2122 * ctrlr_channel after the qpair is actually disconnected. 2123 */ 2124 assert(ctrlr_ch->reset_iter == NULL); 2125 ctrlr_ch->reset_iter = i; 2126 } else { 2127 spdk_for_each_channel_continue(i, 0); 2128 } 2129 } 2130 2131 static void 2132 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2133 { 2134 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2135 2136 if (status == 0) { 2137 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2138 } else { 2139 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
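* The completion callback below (bdev_nvme_reset_create_qpairs_failed) then finishes the
* reset sequence with failure.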
*/ 2140 spdk_for_each_channel(nvme_ctrlr, 2141 bdev_nvme_reset_destroy_qpair, 2142 NULL, 2143 bdev_nvme_reset_create_qpairs_failed); 2144 } 2145 } 2146 2147 static int 2148 bdev_nvme_reset_check_qpair_connected(void *ctx) 2149 { 2150 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2151 2152 if (ctrlr_ch->reset_iter == NULL) { 2153 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2154 assert(ctrlr_ch->connect_poller == NULL); 2155 assert(ctrlr_ch->qpair->qpair == NULL); 2156 return SPDK_POLLER_BUSY; 2157 } 2158 2159 assert(ctrlr_ch->qpair->qpair != NULL); 2160 2161 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2162 return SPDK_POLLER_BUSY; 2163 } 2164 2165 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2166 2167 /* The qpair completed connecting. Move to the next ctrlr_channel. */ 2168 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2169 ctrlr_ch->reset_iter = NULL; 2170 2171 if (!g_opts.disable_auto_failback) { 2172 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2173 } 2174 2175 return SPDK_POLLER_BUSY; 2176 } 2177 2178 static void 2179 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2180 { 2181 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2182 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2183 int rc; 2184 2185 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2186 if (rc == 0) { 2187 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2188 ctrlr_ch, 0); 2189 2190 /* The current full reset sequence will move to the next 2191 * ctrlr_channel after the qpair is actually connected. 2192 */ 2193 assert(ctrlr_ch->reset_iter == NULL); 2194 ctrlr_ch->reset_iter = i; 2195 } else { 2196 spdk_for_each_channel_continue(i, rc); 2197 } 2198 } 2199 2200 static int 2201 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2202 { 2203 struct nvme_ctrlr *nvme_ctrlr = arg; 2204 int rc = -ETIMEDOUT; 2205 2206 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2207 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2208 if (rc == -EAGAIN) { 2209 return SPDK_POLLER_BUSY; 2210 } 2211 } 2212 2213 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2214 if (rc == 0) { 2215 /* Recreate all of the I/O queue pairs */ 2216 spdk_for_each_channel(nvme_ctrlr, 2217 bdev_nvme_reset_create_qpair, 2218 NULL, 2219 bdev_nvme_reset_create_qpairs_done); 2220 } else { 2221 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2222 } 2223 return SPDK_POLLER_BUSY; 2224 } 2225 2226 static void 2227 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2228 { 2229 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2230 2231 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2232 assert(nvme_ctrlr->reset_detach_poller == NULL); 2233 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2234 nvme_ctrlr, 0); 2235 } 2236 2237 static void 2238 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2239 { 2240 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2241 2242 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2243 assert(status == 0); 2244 2245 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2246 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2247 } else { 2248 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2249 } 2250 } 2251 2252 static void 2253 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2254 { 2255
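/* Summary of the full reset flow (descriptive note, not part of the original code):
 * destroy every I/O qpair via spdk_for_each_channel() -> disconnect the ctrlr
 * (before the qpairs for PCIe, after them for fabrics; see _bdev_nvme_reset_ctrlr()
 * and bdev_nvme_reset_destroy_qpair_done()) -> reconnect asynchronously and poll with
 * bdev_nvme_reconnect_ctrlr_poll() -> recreate the qpairs -> bdev_nvme_reset_ctrlr_complete().
 */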
spdk_for_each_channel(nvme_ctrlr, 2256 bdev_nvme_reset_destroy_qpair, 2257 NULL, 2258 bdev_nvme_reset_destroy_qpair_done); 2259 } 2260 2261 static void 2262 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2263 { 2264 struct nvme_ctrlr *nvme_ctrlr = ctx; 2265 2266 assert(nvme_ctrlr->resetting == true); 2267 assert(nvme_ctrlr->thread == spdk_get_thread()); 2268 2269 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2270 2271 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2272 2273 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2274 } 2275 2276 static void 2277 _bdev_nvme_reset_ctrlr(void *ctx) 2278 { 2279 struct nvme_ctrlr *nvme_ctrlr = ctx; 2280 2281 assert(nvme_ctrlr->resetting == true); 2282 assert(nvme_ctrlr->thread == spdk_get_thread()); 2283 2284 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2285 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2286 } else { 2287 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2288 } 2289 } 2290 2291 static int 2292 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2293 { 2294 spdk_msg_fn msg_fn; 2295 2296 pthread_mutex_lock(&nvme_ctrlr->mutex); 2297 if (nvme_ctrlr->destruct) { 2298 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2299 return -ENXIO; 2300 } 2301 2302 if (nvme_ctrlr->resetting) { 2303 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2304 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2305 return -EBUSY; 2306 } 2307 2308 if (nvme_ctrlr->disabled) { 2309 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2310 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2311 return -EALREADY; 2312 } 2313 2314 nvme_ctrlr->resetting = true; 2315 nvme_ctrlr->dont_retry = true; 2316 2317 if (nvme_ctrlr->reconnect_is_delayed) { 2318 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2319 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2320 nvme_ctrlr->reconnect_is_delayed = false; 2321 } else { 2322 msg_fn = _bdev_nvme_reset_ctrlr; 2323 assert(nvme_ctrlr->reset_start_tsc == 0); 2324 } 2325 2326 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2327 2328 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2329 2330 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2331 return 0; 2332 } 2333 2334 static int 2335 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2336 { 2337 pthread_mutex_lock(&nvme_ctrlr->mutex); 2338 if (nvme_ctrlr->destruct) { 2339 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2340 return -ENXIO; 2341 } 2342 2343 if (nvme_ctrlr->resetting) { 2344 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2345 return -EBUSY; 2346 } 2347 2348 if (!nvme_ctrlr->disabled) { 2349 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2350 return -EALREADY; 2351 } 2352 2353 nvme_ctrlr->disabled = false; 2354 nvme_ctrlr->resetting = true; 2355 2356 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2357 2358 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2359 2360 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2361 return 0; 2362 } 2363 2364 static void 2365 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2366 { 2367 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2368 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2369 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2370 enum bdev_nvme_op_after_reset op_after_disable; 2371 2372 assert(nvme_ctrlr->thread == spdk_get_thread()); 2373 2374 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2375 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2376 2377 pthread_mutex_lock(&nvme_ctrlr->mutex); 2378 2379 
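/* Under the lock, finish the reset bookkeeping and mark the controller disabled;
 * adminq polling stays paused until bdev_nvme_enable_ctrlr() re-enables it via
 * bdev_nvme_reconnect_ctrlr_now().
 */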
nvme_ctrlr->resetting = false; 2380 nvme_ctrlr->dont_retry = false; 2381 2382 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2383 2384 nvme_ctrlr->disabled = true; 2385 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2386 2387 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2388 2389 if (ctrlr_op_cb_fn) { 2390 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2391 } 2392 2393 switch (op_after_disable) { 2394 case OP_COMPLETE_PENDING_DESTRUCT: 2395 nvme_ctrlr_unregister(nvme_ctrlr); 2396 break; 2397 default: 2398 break; 2399 } 2400 2401 } 2402 2403 static void 2404 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2405 { 2406 /* Make sure we clear any pending resets before returning. */ 2407 spdk_for_each_channel(nvme_ctrlr, 2408 bdev_nvme_complete_pending_resets, 2409 NULL, 2410 _bdev_nvme_disable_ctrlr_complete); 2411 } 2412 2413 static void 2414 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2415 { 2416 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2417 2418 assert(status == 0); 2419 2420 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2421 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2422 } else { 2423 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2424 } 2425 } 2426 2427 static void 2428 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2429 { 2430 spdk_for_each_channel(nvme_ctrlr, 2431 bdev_nvme_reset_destroy_qpair, 2432 NULL, 2433 bdev_nvme_disable_destroy_qpairs_done); 2434 } 2435 2436 static void 2437 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2438 { 2439 struct nvme_ctrlr *nvme_ctrlr = ctx; 2440 2441 assert(nvme_ctrlr->resetting == true); 2442 assert(nvme_ctrlr->thread == spdk_get_thread()); 2443 2444 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2445 2446 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2447 } 2448 2449 static void 2450 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2451 { 2452 struct nvme_ctrlr *nvme_ctrlr = ctx; 2453 2454 assert(nvme_ctrlr->resetting == true); 2455 assert(nvme_ctrlr->thread == spdk_get_thread()); 2456 2457 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2458 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2459 } else { 2460 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2461 } 2462 } 2463 2464 static int 2465 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2466 { 2467 spdk_msg_fn msg_fn; 2468 2469 pthread_mutex_lock(&nvme_ctrlr->mutex); 2470 if (nvme_ctrlr->destruct) { 2471 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2472 return -ENXIO; 2473 } 2474 2475 if (nvme_ctrlr->resetting) { 2476 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2477 return -EBUSY; 2478 } 2479 2480 if (nvme_ctrlr->disabled) { 2481 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2482 return -EALREADY; 2483 } 2484 2485 nvme_ctrlr->resetting = true; 2486 nvme_ctrlr->dont_retry = true; 2487 2488 if (nvme_ctrlr->reconnect_is_delayed) { 2489 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2490 nvme_ctrlr->reconnect_is_delayed = false; 2491 } else { 2492 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2493 } 2494 2495 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2496 2497 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2498 2499 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2500 return 0; 2501 } 2502 2503 static int 2504 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2505 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2506 { 2507 int rc; 2508 2509 switch (op) { 2510 case 
NVME_CTRLR_OP_RESET: 2511 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2512 break; 2513 case NVME_CTRLR_OP_ENABLE: 2514 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2515 break; 2516 case NVME_CTRLR_OP_DISABLE: 2517 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2518 break; 2519 default: 2520 rc = -EINVAL; 2521 break; 2522 } 2523 2524 if (rc == 0) { 2525 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2526 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2527 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2528 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2529 } 2530 return rc; 2531 } 2532 2533 struct nvme_ctrlr_op_rpc_ctx { 2534 struct nvme_ctrlr *nvme_ctrlr; 2535 struct spdk_thread *orig_thread; 2536 enum nvme_ctrlr_op op; 2537 int rc; 2538 bdev_nvme_ctrlr_op_cb cb_fn; 2539 void *cb_arg; 2540 }; 2541 2542 static void 2543 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2544 { 2545 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2546 2547 assert(ctx != NULL); 2548 assert(ctx->cb_fn != NULL); 2549 2550 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2551 2552 free(ctx); 2553 } 2554 2555 static void 2556 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2557 { 2558 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2559 2560 ctx->rc = rc; 2561 2562 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2563 } 2564 2565 void 2566 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2567 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2568 { 2569 struct nvme_ctrlr_op_rpc_ctx *ctx; 2570 int rc; 2571 2572 assert(cb_fn != NULL); 2573 2574 ctx = calloc(1, sizeof(*ctx)); 2575 if (ctx == NULL) { 2576 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2577 cb_fn(cb_arg, -ENOMEM); 2578 return; 2579 } 2580 2581 ctx->orig_thread = spdk_get_thread(); 2582 ctx->cb_fn = cb_fn; 2583 ctx->cb_arg = cb_arg; 2584 2585 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2586 if (rc == 0) { 2587 return; 2588 } else if (rc == -EALREADY) { 2589 rc = 0; 2590 } 2591 2592 nvme_ctrlr_op_rpc_complete(ctx, rc); 2593 } 2594 2595 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2596 2597 static void 2598 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2599 { 2600 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2601 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2602 int rc; 2603 2604 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2605 ctx->nvme_ctrlr = NULL; 2606 2607 if (ctx->rc != 0) { 2608 goto complete; 2609 } 2610 2611 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2612 if (next_nvme_ctrlr == NULL) { 2613 goto complete; 2614 } 2615 2616 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2617 if (rc == 0) { 2618 ctx->nvme_ctrlr = next_nvme_ctrlr; 2619 return; 2620 } else if (rc == -EALREADY) { 2621 ctx->nvme_ctrlr = next_nvme_ctrlr; 2622 rc = 0; 2623 } 2624 2625 ctx->rc = rc; 2626 2627 complete: 2628 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2629 free(ctx); 2630 } 2631 2632 static void 2633 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2634 { 2635 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2636 2637 ctx->rc = rc; 2638 2639 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2640 } 2641 2642 void 2643 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2644 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2645 { 2646 struct nvme_ctrlr_op_rpc_ctx *ctx; 2647 struct nvme_ctrlr *nvme_ctrlr; 2648 int rc; 2649 2650 assert(cb_fn != NULL); 2651 2652 ctx = calloc(1, sizeof(*ctx)); 2653 if (ctx == NULL) { 2654 SPDK_ERRLOG("Failed to allocate 
nvme_ctrlr_op_rpc_ctx.\n"); 2655 cb_fn(cb_arg, -ENOMEM); 2656 return; 2657 } 2658 2659 ctx->orig_thread = spdk_get_thread(); 2660 ctx->op = op; 2661 ctx->cb_fn = cb_fn; 2662 ctx->cb_arg = cb_arg; 2663 2664 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2665 assert(nvme_ctrlr != NULL); 2666 2667 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2668 if (rc == 0) { 2669 ctx->nvme_ctrlr = nvme_ctrlr; 2670 return; 2671 } else if (rc == -EALREADY) { 2672 ctx->nvme_ctrlr = nvme_ctrlr; 2673 rc = 0; 2674 } 2675 2676 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2677 } 2678 2679 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2680 2681 static void 2682 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2683 { 2684 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2685 enum spdk_bdev_io_status io_status; 2686 2687 if (bio->cpl.cdw0 == 0) { 2688 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2689 } else { 2690 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2691 } 2692 2693 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2694 } 2695 2696 static void 2697 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2698 { 2699 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2700 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2701 2702 bdev_nvme_abort_retry_ios(nbdev_ch); 2703 2704 spdk_for_each_channel_continue(i, 0); 2705 } 2706 2707 static void 2708 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2709 { 2710 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2711 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2712 2713 /* Abort all queued I/Os for retry. */ 2714 spdk_for_each_channel(nbdev, 2715 bdev_nvme_abort_bdev_channel, 2716 bio, 2717 _bdev_nvme_reset_io_complete); 2718 } 2719 2720 static void 2721 _bdev_nvme_reset_io_continue(void *ctx) 2722 { 2723 struct nvme_bdev_io *bio = ctx; 2724 struct nvme_io_path *prev_io_path, *next_io_path; 2725 int rc; 2726 2727 prev_io_path = bio->io_path; 2728 bio->io_path = NULL; 2729 2730 if (bio->cpl.cdw0 != 0) { 2731 goto complete; 2732 } 2733 2734 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2735 if (next_io_path == NULL) { 2736 goto complete; 2737 } 2738 2739 rc = _bdev_nvme_reset_io(next_io_path, bio); 2740 if (rc == 0) { 2741 return; 2742 } 2743 2744 bio->cpl.cdw0 = 1; 2745 2746 complete: 2747 bdev_nvme_reset_io_complete(bio); 2748 } 2749 2750 static void 2751 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2752 { 2753 struct nvme_bdev_io *bio = cb_arg; 2754 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2755 2756 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2757 2758 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2759 } 2760 2761 static int 2762 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2763 { 2764 struct nvme_ctrlr_channel *ctrlr_ch; 2765 struct spdk_bdev_io *bdev_io; 2766 int rc; 2767 2768 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2769 bdev_nvme_reset_io_continue, bio); 2770 if (rc == 0) { 2771 assert(bio->io_path == NULL); 2772 bio->io_path = io_path; 2773 } else if (rc == -EBUSY) { 2774 ctrlr_ch = io_path->qpair->ctrlr_ch; 2775 assert(ctrlr_ch != NULL); 2776 /* 2777 * Reset call is queued only if it is from the app framework. This is on purpose so that 2778 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2779 * upper level. 
If they are in the middle of a reset, we won't try to schedule another one. 2780 */ 2781 bdev_io = spdk_bdev_io_from_ctx(bio); 2782 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2783 rc = 0; 2784 } 2785 2786 return rc; 2787 } 2788 2789 static void 2790 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2791 { 2792 struct nvme_io_path *io_path; 2793 int rc; 2794 2795 bio->cpl.cdw0 = 0; 2796 2797 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2798 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2799 assert(io_path != NULL); 2800 2801 rc = _bdev_nvme_reset_io(io_path, bio); 2802 if (rc != 0) { 2803 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2804 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2805 } 2806 } 2807 2808 static int 2809 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2810 { 2811 if (nvme_ctrlr->destruct) { 2812 /* Don't bother resetting if the controller is in the process of being destructed. */ 2813 return -ENXIO; 2814 } 2815 2816 if (nvme_ctrlr->resetting) { 2817 if (!nvme_ctrlr->in_failover) { 2818 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2819 2820 /* Defer failover until reset completes. */ 2821 nvme_ctrlr->pending_failover = true; 2822 return -EINPROGRESS; 2823 } else { 2824 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2825 return -EBUSY; 2826 } 2827 } 2828 2829 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2830 2831 if (nvme_ctrlr->reconnect_is_delayed) { 2832 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2833 2834 /* We rely on the next reconnect for the failover. */ 2835 return -EALREADY; 2836 } 2837 2838 if (nvme_ctrlr->disabled) { 2839 SPDK_NOTICELOG("Controller is disabled.\n"); 2840 2841 /* We rely on the enablement for the failover. 
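* (bdev_nvme_enable_ctrlr() reconnects using the trid that bdev_nvme_failover_trid()
* selected above, so no explicit reset is needed here.)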
*/ 2842 return -EALREADY; 2843 } 2844 2845 nvme_ctrlr->resetting = true; 2846 nvme_ctrlr->in_failover = true; 2847 2848 assert(nvme_ctrlr->reset_start_tsc == 0); 2849 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2850 2851 return 0; 2852 } 2853 2854 static int 2855 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2856 { 2857 int rc; 2858 2859 pthread_mutex_lock(&nvme_ctrlr->mutex); 2860 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2861 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2862 2863 if (rc == 0) { 2864 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2865 } else if (rc == -EALREADY) { 2866 rc = 0; 2867 } 2868 2869 return rc; 2870 } 2871 2872 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2873 uint64_t num_blocks); 2874 2875 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2876 uint64_t num_blocks); 2877 2878 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2879 uint64_t src_offset_blocks, 2880 uint64_t num_blocks); 2881 2882 static void 2883 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2884 bool success) 2885 { 2886 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2887 struct spdk_bdev *bdev = bdev_io->bdev; 2888 int ret; 2889 2890 if (!success) { 2891 ret = -EINVAL; 2892 goto exit; 2893 } 2894 2895 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2896 ret = -ENXIO; 2897 goto exit; 2898 } 2899 2900 ret = bdev_nvme_readv(bio, 2901 bdev_io->u.bdev.iovs, 2902 bdev_io->u.bdev.iovcnt, 2903 bdev_io->u.bdev.md_buf, 2904 bdev_io->u.bdev.num_blocks, 2905 bdev_io->u.bdev.offset_blocks, 2906 bdev->dif_check_flags, 2907 bdev_io->u.bdev.memory_domain, 2908 bdev_io->u.bdev.memory_domain_ctx, 2909 bdev_io->u.bdev.accel_sequence); 2910 2911 exit: 2912 if (spdk_unlikely(ret != 0)) { 2913 bdev_nvme_io_complete(bio, ret); 2914 } 2915 } 2916 2917 static inline void 2918 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2919 { 2920 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2921 struct spdk_bdev *bdev = bdev_io->bdev; 2922 struct nvme_bdev_io *nbdev_io_to_abort; 2923 int rc = 0; 2924 2925 switch (bdev_io->type) { 2926 case SPDK_BDEV_IO_TYPE_READ: 2927 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2928 rc = bdev_nvme_readv(nbdev_io, 2929 bdev_io->u.bdev.iovs, 2930 bdev_io->u.bdev.iovcnt, 2931 bdev_io->u.bdev.md_buf, 2932 bdev_io->u.bdev.num_blocks, 2933 bdev_io->u.bdev.offset_blocks, 2934 bdev->dif_check_flags, 2935 bdev_io->u.bdev.memory_domain, 2936 bdev_io->u.bdev.memory_domain_ctx, 2937 bdev_io->u.bdev.accel_sequence); 2938 } else { 2939 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2940 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2941 rc = 0; 2942 } 2943 break; 2944 case SPDK_BDEV_IO_TYPE_WRITE: 2945 rc = bdev_nvme_writev(nbdev_io, 2946 bdev_io->u.bdev.iovs, 2947 bdev_io->u.bdev.iovcnt, 2948 bdev_io->u.bdev.md_buf, 2949 bdev_io->u.bdev.num_blocks, 2950 bdev_io->u.bdev.offset_blocks, 2951 bdev->dif_check_flags, 2952 bdev_io->u.bdev.memory_domain, 2953 bdev_io->u.bdev.memory_domain_ctx, 2954 bdev_io->u.bdev.accel_sequence); 2955 break; 2956 case SPDK_BDEV_IO_TYPE_COMPARE: 2957 rc = bdev_nvme_comparev(nbdev_io, 2958 bdev_io->u.bdev.iovs, 2959 bdev_io->u.bdev.iovcnt, 2960 bdev_io->u.bdev.md_buf, 2961 bdev_io->u.bdev.num_blocks, 2962 bdev_io->u.bdev.offset_blocks, 2963 bdev->dif_check_flags); 
2964 break; 2965 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2966 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2967 bdev_io->u.bdev.iovs, 2968 bdev_io->u.bdev.iovcnt, 2969 bdev_io->u.bdev.fused_iovs, 2970 bdev_io->u.bdev.fused_iovcnt, 2971 bdev_io->u.bdev.md_buf, 2972 bdev_io->u.bdev.num_blocks, 2973 bdev_io->u.bdev.offset_blocks, 2974 bdev->dif_check_flags); 2975 break; 2976 case SPDK_BDEV_IO_TYPE_UNMAP: 2977 rc = bdev_nvme_unmap(nbdev_io, 2978 bdev_io->u.bdev.offset_blocks, 2979 bdev_io->u.bdev.num_blocks); 2980 break; 2981 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2982 rc = bdev_nvme_write_zeroes(nbdev_io, 2983 bdev_io->u.bdev.offset_blocks, 2984 bdev_io->u.bdev.num_blocks); 2985 break; 2986 case SPDK_BDEV_IO_TYPE_RESET: 2987 nbdev_io->io_path = NULL; 2988 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2989 return; 2990 2991 case SPDK_BDEV_IO_TYPE_FLUSH: 2992 bdev_nvme_io_complete(nbdev_io, 0); 2993 return; 2994 2995 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2996 rc = bdev_nvme_zone_appendv(nbdev_io, 2997 bdev_io->u.bdev.iovs, 2998 bdev_io->u.bdev.iovcnt, 2999 bdev_io->u.bdev.md_buf, 3000 bdev_io->u.bdev.num_blocks, 3001 bdev_io->u.bdev.offset_blocks, 3002 bdev->dif_check_flags); 3003 break; 3004 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3005 rc = bdev_nvme_get_zone_info(nbdev_io, 3006 bdev_io->u.zone_mgmt.zone_id, 3007 bdev_io->u.zone_mgmt.num_zones, 3008 bdev_io->u.zone_mgmt.buf); 3009 break; 3010 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3011 rc = bdev_nvme_zone_management(nbdev_io, 3012 bdev_io->u.zone_mgmt.zone_id, 3013 bdev_io->u.zone_mgmt.zone_action); 3014 break; 3015 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3016 nbdev_io->io_path = NULL; 3017 bdev_nvme_admin_passthru(nbdev_ch, 3018 nbdev_io, 3019 &bdev_io->u.nvme_passthru.cmd, 3020 bdev_io->u.nvme_passthru.buf, 3021 bdev_io->u.nvme_passthru.nbytes); 3022 return; 3023 3024 case SPDK_BDEV_IO_TYPE_NVME_IO: 3025 rc = bdev_nvme_io_passthru(nbdev_io, 3026 &bdev_io->u.nvme_passthru.cmd, 3027 bdev_io->u.nvme_passthru.buf, 3028 bdev_io->u.nvme_passthru.nbytes); 3029 break; 3030 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3031 rc = bdev_nvme_io_passthru_md(nbdev_io, 3032 &bdev_io->u.nvme_passthru.cmd, 3033 bdev_io->u.nvme_passthru.buf, 3034 bdev_io->u.nvme_passthru.nbytes, 3035 bdev_io->u.nvme_passthru.md_buf, 3036 bdev_io->u.nvme_passthru.md_len); 3037 break; 3038 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3039 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3040 &bdev_io->u.nvme_passthru.cmd, 3041 bdev_io->u.nvme_passthru.iovs, 3042 bdev_io->u.nvme_passthru.iovcnt, 3043 bdev_io->u.nvme_passthru.nbytes, 3044 bdev_io->u.nvme_passthru.md_buf, 3045 bdev_io->u.nvme_passthru.md_len); 3046 break; 3047 case SPDK_BDEV_IO_TYPE_ABORT: 3048 nbdev_io->io_path = NULL; 3049 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3050 bdev_nvme_abort(nbdev_ch, 3051 nbdev_io, 3052 nbdev_io_to_abort); 3053 return; 3054 3055 case SPDK_BDEV_IO_TYPE_COPY: 3056 rc = bdev_nvme_copy(nbdev_io, 3057 bdev_io->u.bdev.offset_blocks, 3058 bdev_io->u.bdev.copy.src_offset_blocks, 3059 bdev_io->u.bdev.num_blocks); 3060 break; 3061 default: 3062 rc = -EINVAL; 3063 break; 3064 } 3065 3066 if (spdk_unlikely(rc != 0)) { 3067 bdev_nvme_io_complete(nbdev_io, rc); 3068 } 3069 } 3070 3071 static void 3072 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3073 { 3074 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3075 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3076 3077 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 
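/* A zero submit_tsc means this is the first submission of the I/O; a retried I/O
 * still carries its previous timestamp and is restamped in the else branch below.
 */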
3078 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3079 } else { 3080 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3081 * We need to update submit_tsc here. 3082 */ 3083 nbdev_io->submit_tsc = spdk_get_ticks(); 3084 } 3085 3086 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3087 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3088 if (spdk_unlikely(!nbdev_io->io_path)) { 3089 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3090 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3091 return; 3092 } 3093 3094 /* Admin commands do not use the optimal I/O path. 3095 * Simply fall through even if it is not found. 3096 */ 3097 } 3098 3099 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3100 } 3101 3102 static bool 3103 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3104 { 3105 struct nvme_bdev *nbdev = ctx; 3106 struct nvme_ns *nvme_ns; 3107 struct spdk_nvme_ns *ns; 3108 struct spdk_nvme_ctrlr *ctrlr; 3109 const struct spdk_nvme_ctrlr_data *cdata; 3110 3111 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3112 assert(nvme_ns != NULL); 3113 ns = nvme_ns->ns; 3114 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3115 3116 switch (io_type) { 3117 case SPDK_BDEV_IO_TYPE_READ: 3118 case SPDK_BDEV_IO_TYPE_WRITE: 3119 case SPDK_BDEV_IO_TYPE_RESET: 3120 case SPDK_BDEV_IO_TYPE_FLUSH: 3121 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3122 case SPDK_BDEV_IO_TYPE_NVME_IO: 3123 case SPDK_BDEV_IO_TYPE_ABORT: 3124 return true; 3125 3126 case SPDK_BDEV_IO_TYPE_COMPARE: 3127 return spdk_nvme_ns_supports_compare(ns); 3128 3129 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3130 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3131 3132 case SPDK_BDEV_IO_TYPE_UNMAP: 3133 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3134 return cdata->oncs.dsm; 3135 3136 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3137 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3138 return cdata->oncs.write_zeroes; 3139 3140 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3141 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3142 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3143 return true; 3144 } 3145 return false; 3146 3147 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3148 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3149 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3150 3151 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3152 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3153 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3154 3155 case SPDK_BDEV_IO_TYPE_COPY: 3156 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3157 return cdata->oncs.copy; 3158 3159 default: 3160 return false; 3161 } 3162 } 3163 3164 static int 3165 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3166 { 3167 struct nvme_qpair *nvme_qpair; 3168 struct spdk_io_channel *pg_ch; 3169 int rc; 3170 3171 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3172 if (!nvme_qpair) { 3173 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3174 return -1; 3175 } 3176 3177 TAILQ_INIT(&nvme_qpair->io_path_list); 3178 3179 nvme_qpair->ctrlr = nvme_ctrlr; 3180 nvme_qpair->ctrlr_ch = ctrlr_ch; 3181 3182 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3183 if (!pg_ch) { 3184 free(nvme_qpair); 3185 return -1; 3186 } 3187 3188 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3189 3190 #ifdef SPDK_CONFIG_VTUNE 3191 nvme_qpair->group->collect_spin_stat = true; 3192 #else 3193 nvme_qpair->group->collect_spin_stat = false; 3194 #endif 3195 3196 if (!nvme_ctrlr->disabled) { 3197 /* If a nvme_ctrlr is disabled, don't try to 
create qpair for it. Qpair will 3198 * be created when it's enabled. 3199 */ 3200 rc = bdev_nvme_create_qpair(nvme_qpair); 3201 if (rc != 0) { 3202 /* nvme_ctrlr can't create IO qpair if connection is down. 3203 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3204 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3205 * submitted IO will be queued until IO qpair is successfully created. 3206 * 3207 * Hence, if both are satisfied, ignore the failure. 3208 */ 3209 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3210 spdk_put_io_channel(pg_ch); 3211 free(nvme_qpair); 3212 return rc; 3213 } 3214 } 3215 } 3216 3217 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3218 3219 ctrlr_ch->qpair = nvme_qpair; 3220 3221 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3222 nvme_qpair->ctrlr->ref++; 3223 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3224 3225 return 0; 3226 } 3227 3228 static int 3229 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3230 { 3231 struct nvme_ctrlr *nvme_ctrlr = io_device; 3232 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3233 3234 TAILQ_INIT(&ctrlr_ch->pending_resets); 3235 3236 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3237 } 3238 3239 static void 3240 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3241 { 3242 struct nvme_io_path *io_path, *next; 3243 3244 assert(nvme_qpair->group != NULL); 3245 3246 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3247 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3248 nvme_io_path_free(io_path); 3249 } 3250 3251 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3252 3253 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3254 3255 nvme_ctrlr_release(nvme_qpair->ctrlr); 3256 3257 free(nvme_qpair); 3258 } 3259 3260 static void 3261 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3262 { 3263 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3264 struct nvme_qpair *nvme_qpair; 3265 3266 nvme_qpair = ctrlr_ch->qpair; 3267 assert(nvme_qpair != NULL); 3268 3269 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3270 3271 if (nvme_qpair->qpair != NULL) { 3272 if (ctrlr_ch->reset_iter == NULL) { 3273 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3274 } else { 3275 /* Skip current ctrlr_channel in a full reset sequence because 3276 * it is being deleted now. The qpair is already being disconnected. 3277 * We do not have to restart disconnecting it. 3278 */ 3279 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3280 } 3281 3282 /* We cannot release a reference to the poll group now. 3283 * The qpair may be disconnected asynchronously later. 3284 * We need to poll it until it is actually disconnected. 3285 * Just detach the qpair from the deleting ctrlr_channel. 
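* The nvme_qpair itself is freed later by nvme_qpair_delete(), once the poll group
* observes that the disconnect has completed.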
3286 */ 3287 nvme_qpair->ctrlr_ch = NULL; 3288 } else { 3289 assert(ctrlr_ch->reset_iter == NULL); 3290 3291 nvme_qpair_delete(nvme_qpair); 3292 } 3293 } 3294 3295 static inline struct spdk_io_channel * 3296 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3297 { 3298 if (spdk_unlikely(!group->accel_channel)) { 3299 group->accel_channel = spdk_accel_get_io_channel(); 3300 if (!group->accel_channel) { 3301 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3302 group); 3303 return NULL; 3304 } 3305 } 3306 3307 return group->accel_channel; 3308 } 3309 3310 static void 3311 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3312 uint32_t iov_cnt, uint32_t seed, 3313 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3314 { 3315 struct spdk_io_channel *accel_ch; 3316 struct nvme_poll_group *group = ctx; 3317 int rc; 3318 3319 assert(cb_fn != NULL); 3320 3321 accel_ch = bdev_nvme_get_accel_channel(group); 3322 if (spdk_unlikely(accel_ch == NULL)) { 3323 cb_fn(cb_arg, -ENOMEM); 3324 return; 3325 } 3326 3327 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3328 if (rc) { 3329 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3330 if (rc == -ENOMEM || rc == -EINVAL) { 3331 cb_fn(cb_arg, rc); 3332 } 3333 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3334 } 3335 } 3336 3337 static void 3338 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3339 { 3340 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3341 } 3342 3343 static void 3344 bdev_nvme_abort_sequence(void *seq) 3345 { 3346 spdk_accel_sequence_abort(seq); 3347 } 3348 3349 static void 3350 bdev_nvme_reverse_sequence(void *seq) 3351 { 3352 spdk_accel_sequence_reverse(seq); 3353 } 3354 3355 static int 3356 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3357 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3358 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3359 { 3360 struct spdk_io_channel *ch; 3361 struct nvme_poll_group *group = ctx; 3362 3363 ch = bdev_nvme_get_accel_channel(group); 3364 if (spdk_unlikely(ch == NULL)) { 3365 return -ENOMEM; 3366 } 3367 3368 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3369 domain, domain_ctx, seed, cb_fn, cb_arg); 3370 } 3371 3372 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3373 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3374 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3375 .append_crc32c = bdev_nvme_append_crc32c, 3376 .finish_sequence = bdev_nvme_finish_sequence, 3377 .reverse_sequence = bdev_nvme_reverse_sequence, 3378 .abort_sequence = bdev_nvme_abort_sequence, 3379 }; 3380 3381 static int 3382 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3383 { 3384 struct nvme_poll_group *group = ctx_buf; 3385 3386 TAILQ_INIT(&group->qpair_list); 3387 3388 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3389 if (group->group == NULL) { 3390 return -1; 3391 } 3392 3393 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3394 3395 if (group->poller == NULL) { 3396 spdk_nvme_poll_group_destroy(group->group); 3397 return -1; 3398 } 3399 3400 return 0; 3401 } 3402 3403 static void 3404 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3405 { 3406 struct 
nvme_poll_group *group = ctx_buf; 3407 3408 assert(TAILQ_EMPTY(&group->qpair_list)); 3409 3410 if (group->accel_channel) { 3411 spdk_put_io_channel(group->accel_channel); 3412 } 3413 3414 spdk_poller_unregister(&group->poller); 3415 if (spdk_nvme_poll_group_destroy(group->group)) { 3416 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3417 assert(false); 3418 } 3419 } 3420 3421 static struct spdk_io_channel * 3422 bdev_nvme_get_io_channel(void *ctx) 3423 { 3424 struct nvme_bdev *nvme_bdev = ctx; 3425 3426 return spdk_get_io_channel(nvme_bdev); 3427 } 3428 3429 static void * 3430 bdev_nvme_get_module_ctx(void *ctx) 3431 { 3432 struct nvme_bdev *nvme_bdev = ctx; 3433 struct nvme_ns *nvme_ns; 3434 3435 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3436 return NULL; 3437 } 3438 3439 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3440 if (!nvme_ns) { 3441 return NULL; 3442 } 3443 3444 return nvme_ns->ns; 3445 } 3446 3447 static const char * 3448 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3449 { 3450 switch (ana_state) { 3451 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3452 return "optimized"; 3453 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3454 return "non_optimized"; 3455 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3456 return "inaccessible"; 3457 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3458 return "persistent_loss"; 3459 case SPDK_NVME_ANA_CHANGE_STATE: 3460 return "change"; 3461 default: 3462 return NULL; 3463 } 3464 } 3465 3466 static int 3467 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3468 { 3469 struct spdk_memory_domain **_domains = NULL; 3470 struct nvme_bdev *nbdev = ctx; 3471 struct nvme_ns *nvme_ns; 3472 int i = 0, _array_size = array_size; 3473 int rc = 0; 3474 3475 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3476 if (domains && array_size >= i) { 3477 _domains = &domains[i]; 3478 } else { 3479 _domains = NULL; 3480 } 3481 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3482 if (rc > 0) { 3483 i += rc; 3484 if (_array_size >= rc) { 3485 _array_size -= rc; 3486 } else { 3487 _array_size = 0; 3488 } 3489 } else if (rc < 0) { 3490 return rc; 3491 } 3492 } 3493 3494 return i; 3495 } 3496 3497 static const char * 3498 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3499 { 3500 if (nvme_ctrlr->destruct) { 3501 return "deleting"; 3502 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3503 return "failed"; 3504 } else if (nvme_ctrlr->resetting) { 3505 return "resetting"; 3506 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3507 return "reconnect_is_delayed"; 3508 } else if (nvme_ctrlr->disabled) { 3509 return "disabled"; 3510 } else { 3511 return "enabled"; 3512 } 3513 } 3514 3515 void 3516 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3517 { 3518 struct spdk_nvme_transport_id *trid; 3519 const struct spdk_nvme_ctrlr_opts *opts; 3520 const struct spdk_nvme_ctrlr_data *cdata; 3521 struct nvme_path_id *path_id; 3522 3523 spdk_json_write_object_begin(w); 3524 3525 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3526 3527 #ifdef SPDK_CONFIG_NVME_CUSE 3528 size_t cuse_name_size = 128; 3529 char cuse_name[cuse_name_size]; 3530 3531 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3532 if (rc == 0) { 3533 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3534 } 3535 #endif 3536 trid = &nvme_ctrlr->active_path_id->trid; 3537 
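/* For illustration only (values are examples, not from the original code), the object
 * written by this function looks roughly like:
 *   { "state": "enabled", "trid": { ... }, "alternate_trids": [ { ... } ],
 *     "cntlid": 1, "host": { "nqn": "...", "addr": "...", "svcid": "..." } }
 * where "cuse_device" and "alternate_trids" appear only when applicable.
 */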
spdk_json_write_named_object_begin(w, "trid"); 3538 nvme_bdev_dump_trid_json(trid, w); 3539 spdk_json_write_object_end(w); 3540 3541 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3542 if (path_id != NULL) { 3543 spdk_json_write_named_array_begin(w, "alternate_trids"); 3544 do { 3545 trid = &path_id->trid; 3546 spdk_json_write_object_begin(w); 3547 nvme_bdev_dump_trid_json(trid, w); 3548 spdk_json_write_object_end(w); 3549 3550 path_id = TAILQ_NEXT(path_id, link); 3551 } while (path_id != NULL); 3552 spdk_json_write_array_end(w); 3553 } 3554 3555 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3556 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3557 3558 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3559 spdk_json_write_named_object_begin(w, "host"); 3560 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3561 spdk_json_write_named_string(w, "addr", opts->src_addr); 3562 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3563 spdk_json_write_object_end(w); 3564 3565 spdk_json_write_object_end(w); 3566 } 3567 3568 static void 3569 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3570 struct nvme_ns *nvme_ns) 3571 { 3572 struct spdk_nvme_ns *ns; 3573 struct spdk_nvme_ctrlr *ctrlr; 3574 const struct spdk_nvme_ctrlr_data *cdata; 3575 const struct spdk_nvme_transport_id *trid; 3576 union spdk_nvme_vs_register vs; 3577 const struct spdk_nvme_ns_data *nsdata; 3578 char buf[128]; 3579 3580 ns = nvme_ns->ns; 3581 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3582 3583 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3584 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3585 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3586 3587 spdk_json_write_object_begin(w); 3588 3589 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3590 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3591 } 3592 3593 spdk_json_write_named_object_begin(w, "trid"); 3594 3595 nvme_bdev_dump_trid_json(trid, w); 3596 3597 spdk_json_write_object_end(w); 3598 3599 #ifdef SPDK_CONFIG_NVME_CUSE 3600 size_t cuse_name_size = 128; 3601 char cuse_name[cuse_name_size]; 3602 3603 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3604 cuse_name, &cuse_name_size); 3605 if (rc == 0) { 3606 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3607 } 3608 #endif 3609 3610 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3611 3612 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3613 3614 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3615 3616 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3617 spdk_str_trim(buf); 3618 spdk_json_write_named_string(w, "model_number", buf); 3619 3620 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3621 spdk_str_trim(buf); 3622 spdk_json_write_named_string(w, "serial_number", buf); 3623 3624 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3625 spdk_str_trim(buf); 3626 spdk_json_write_named_string(w, "firmware_revision", buf); 3627 3628 if (cdata->subnqn[0] != '\0') { 3629 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3630 } 3631 3632 spdk_json_write_named_object_begin(w, "oacs"); 3633 3634 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3635 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3636 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3637 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3638 3639 spdk_json_write_object_end(w); 3640 3641 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
3642 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3643 3644 spdk_json_write_object_end(w); 3645 3646 spdk_json_write_named_object_begin(w, "vs"); 3647 3648 spdk_json_write_name(w, "nvme_version"); 3649 if (vs.bits.ter) { 3650 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3651 } else { 3652 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3653 } 3654 3655 spdk_json_write_object_end(w); 3656 3657 nsdata = spdk_nvme_ns_get_data(ns); 3658 3659 spdk_json_write_named_object_begin(w, "ns_data"); 3660 3661 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3662 3663 if (cdata->cmic.ana_reporting) { 3664 spdk_json_write_named_string(w, "ana_state", 3665 _nvme_ana_state_str(nvme_ns->ana_state)); 3666 } 3667 3668 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3669 3670 spdk_json_write_object_end(w); 3671 3672 if (cdata->oacs.security) { 3673 spdk_json_write_named_object_begin(w, "security"); 3674 3675 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3676 3677 spdk_json_write_object_end(w); 3678 } 3679 3680 spdk_json_write_object_end(w); 3681 } 3682 3683 static const char * 3684 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3685 { 3686 switch (nbdev->mp_policy) { 3687 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3688 return "active_passive"; 3689 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3690 return "active_active"; 3691 default: 3692 assert(false); 3693 return "invalid"; 3694 } 3695 } 3696 3697 static int 3698 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3699 { 3700 struct nvme_bdev *nvme_bdev = ctx; 3701 struct nvme_ns *nvme_ns; 3702 3703 pthread_mutex_lock(&nvme_bdev->mutex); 3704 spdk_json_write_named_array_begin(w, "nvme"); 3705 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3706 nvme_namespace_info_json(w, nvme_ns); 3707 } 3708 spdk_json_write_array_end(w); 3709 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3710 pthread_mutex_unlock(&nvme_bdev->mutex); 3711 3712 return 0; 3713 } 3714 3715 static void 3716 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3717 { 3718 /* No config per bdev needed */ 3719 } 3720 3721 static uint64_t 3722 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3723 { 3724 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3725 struct nvme_io_path *io_path; 3726 struct nvme_poll_group *group; 3727 uint64_t spin_time = 0; 3728 3729 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3730 group = io_path->qpair->group; 3731 3732 if (!group || !group->collect_spin_stat) { 3733 continue; 3734 } 3735 3736 if (group->end_ticks != 0) { 3737 group->spin_ticks += (group->end_ticks - group->start_ticks); 3738 group->end_ticks = 0; 3739 } 3740 3741 spin_time += group->spin_ticks; 3742 group->start_ticks = 0; 3743 group->spin_ticks = 0; 3744 } 3745 3746 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3747 } 3748 3749 static void 3750 bdev_nvme_reset_device_stat(void *ctx) 3751 { 3752 struct nvme_bdev *nbdev = ctx; 3753 3754 if (nbdev->err_stat != NULL) { 3755 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3756 } 3757 } 3758 3759 /* JSON string should be lowercases and underscore delimited string. 
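* For example, a status string such as "INVALID FIELD" becomes "invalid_field", and
* "ABORTED - BY REQUEST" becomes "aborted_by_request".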
*/ 3760 static void 3761 bdev_nvme_format_nvme_status(char *dst, const char *src) 3762 { 3763 char tmp[256]; 3764 3765 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3766 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3767 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3768 spdk_strlwr(dst); 3769 } 3770 3771 static void 3772 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3773 { 3774 struct nvme_bdev *nbdev = ctx; 3775 struct spdk_nvme_status status = {}; 3776 uint16_t sct, sc; 3777 char status_json[256]; 3778 const char *status_str; 3779 3780 if (nbdev->err_stat == NULL) { 3781 return; 3782 } 3783 3784 spdk_json_write_named_object_begin(w, "nvme_error"); 3785 3786 spdk_json_write_named_object_begin(w, "status_type"); 3787 for (sct = 0; sct < 8; sct++) { 3788 if (nbdev->err_stat->status_type[sct] == 0) { 3789 continue; 3790 } 3791 status.sct = sct; 3792 3793 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3794 assert(status_str != NULL); 3795 bdev_nvme_format_nvme_status(status_json, status_str); 3796 3797 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3798 } 3799 spdk_json_write_object_end(w); 3800 3801 spdk_json_write_named_object_begin(w, "status_code"); 3802 for (sct = 0; sct < 4; sct++) { 3803 status.sct = sct; 3804 for (sc = 0; sc < 256; sc++) { 3805 if (nbdev->err_stat->status[sct][sc] == 0) { 3806 continue; 3807 } 3808 status.sc = sc; 3809 3810 status_str = spdk_nvme_cpl_get_status_string(&status); 3811 assert(status_str != NULL); 3812 bdev_nvme_format_nvme_status(status_json, status_str); 3813 3814 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3815 } 3816 } 3817 spdk_json_write_object_end(w); 3818 3819 spdk_json_write_object_end(w); 3820 } 3821 3822 static bool 3823 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3824 { 3825 struct nvme_bdev *nbdev = ctx; 3826 struct spdk_nvme_ctrlr *ctrlr; 3827 3828 if (!g_opts.allow_accel_sequence) { 3829 return false; 3830 } 3831 3832 switch (type) { 3833 case SPDK_BDEV_IO_TYPE_WRITE: 3834 case SPDK_BDEV_IO_TYPE_READ: 3835 break; 3836 default: 3837 return false; 3838 } 3839 3840 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3841 assert(ctrlr != NULL); 3842 3843 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3844 } 3845 3846 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3847 .destruct = bdev_nvme_destruct, 3848 .submit_request = bdev_nvme_submit_request, 3849 .io_type_supported = bdev_nvme_io_type_supported, 3850 .get_io_channel = bdev_nvme_get_io_channel, 3851 .dump_info_json = bdev_nvme_dump_info_json, 3852 .write_config_json = bdev_nvme_write_config_json, 3853 .get_spin_time = bdev_nvme_get_spin_time, 3854 .get_module_ctx = bdev_nvme_get_module_ctx, 3855 .get_memory_domains = bdev_nvme_get_memory_domains, 3856 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3857 .reset_device_stat = bdev_nvme_reset_device_stat, 3858 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3859 }; 3860 3861 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3862 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3863 3864 static int 3865 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3866 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3867 { 3868 struct spdk_nvme_ana_group_descriptor *copied_desc; 3869 uint8_t *orig_desc; 3870 uint32_t i, desc_size, copy_len; 3871 int rc = 0; 3872 3873 if (nvme_ctrlr->ana_log_page == NULL) { 3874 return 
-EINVAL; 3875 } 3876 3877 copied_desc = nvme_ctrlr->copied_ana_desc; 3878 3879 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3880 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3881 3882 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3883 memcpy(copied_desc, orig_desc, copy_len); 3884 3885 rc = cb_fn(copied_desc, cb_arg); 3886 if (rc != 0) { 3887 break; 3888 } 3889 3890 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3891 copied_desc->num_of_nsid * sizeof(uint32_t); 3892 orig_desc += desc_size; 3893 copy_len -= desc_size; 3894 } 3895 3896 return rc; 3897 } 3898 3899 static int 3900 nvme_ns_ana_transition_timedout(void *ctx) 3901 { 3902 struct nvme_ns *nvme_ns = ctx; 3903 3904 spdk_poller_unregister(&nvme_ns->anatt_timer); 3905 nvme_ns->ana_transition_timedout = true; 3906 3907 return SPDK_POLLER_BUSY; 3908 } 3909 3910 static void 3911 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3912 const struct spdk_nvme_ana_group_descriptor *desc) 3913 { 3914 const struct spdk_nvme_ctrlr_data *cdata; 3915 3916 nvme_ns->ana_group_id = desc->ana_group_id; 3917 nvme_ns->ana_state = desc->ana_state; 3918 nvme_ns->ana_state_updating = false; 3919 3920 switch (nvme_ns->ana_state) { 3921 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3922 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3923 nvme_ns->ana_transition_timedout = false; 3924 spdk_poller_unregister(&nvme_ns->anatt_timer); 3925 break; 3926 3927 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3928 case SPDK_NVME_ANA_CHANGE_STATE: 3929 if (nvme_ns->anatt_timer != NULL) { 3930 break; 3931 } 3932 3933 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3934 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3935 nvme_ns, 3936 cdata->anatt * SPDK_SEC_TO_USEC); 3937 break; 3938 default: 3939 break; 3940 } 3941 } 3942 3943 static int 3944 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3945 { 3946 struct nvme_ns *nvme_ns = cb_arg; 3947 uint32_t i; 3948 3949 for (i = 0; i < desc->num_of_nsid; i++) { 3950 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3951 continue; 3952 } 3953 3954 _nvme_ns_set_ana_state(nvme_ns, desc); 3955 return 1; 3956 } 3957 3958 return 0; 3959 } 3960 3961 static struct spdk_uuid 3962 nvme_generate_uuid(const char *sn, uint32_t nsid) 3963 { 3964 struct spdk_uuid new_uuid, namespace_uuid; 3965 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3966 /* This namespace UUID was generated using uuid_generate() method. 
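	 *
	 * Note: the bdev UUID produced below is deterministic. The controller serial
	 * number and namespace ID are concatenated ("<sn><nsid>") and hashed with
	 * SHA-1 against this fixed namespace UUID (RFC 4122 name-based style), so
	 * the same controller/namespace pair always maps to the same bdev UUID
	 * across restarts.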
*/ 3967 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3968 int size; 3969 3970 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3971 3972 spdk_uuid_set_null(&new_uuid); 3973 spdk_uuid_set_null(&namespace_uuid); 3974 3975 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3976 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3977 3978 spdk_uuid_parse(&namespace_uuid, namespace_str); 3979 3980 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3981 3982 return new_uuid; 3983 } 3984 3985 static int 3986 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3987 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3988 uint32_t prchk_flags, void *ctx) 3989 { 3990 const struct spdk_uuid *uuid; 3991 const uint8_t *nguid; 3992 const struct spdk_nvme_ctrlr_data *cdata; 3993 const struct spdk_nvme_ns_data *nsdata; 3994 const struct spdk_nvme_ctrlr_opts *opts; 3995 enum spdk_nvme_csi csi; 3996 uint32_t atomic_bs, phys_bs, bs; 3997 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3998 3999 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4000 csi = spdk_nvme_ns_get_csi(ns); 4001 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4002 4003 switch (csi) { 4004 case SPDK_NVME_CSI_NVM: 4005 disk->product_name = "NVMe disk"; 4006 break; 4007 case SPDK_NVME_CSI_ZNS: 4008 disk->product_name = "NVMe ZNS disk"; 4009 disk->zoned = true; 4010 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4011 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4012 spdk_nvme_ns_get_extended_sector_size(ns); 4013 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4014 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4015 break; 4016 default: 4017 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4018 return -ENOTSUP; 4019 } 4020 4021 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4022 if (!disk->name) { 4023 return -ENOMEM; 4024 } 4025 4026 disk->write_cache = 0; 4027 if (cdata->vwc.present) { 4028 /* Enable if the Volatile Write Cache exists */ 4029 disk->write_cache = 1; 4030 } 4031 if (cdata->oncs.write_zeroes) { 4032 disk->max_write_zeroes = UINT16_MAX + 1; 4033 } 4034 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4035 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4036 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4037 /* NVMe driver will split one request into multiple requests 4038 * based on MDTS and stripe boundary, the bdev layer will use 4039 * max_segment_size and max_num_segments to split one big IO 4040 * into multiple requests, then small request can't run out 4041 * of NVMe internal requests data structure. 
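	 *
	 * Halving io_queue_requests below appears to be a headroom heuristic: the
	 * children produced by the driver's own MDTS/stripe splitting should still
	 * fit within the queue pair's request pool.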
4042 */ 4043 if (opts && opts->io_queue_requests) { 4044 disk->max_num_segments = opts->io_queue_requests / 2; 4045 } 4046 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4047 4048 nguid = spdk_nvme_ns_get_nguid(ns); 4049 if (!nguid) { 4050 uuid = spdk_nvme_ns_get_uuid(ns); 4051 if (uuid) { 4052 disk->uuid = *uuid; 4053 } else if (g_opts.generate_uuids) { 4054 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4055 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4056 } 4057 } else { 4058 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4059 } 4060 4061 nsdata = spdk_nvme_ns_get_data(ns); 4062 bs = spdk_nvme_ns_get_sector_size(ns); 4063 atomic_bs = bs; 4064 phys_bs = bs; 4065 if (nsdata->nabo == 0) { 4066 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4067 atomic_bs = bs * (1 + nsdata->nawupf); 4068 } else { 4069 atomic_bs = bs * (1 + cdata->awupf); 4070 } 4071 } 4072 if (nsdata->nsfeat.optperf) { 4073 phys_bs = bs * (1 + nsdata->npwg); 4074 } 4075 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4076 4077 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4078 if (disk->md_len != 0) { 4079 disk->md_interleave = nsdata->flbas.extended; 4080 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4081 if (disk->dif_type != SPDK_DIF_DISABLE) { 4082 disk->dif_is_head_of_md = nsdata->dps.md_start; 4083 disk->dif_check_flags = prchk_flags; 4084 } 4085 } 4086 4087 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4088 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4089 disk->acwu = 0; 4090 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4091 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4092 } else { 4093 disk->acwu = cdata->acwu + 1; /* 0-based */ 4094 } 4095 4096 if (cdata->oncs.copy) { 4097 /* For now bdev interface allows only single segment copy */ 4098 disk->max_copy = nsdata->mssrl; 4099 } 4100 4101 disk->ctxt = ctx; 4102 disk->fn_table = &nvmelib_fn_table; 4103 disk->module = &nvme_if; 4104 4105 return 0; 4106 } 4107 4108 static struct nvme_bdev * 4109 nvme_bdev_alloc(void) 4110 { 4111 struct nvme_bdev *bdev; 4112 int rc; 4113 4114 bdev = calloc(1, sizeof(*bdev)); 4115 if (!bdev) { 4116 SPDK_ERRLOG("bdev calloc() failed\n"); 4117 return NULL; 4118 } 4119 4120 if (g_opts.nvme_error_stat) { 4121 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4122 if (!bdev->err_stat) { 4123 SPDK_ERRLOG("err_stat calloc() failed\n"); 4124 free(bdev); 4125 return NULL; 4126 } 4127 } 4128 4129 rc = pthread_mutex_init(&bdev->mutex, NULL); 4130 if (rc != 0) { 4131 free(bdev->err_stat); 4132 free(bdev); 4133 return NULL; 4134 } 4135 4136 bdev->ref = 1; 4137 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4138 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4139 bdev->rr_min_io = UINT32_MAX; 4140 TAILQ_INIT(&bdev->nvme_ns_list); 4141 4142 return bdev; 4143 } 4144 4145 static int 4146 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4147 { 4148 struct nvme_bdev *bdev; 4149 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4150 int rc; 4151 4152 bdev = nvme_bdev_alloc(); 4153 if (bdev == NULL) { 4154 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4155 return -ENOMEM; 4156 } 4157 4158 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4159 4160 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4161 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4162 if (rc != 0) { 4163 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4164 nvme_bdev_free(bdev); 4165 return rc; 4166 } 4167 
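	/* Register the nvme_bdev as an io_device so that a per-thread
	 * nvme_bdev_channel is created and destroyed through the callbacks below
	 * whenever an spdk_io_channel for this bdev is acquired or released.
	 */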
4168 spdk_io_device_register(bdev, 4169 bdev_nvme_create_bdev_channel_cb, 4170 bdev_nvme_destroy_bdev_channel_cb, 4171 sizeof(struct nvme_bdev_channel), 4172 bdev->disk.name); 4173 4174 nvme_ns->bdev = bdev; 4175 bdev->nsid = nvme_ns->id; 4176 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4177 4178 bdev->nbdev_ctrlr = nbdev_ctrlr; 4179 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4180 4181 rc = spdk_bdev_register(&bdev->disk); 4182 if (rc != 0) { 4183 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4184 spdk_io_device_unregister(bdev, NULL); 4185 nvme_ns->bdev = NULL; 4186 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4187 nvme_bdev_free(bdev); 4188 return rc; 4189 } 4190 4191 return 0; 4192 } 4193 4194 static bool 4195 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4196 { 4197 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4198 const struct spdk_uuid *uuid1, *uuid2; 4199 4200 nsdata1 = spdk_nvme_ns_get_data(ns1); 4201 nsdata2 = spdk_nvme_ns_get_data(ns2); 4202 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4203 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4204 4205 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4206 nsdata1->eui64 == nsdata2->eui64 && 4207 ((uuid1 == NULL && uuid2 == NULL) || 4208 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4209 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4210 } 4211 4212 static bool 4213 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4214 struct spdk_nvme_ctrlr_opts *opts) 4215 { 4216 struct nvme_probe_skip_entry *entry; 4217 4218 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4219 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4220 return false; 4221 } 4222 } 4223 4224 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4225 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4226 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4227 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4228 opts->disable_read_ana_log_page = true; 4229 4230 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4231 4232 return true; 4233 } 4234 4235 static void 4236 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4237 { 4238 struct nvme_ctrlr *nvme_ctrlr = ctx; 4239 4240 if (spdk_nvme_cpl_is_error(cpl)) { 4241 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4242 cpl->status.sct); 4243 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4244 } else if (cpl->cdw0 & 0x1) { 4245 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4246 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4247 } 4248 } 4249 4250 static void 4251 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4252 struct spdk_nvme_qpair *qpair, uint16_t cid) 4253 { 4254 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4255 union spdk_nvme_csts_register csts; 4256 int rc; 4257 4258 assert(nvme_ctrlr->ctrlr == ctrlr); 4259 4260 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4261 4262 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4263 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4264 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4265 * completion recursively. 
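	 * For a fabrics admin command timeout (qpair == NULL on a non-PCIe trid)
	 * the CSTS read is skipped entirely and we fall through to the configured
	 * action_on_timeout handling below.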
4266 */ 4267 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4268 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4269 if (csts.bits.cfs) { 4270 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4271 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4272 return; 4273 } 4274 } 4275 4276 switch (g_opts.action_on_timeout) { 4277 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4278 if (qpair) { 4279 /* Don't send abort to ctrlr when ctrlr is not available. */ 4280 pthread_mutex_lock(&nvme_ctrlr->mutex); 4281 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4282 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4283 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4284 return; 4285 } 4286 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4287 4288 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4289 nvme_abort_cpl, nvme_ctrlr); 4290 if (rc == 0) { 4291 return; 4292 } 4293 4294 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4295 } 4296 4297 /* FALLTHROUGH */ 4298 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4299 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4300 break; 4301 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4302 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4303 break; 4304 default: 4305 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4306 break; 4307 } 4308 } 4309 4310 static struct nvme_ns * 4311 nvme_ns_alloc(void) 4312 { 4313 struct nvme_ns *nvme_ns; 4314 4315 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4316 if (nvme_ns == NULL) { 4317 return NULL; 4318 } 4319 4320 if (g_opts.io_path_stat) { 4321 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4322 if (nvme_ns->stat == NULL) { 4323 free(nvme_ns); 4324 return NULL; 4325 } 4326 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4327 } 4328 4329 return nvme_ns; 4330 } 4331 4332 static void 4333 nvme_ns_free(struct nvme_ns *nvme_ns) 4334 { 4335 free(nvme_ns->stat); 4336 free(nvme_ns); 4337 } 4338 4339 static void 4340 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4341 { 4342 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4343 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4344 4345 if (rc == 0) { 4346 nvme_ns->probe_ctx = NULL; 4347 pthread_mutex_lock(&nvme_ctrlr->mutex); 4348 nvme_ctrlr->ref++; 4349 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4350 } else { 4351 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4352 nvme_ns_free(nvme_ns); 4353 } 4354 4355 if (ctx) { 4356 ctx->populates_in_progress--; 4357 if (ctx->populates_in_progress == 0) { 4358 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4359 } 4360 } 4361 } 4362 4363 static void 4364 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4365 { 4366 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4367 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4368 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4369 int rc; 4370 4371 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4372 if (rc != 0) { 4373 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4374 } 4375 4376 spdk_for_each_channel_continue(i, rc); 4377 } 4378 4379 static void 4380 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4381 { 4382 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4383 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4384 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4385 struct nvme_io_path *io_path; 4386 4387 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4388 if (io_path != NULL) { 4389 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4390 } 4391 4392 spdk_for_each_channel_continue(i, 0); 4393 } 4394 4395 static void 4396 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4397 { 4398 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4399 4400 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4401 } 4402 4403 static void 4404 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4405 { 4406 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4407 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4408 4409 if (status == 0) { 4410 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4411 } else { 4412 /* Delete the added io_paths and fail populating the namespace. */ 4413 spdk_for_each_channel(bdev, 4414 bdev_nvme_delete_io_path, 4415 nvme_ns, 4416 bdev_nvme_add_io_path_failed); 4417 } 4418 } 4419 4420 static int 4421 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4422 { 4423 struct nvme_ns *tmp_ns; 4424 const struct spdk_nvme_ns_data *nsdata; 4425 4426 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4427 if (!nsdata->nmic.can_share) { 4428 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4429 return -EINVAL; 4430 } 4431 4432 pthread_mutex_lock(&bdev->mutex); 4433 4434 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4435 assert(tmp_ns != NULL); 4436 4437 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4438 pthread_mutex_unlock(&bdev->mutex); 4439 SPDK_ERRLOG("Namespaces are not identical.\n"); 4440 return -EINVAL; 4441 } 4442 4443 bdev->ref++; 4444 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4445 nvme_ns->bdev = bdev; 4446 4447 pthread_mutex_unlock(&bdev->mutex); 4448 4449 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
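	 * Each existing channel gets the new path via bdev_nvme_add_io_path(); once
	 * every channel has been visited, bdev_nvme_add_io_path_done() completes
	 * namespace population, rolling back the added io_paths on failure.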
*/ 4450 spdk_for_each_channel(bdev, 4451 bdev_nvme_add_io_path, 4452 nvme_ns, 4453 bdev_nvme_add_io_path_done); 4454 4455 return 0; 4456 } 4457 4458 static void 4459 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4460 { 4461 struct spdk_nvme_ns *ns; 4462 struct nvme_bdev *bdev; 4463 int rc = 0; 4464 4465 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4466 if (!ns) { 4467 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4468 rc = -EINVAL; 4469 goto done; 4470 } 4471 4472 nvme_ns->ns = ns; 4473 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4474 4475 if (nvme_ctrlr->ana_log_page != NULL) { 4476 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4477 } 4478 4479 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4480 if (bdev == NULL) { 4481 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4482 } else { 4483 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4484 if (rc == 0) { 4485 return; 4486 } 4487 } 4488 done: 4489 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4490 } 4491 4492 static void 4493 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4494 { 4495 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4496 4497 assert(nvme_ctrlr != NULL); 4498 4499 pthread_mutex_lock(&nvme_ctrlr->mutex); 4500 4501 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4502 4503 if (nvme_ns->bdev != NULL) { 4504 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4505 return; 4506 } 4507 4508 nvme_ns_free(nvme_ns); 4509 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4510 4511 nvme_ctrlr_release(nvme_ctrlr); 4512 } 4513 4514 static void 4515 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4516 { 4517 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4518 4519 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4520 } 4521 4522 static void 4523 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4524 { 4525 struct nvme_bdev *bdev; 4526 4527 spdk_poller_unregister(&nvme_ns->anatt_timer); 4528 4529 bdev = nvme_ns->bdev; 4530 if (bdev != NULL) { 4531 pthread_mutex_lock(&bdev->mutex); 4532 4533 assert(bdev->ref > 0); 4534 bdev->ref--; 4535 if (bdev->ref == 0) { 4536 pthread_mutex_unlock(&bdev->mutex); 4537 4538 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4539 } else { 4540 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4541 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4542 * and clear nvme_ns->bdev here. 4543 */ 4544 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4545 nvme_ns->bdev = NULL; 4546 4547 pthread_mutex_unlock(&bdev->mutex); 4548 4549 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4550 * we call depopulate_namespace_done() to avoid use-after-free. 4551 */ 4552 spdk_for_each_channel(bdev, 4553 bdev_nvme_delete_io_path, 4554 nvme_ns, 4555 bdev_nvme_delete_io_path_done); 4556 return; 4557 } 4558 } 4559 4560 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4561 } 4562 4563 static void 4564 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4565 struct nvme_async_probe_ctx *ctx) 4566 { 4567 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4568 struct nvme_ns *nvme_ns, *next; 4569 struct spdk_nvme_ns *ns; 4570 struct nvme_bdev *bdev; 4571 uint32_t nsid; 4572 int rc; 4573 uint64_t num_sectors; 4574 4575 if (ctx) { 4576 /* Initialize this count to 1 to handle the populate functions 4577 * calling nvme_ctrlr_populate_namespace_done() immediately. 
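		 * The extra count taken here is dropped after the scan loops below, so
		 * the done callback fires exactly once even when every populate call
		 * completes synchronously.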
4578 */ 4579 ctx->populates_in_progress = 1; 4580 } 4581 4582 /* First loop over our existing namespaces and see if they have been 4583 * removed. */ 4584 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4585 while (nvme_ns != NULL) { 4586 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4587 4588 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4589 /* NS is still there but attributes may have changed */ 4590 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4591 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4592 bdev = nvme_ns->bdev; 4593 assert(bdev != NULL); 4594 if (bdev->disk.blockcnt != num_sectors) { 4595 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4596 nvme_ns->id, 4597 bdev->disk.name, 4598 bdev->disk.blockcnt, 4599 num_sectors); 4600 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4601 if (rc != 0) { 4602 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4603 bdev->disk.name, rc); 4604 } 4605 } 4606 } else { 4607 /* Namespace was removed */ 4608 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4609 } 4610 4611 nvme_ns = next; 4612 } 4613 4614 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4615 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4616 while (nsid != 0) { 4617 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4618 4619 if (nvme_ns == NULL) { 4620 /* Found a new one */ 4621 nvme_ns = nvme_ns_alloc(); 4622 if (nvme_ns == NULL) { 4623 SPDK_ERRLOG("Failed to allocate namespace\n"); 4624 /* This just fails to attach the namespace. It may work on a future attempt. */ 4625 continue; 4626 } 4627 4628 nvme_ns->id = nsid; 4629 nvme_ns->ctrlr = nvme_ctrlr; 4630 4631 nvme_ns->bdev = NULL; 4632 4633 if (ctx) { 4634 ctx->populates_in_progress++; 4635 } 4636 nvme_ns->probe_ctx = ctx; 4637 4638 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4639 4640 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4641 } 4642 4643 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4644 } 4645 4646 if (ctx) { 4647 /* Decrement this count now that the loop is over to account 4648 * for the one we started with. If the count is then 0, we 4649 * know any populate_namespace functions completed immediately, 4650 * so we'll kick the callback here. 
4651 */ 4652 ctx->populates_in_progress--; 4653 if (ctx->populates_in_progress == 0) { 4654 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4655 } 4656 } 4657 4658 } 4659 4660 static void 4661 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4662 { 4663 struct nvme_ns *nvme_ns, *tmp; 4664 4665 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4666 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4667 } 4668 } 4669 4670 static uint32_t 4671 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4672 { 4673 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4674 const struct spdk_nvme_ctrlr_data *cdata; 4675 uint32_t nsid, ns_count = 0; 4676 4677 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4678 4679 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4680 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4681 ns_count++; 4682 } 4683 4684 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4685 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4686 sizeof(uint32_t); 4687 } 4688 4689 static int 4690 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4691 void *cb_arg) 4692 { 4693 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4694 struct nvme_ns *nvme_ns; 4695 uint32_t i, nsid; 4696 4697 for (i = 0; i < desc->num_of_nsid; i++) { 4698 nsid = desc->nsid[i]; 4699 if (nsid == 0) { 4700 continue; 4701 } 4702 4703 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4704 4705 assert(nvme_ns != NULL); 4706 if (nvme_ns == NULL) { 4707 /* Target told us that an inactive namespace had an ANA change */ 4708 continue; 4709 } 4710 4711 _nvme_ns_set_ana_state(nvme_ns, desc); 4712 } 4713 4714 return 0; 4715 } 4716 4717 static void 4718 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4719 { 4720 struct nvme_ns *nvme_ns; 4721 4722 spdk_free(nvme_ctrlr->ana_log_page); 4723 nvme_ctrlr->ana_log_page = NULL; 4724 4725 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4726 nvme_ns != NULL; 4727 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4728 nvme_ns->ana_state_updating = false; 4729 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4730 } 4731 } 4732 4733 static void 4734 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4735 { 4736 struct nvme_ctrlr *nvme_ctrlr = ctx; 4737 4738 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4739 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4740 nvme_ctrlr); 4741 } else { 4742 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4743 } 4744 4745 pthread_mutex_lock(&nvme_ctrlr->mutex); 4746 4747 assert(nvme_ctrlr->ana_log_page_updating == true); 4748 nvme_ctrlr->ana_log_page_updating = false; 4749 4750 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4751 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4752 4753 nvme_ctrlr_unregister(nvme_ctrlr); 4754 } else { 4755 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4756 4757 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4758 } 4759 } 4760 4761 static int 4762 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4763 { 4764 uint32_t ana_log_page_size; 4765 int rc; 4766 4767 if (nvme_ctrlr->ana_log_page == NULL) { 4768 return -EINVAL; 4769 } 4770 4771 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4772 4773 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4774 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4775 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4776 
return -EINVAL; 4777 } 4778 4779 pthread_mutex_lock(&nvme_ctrlr->mutex); 4780 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4781 nvme_ctrlr->ana_log_page_updating) { 4782 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4783 return -EBUSY; 4784 } 4785 4786 nvme_ctrlr->ana_log_page_updating = true; 4787 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4788 4789 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4790 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4791 SPDK_NVME_GLOBAL_NS_TAG, 4792 nvme_ctrlr->ana_log_page, 4793 ana_log_page_size, 0, 4794 nvme_ctrlr_read_ana_log_page_done, 4795 nvme_ctrlr); 4796 if (rc != 0) { 4797 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4798 } 4799 4800 return rc; 4801 } 4802 4803 static void 4804 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4805 { 4806 } 4807 4808 struct bdev_nvme_set_preferred_path_ctx { 4809 struct spdk_bdev_desc *desc; 4810 struct nvme_ns *nvme_ns; 4811 bdev_nvme_set_preferred_path_cb cb_fn; 4812 void *cb_arg; 4813 }; 4814 4815 static void 4816 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4817 { 4818 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4819 4820 assert(ctx != NULL); 4821 assert(ctx->desc != NULL); 4822 assert(ctx->cb_fn != NULL); 4823 4824 spdk_bdev_close(ctx->desc); 4825 4826 ctx->cb_fn(ctx->cb_arg, status); 4827 4828 free(ctx); 4829 } 4830 4831 static void 4832 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4833 { 4834 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4835 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4836 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4837 struct nvme_io_path *io_path, *prev; 4838 4839 prev = NULL; 4840 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4841 if (io_path->nvme_ns == ctx->nvme_ns) { 4842 break; 4843 } 4844 prev = io_path; 4845 } 4846 4847 if (io_path != NULL) { 4848 if (prev != NULL) { 4849 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4850 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4851 } 4852 4853 /* We can set io_path to nbdev_ch->current_io_path directly here. 4854 * However, it needs to be conditional. To simplify the code, 4855 * just clear nbdev_ch->current_io_path and let find_io_path() 4856 * fill it. 4857 * 4858 * Automatic failback may be disabled. Hence even if the io_path is 4859 * already at the head, clear nbdev_ch->current_io_path. 4860 */ 4861 bdev_nvme_clear_current_io_path(nbdev_ch); 4862 } 4863 4864 spdk_for_each_channel_continue(i, 0); 4865 } 4866 4867 static struct nvme_ns * 4868 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4869 { 4870 struct nvme_ns *nvme_ns, *prev; 4871 const struct spdk_nvme_ctrlr_data *cdata; 4872 4873 prev = NULL; 4874 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4875 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4876 4877 if (cdata->cntlid == cntlid) { 4878 break; 4879 } 4880 prev = nvme_ns; 4881 } 4882 4883 if (nvme_ns != NULL && prev != NULL) { 4884 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4885 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4886 } 4887 4888 return nvme_ns; 4889 } 4890 4891 /* This function supports only multipath mode. There is only a single I/O path 4892 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4893 * head of the I/O path list for each NVMe bdev channel. 
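 *
 * (Hypothetical example: with two paths through controllers with CNTLID 1 and
 * 2, calling this with cntlid=2 moves controller 2's io_path to the head of
 * every channel's list, so with the default active_passive policy it is the
 * path selected for new I/O until it fails or the preference changes again.)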
4894 * 4895 * NVMe bdev channel may be acquired after completing this function. move the 4896 * matched namespace to the head of the namespace list for the NVMe bdev too. 4897 */ 4898 void 4899 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4900 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4901 { 4902 struct bdev_nvme_set_preferred_path_ctx *ctx; 4903 struct spdk_bdev *bdev; 4904 struct nvme_bdev *nbdev; 4905 int rc = 0; 4906 4907 assert(cb_fn != NULL); 4908 4909 ctx = calloc(1, sizeof(*ctx)); 4910 if (ctx == NULL) { 4911 SPDK_ERRLOG("Failed to alloc context.\n"); 4912 rc = -ENOMEM; 4913 goto err_alloc; 4914 } 4915 4916 ctx->cb_fn = cb_fn; 4917 ctx->cb_arg = cb_arg; 4918 4919 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4920 if (rc != 0) { 4921 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4922 goto err_open; 4923 } 4924 4925 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4926 4927 if (bdev->module != &nvme_if) { 4928 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4929 rc = -ENODEV; 4930 goto err_bdev; 4931 } 4932 4933 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4934 4935 pthread_mutex_lock(&nbdev->mutex); 4936 4937 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4938 if (ctx->nvme_ns == NULL) { 4939 pthread_mutex_unlock(&nbdev->mutex); 4940 4941 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4942 rc = -ENODEV; 4943 goto err_bdev; 4944 } 4945 4946 pthread_mutex_unlock(&nbdev->mutex); 4947 4948 spdk_for_each_channel(nbdev, 4949 _bdev_nvme_set_preferred_path, 4950 ctx, 4951 bdev_nvme_set_preferred_path_done); 4952 return; 4953 4954 err_bdev: 4955 spdk_bdev_close(ctx->desc); 4956 err_open: 4957 free(ctx); 4958 err_alloc: 4959 cb_fn(cb_arg, rc); 4960 } 4961 4962 struct bdev_nvme_set_multipath_policy_ctx { 4963 struct spdk_bdev_desc *desc; 4964 bdev_nvme_set_multipath_policy_cb cb_fn; 4965 void *cb_arg; 4966 }; 4967 4968 static void 4969 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4970 { 4971 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4972 4973 assert(ctx != NULL); 4974 assert(ctx->desc != NULL); 4975 assert(ctx->cb_fn != NULL); 4976 4977 spdk_bdev_close(ctx->desc); 4978 4979 ctx->cb_fn(ctx->cb_arg, status); 4980 4981 free(ctx); 4982 } 4983 4984 static void 4985 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4986 { 4987 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4988 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4989 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4990 4991 nbdev_ch->mp_policy = nbdev->mp_policy; 4992 nbdev_ch->mp_selector = nbdev->mp_selector; 4993 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4994 bdev_nvme_clear_current_io_path(nbdev_ch); 4995 4996 spdk_for_each_channel_continue(i, 0); 4997 } 4998 4999 void 5000 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5001 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5002 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5003 { 5004 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5005 struct spdk_bdev *bdev; 5006 struct nvme_bdev *nbdev; 5007 int rc; 5008 5009 assert(cb_fn != NULL); 5010 5011 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5012 if (rr_min_io == UINT32_MAX) { 5013 rr_min_io = 1; 5014 } else if (rr_min_io == 0) { 5015 rc = -EINVAL; 
5016 goto exit; 5017 } 5018 } else if (rr_min_io != UINT32_MAX) { 5019 rc = -EINVAL; 5020 goto exit; 5021 } 5022 5023 ctx = calloc(1, sizeof(*ctx)); 5024 if (ctx == NULL) { 5025 SPDK_ERRLOG("Failed to alloc context.\n"); 5026 rc = -ENOMEM; 5027 goto exit; 5028 } 5029 5030 ctx->cb_fn = cb_fn; 5031 ctx->cb_arg = cb_arg; 5032 5033 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5034 if (rc != 0) { 5035 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5036 rc = -ENODEV; 5037 goto err_open; 5038 } 5039 5040 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5041 if (bdev->module != &nvme_if) { 5042 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5043 rc = -ENODEV; 5044 goto err_module; 5045 } 5046 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5047 5048 pthread_mutex_lock(&nbdev->mutex); 5049 nbdev->mp_policy = policy; 5050 nbdev->mp_selector = selector; 5051 nbdev->rr_min_io = rr_min_io; 5052 pthread_mutex_unlock(&nbdev->mutex); 5053 5054 spdk_for_each_channel(nbdev, 5055 _bdev_nvme_set_multipath_policy, 5056 ctx, 5057 bdev_nvme_set_multipath_policy_done); 5058 return; 5059 5060 err_module: 5061 spdk_bdev_close(ctx->desc); 5062 err_open: 5063 free(ctx); 5064 exit: 5065 cb_fn(cb_arg, rc); 5066 } 5067 5068 static void 5069 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5070 { 5071 struct nvme_ctrlr *nvme_ctrlr = arg; 5072 union spdk_nvme_async_event_completion event; 5073 5074 if (spdk_nvme_cpl_is_error(cpl)) { 5075 SPDK_WARNLOG("AER request execute failed\n"); 5076 return; 5077 } 5078 5079 event.raw = cpl->cdw0; 5080 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5081 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5082 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5083 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5084 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5085 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5086 } 5087 } 5088 5089 static void 5090 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5091 { 5092 if (ctx->cb_fn) { 5093 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5094 } 5095 5096 ctx->namespaces_populated = true; 5097 if (ctx->probe_done) { 5098 /* The probe was already completed, so we need to free the context 5099 * here. This can happen for cases like OCSSD, where we need to 5100 * send additional commands to the SSD after attach. 
5101 */ 5102 free(ctx); 5103 } 5104 } 5105 5106 static void 5107 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5108 struct nvme_async_probe_ctx *ctx) 5109 { 5110 spdk_io_device_register(nvme_ctrlr, 5111 bdev_nvme_create_ctrlr_channel_cb, 5112 bdev_nvme_destroy_ctrlr_channel_cb, 5113 sizeof(struct nvme_ctrlr_channel), 5114 nvme_ctrlr->nbdev_ctrlr->name); 5115 5116 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5117 } 5118 5119 static void 5120 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5121 { 5122 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5123 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5124 5125 nvme_ctrlr->probe_ctx = NULL; 5126 5127 if (spdk_nvme_cpl_is_error(cpl)) { 5128 nvme_ctrlr_delete(nvme_ctrlr); 5129 5130 if (ctx != NULL) { 5131 ctx->reported_bdevs = 0; 5132 populate_namespaces_cb(ctx, -1); 5133 } 5134 return; 5135 } 5136 5137 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5138 } 5139 5140 static int 5141 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5142 struct nvme_async_probe_ctx *ctx) 5143 { 5144 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5145 const struct spdk_nvme_ctrlr_data *cdata; 5146 uint32_t ana_log_page_size; 5147 5148 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5149 5150 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5151 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5152 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5153 sizeof(uint32_t); 5154 5155 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5156 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5157 if (nvme_ctrlr->ana_log_page == NULL) { 5158 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5159 return -ENXIO; 5160 } 5161 5162 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5163 * Hence copy each descriptor to a temporary area when parsing it. 5164 * 5165 * Allocate a buffer whose size is as large as ANA log page buffer because 5166 * we do not know the size of a descriptor until actually reading it. 5167 */ 5168 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5169 if (nvme_ctrlr->copied_ana_desc == NULL) { 5170 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5171 return -ENOMEM; 5172 } 5173 5174 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5175 5176 nvme_ctrlr->probe_ctx = ctx; 5177 5178 /* Then, set the read size only to include the current active namespaces. */ 5179 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5180 5181 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5182 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5183 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5184 return -EINVAL; 5185 } 5186 5187 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5188 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5189 SPDK_NVME_GLOBAL_NS_TAG, 5190 nvme_ctrlr->ana_log_page, 5191 ana_log_page_size, 0, 5192 nvme_ctrlr_init_ana_log_page_done, 5193 nvme_ctrlr); 5194 } 5195 5196 /* hostnqn and subnqn were already verified before attaching a controller. 5197 * Hence check only the multipath capability and cntlid here. 
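 * In practice this means every controller grouped under one nvme_bdev_ctrlr
 * must report CMIC.multi_ctrlr and must expose a unique CNTLID.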
 */
static bool
bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *tmp;
	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!cdata->cmic.multi_ctrlr) {
		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
		return false;
	}

	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);

		if (!tmp_cdata->cmic.multi_ctrlr) {
			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
			return false;
		}
		if (cdata->cntlid == tmp_cdata->cntlid) {
			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
			return false;
		}
	}

	return true;
}

static int
nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
			rc = -EINVAL;
			goto exit;
		}
	} else {
		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
		if (nbdev_ctrlr == NULL) {
			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
			rc = -ENOMEM;
			goto exit;
		}
		nbdev_ctrlr->name = strdup(name);
		if (nbdev_ctrlr->name == NULL) {
			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
			free(nbdev_ctrlr);
			rc = -ENOMEM;
			goto exit;
		}
		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
		TAILQ_INIT(&nbdev_ctrlr->bdevs);
		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
	}
	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
exit:
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return rc;
}

static int
nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		  const char *name,
		  const struct spdk_nvme_transport_id *trid,
		  struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_path_id *path_id;
	const struct spdk_nvme_ctrlr_data *cdata;
	int rc;

	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
	if (nvme_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
	if (rc != 0) {
		free(nvme_ctrlr);
		return rc;
	}

	TAILQ_INIT(&nvme_ctrlr->trids);

	RB_INIT(&nvme_ctrlr->namespaces);

	path_id = calloc(1, sizeof(*path_id));
	if (path_id == NULL) {
		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
		rc = -ENOMEM;
		goto err;
	}

	path_id->trid = *trid;
	if (ctx != NULL) {
		memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
		memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
	}
	nvme_ctrlr->active_path_id = path_id;
	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);

	nvme_ctrlr->thread = spdk_get_thread();
	nvme_ctrlr->ctrlr = ctrlr;
	nvme_ctrlr->ref = 1;

	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
		SPDK_ERRLOG("OCSSDs are not supported\n");
		rc = -ENOTSUP;
		goto err;
	}

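	/* Use the bdev options supplied with the async probe context when one is
	 * given (e.g. an explicit attach via RPC); otherwise fall back to the
	 * module-wide defaults.
	 */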
if (ctx != NULL) { 5320 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5321 } else { 5322 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5323 } 5324 5325 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5326 g_opts.nvme_adminq_poll_period_us); 5327 5328 if (g_opts.timeout_us > 0) { 5329 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5330 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5331 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5332 g_opts.timeout_us : g_opts.timeout_admin_us; 5333 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5334 adm_timeout_us, timeout_cb, nvme_ctrlr); 5335 } 5336 5337 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5338 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5339 5340 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5341 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5342 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5343 } 5344 5345 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5346 if (rc != 0) { 5347 goto err; 5348 } 5349 5350 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5351 5352 if (cdata->cmic.ana_reporting) { 5353 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5354 if (rc == 0) { 5355 return 0; 5356 } 5357 } else { 5358 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5359 return 0; 5360 } 5361 5362 err: 5363 nvme_ctrlr_delete(nvme_ctrlr); 5364 return rc; 5365 } 5366 5367 void 5368 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5369 { 5370 opts->prchk_flags = 0; 5371 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5372 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5373 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5374 } 5375 5376 static void 5377 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5378 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5379 { 5380 char *name; 5381 5382 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5383 if (!name) { 5384 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5385 return; 5386 } 5387 5388 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5389 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5390 } else { 5391 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5392 } 5393 5394 free(name); 5395 } 5396 5397 static void 5398 _nvme_ctrlr_destruct(void *ctx) 5399 { 5400 struct nvme_ctrlr *nvme_ctrlr = ctx; 5401 5402 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5403 nvme_ctrlr_release(nvme_ctrlr); 5404 } 5405 5406 static int 5407 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5408 { 5409 struct nvme_probe_skip_entry *entry; 5410 5411 /* The controller's destruction was already started */ 5412 if (nvme_ctrlr->destruct) { 5413 return -EALREADY; 5414 } 5415 5416 if (!hotplug && 5417 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5418 entry = calloc(1, sizeof(*entry)); 5419 if (!entry) { 5420 return -ENOMEM; 5421 } 5422 entry->trid = nvme_ctrlr->active_path_id->trid; 5423 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5424 } 5425 5426 nvme_ctrlr->destruct = true; 5427 return 0; 5428 } 5429 5430 static int 5431 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5432 { 5433 int rc; 5434 5435 pthread_mutex_lock(&nvme_ctrlr->mutex); 5436 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5437 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5438 5439 if (rc == 0) { 5440 _nvme_ctrlr_destruct(nvme_ctrlr); 5441 } else if (rc == -EALREADY) { 5442 rc = 0; 5443 } 5444 5445 return rc; 5446 } 5447 5448 static void 5449 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5450 { 5451 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5452 5453 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5454 } 5455 5456 static int 5457 bdev_nvme_hotplug_probe(void *arg) 5458 { 5459 if (g_hotplug_probe_ctx == NULL) { 5460 spdk_poller_unregister(&g_hotplug_probe_poller); 5461 return SPDK_POLLER_IDLE; 5462 } 5463 5464 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5465 g_hotplug_probe_ctx = NULL; 5466 spdk_poller_unregister(&g_hotplug_probe_poller); 5467 } 5468 5469 return SPDK_POLLER_BUSY; 5470 } 5471 5472 static int 5473 bdev_nvme_hotplug(void *arg) 5474 { 5475 struct spdk_nvme_transport_id trid_pcie; 5476 5477 if (g_hotplug_probe_ctx) { 5478 return SPDK_POLLER_BUSY; 5479 } 5480 5481 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5482 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5483 5484 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5485 hotplug_probe_cb, attach_cb, NULL); 5486 5487 if (g_hotplug_probe_ctx) { 5488 assert(g_hotplug_probe_poller == NULL); 5489 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5490 } 5491 5492 return SPDK_POLLER_BUSY; 5493 } 5494 5495 void 5496 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5497 { 5498 *opts = g_opts; 5499 } 5500 5501 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5502 uint32_t reconnect_delay_sec, 5503 uint32_t fast_io_fail_timeout_sec); 5504 5505 static int 5506 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5507 { 5508 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5509 /* Can't set timeout_admin_us without also setting timeout_us */ 5510 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5511 return -EINVAL; 5512 } 5513 5514 if (opts->bdev_retry_count < -1) { 5515 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5516 return -EINVAL; 5517 } 5518 5519 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5520 opts->reconnect_delay_sec, 5521 opts->fast_io_fail_timeout_sec)) { 5522 return -EINVAL; 5523 } 5524 5525 return 0; 5526 } 5527 5528 int 5529 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5530 { 5531 int ret; 5532 5533 ret = bdev_nvme_validate_opts(opts); 5534 if (ret) { 5535 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5536 return ret; 5537 } 5538 5539 if (g_bdev_nvme_init_thread != NULL) { 5540 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5541 return -EPERM; 5542 } 5543 } 5544 5545 if (opts->rdma_srq_size != 0 || 5546 opts->rdma_max_cq_size != 0 || 5547 opts->rdma_cm_event_timeout_ms != 0) { 5548 struct spdk_nvme_transport_opts drv_opts; 5549 5550 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5551 if (opts->rdma_srq_size != 0) { 5552 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5553 } 5554 if (opts->rdma_max_cq_size != 0) { 5555 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5556 } 5557 if (opts->rdma_cm_event_timeout_ms != 0) { 5558 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5559 } 5560 5561 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5562 if (ret) { 5563 SPDK_ERRLOG("Failed to 
set NVMe transport opts.\n"); 5564 return ret; 5565 } 5566 } 5567 5568 g_opts = *opts; 5569 5570 return 0; 5571 } 5572 5573 struct set_nvme_hotplug_ctx { 5574 uint64_t period_us; 5575 bool enabled; 5576 spdk_msg_fn fn; 5577 void *fn_ctx; 5578 }; 5579 5580 static void 5581 set_nvme_hotplug_period_cb(void *_ctx) 5582 { 5583 struct set_nvme_hotplug_ctx *ctx = _ctx; 5584 5585 spdk_poller_unregister(&g_hotplug_poller); 5586 if (ctx->enabled) { 5587 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5588 } 5589 5590 g_nvme_hotplug_poll_period_us = ctx->period_us; 5591 g_nvme_hotplug_enabled = ctx->enabled; 5592 if (ctx->fn) { 5593 ctx->fn(ctx->fn_ctx); 5594 } 5595 5596 free(ctx); 5597 } 5598 5599 int 5600 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5601 { 5602 struct set_nvme_hotplug_ctx *ctx; 5603 5604 if (enabled == true && !spdk_process_is_primary()) { 5605 return -EPERM; 5606 } 5607 5608 ctx = calloc(1, sizeof(*ctx)); 5609 if (ctx == NULL) { 5610 return -ENOMEM; 5611 } 5612 5613 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5614 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5615 ctx->enabled = enabled; 5616 ctx->fn = cb; 5617 ctx->fn_ctx = cb_ctx; 5618 5619 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5620 return 0; 5621 } 5622 5623 static void 5624 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5625 struct nvme_async_probe_ctx *ctx) 5626 { 5627 struct nvme_ns *nvme_ns; 5628 struct nvme_bdev *nvme_bdev; 5629 size_t j; 5630 5631 assert(nvme_ctrlr != NULL); 5632 5633 if (ctx->names == NULL) { 5634 ctx->reported_bdevs = 0; 5635 populate_namespaces_cb(ctx, 0); 5636 return; 5637 } 5638 5639 /* 5640 * Report the new bdevs that were created in this call. 5641 * There can be more than one bdev per NVMe controller. 5642 */ 5643 j = 0; 5644 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5645 while (nvme_ns != NULL) { 5646 nvme_bdev = nvme_ns->bdev; 5647 if (j < ctx->max_bdevs) { 5648 ctx->names[j] = nvme_bdev->disk.name; 5649 j++; 5650 } else { 5651 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5652 ctx->max_bdevs); 5653 ctx->reported_bdevs = 0; 5654 populate_namespaces_cb(ctx, -ERANGE); 5655 return; 5656 } 5657 5658 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5659 } 5660 5661 ctx->reported_bdevs = j; 5662 populate_namespaces_cb(ctx, 0); 5663 } 5664 5665 static int 5666 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5667 struct spdk_nvme_ctrlr *new_ctrlr, 5668 struct spdk_nvme_transport_id *trid) 5669 { 5670 struct nvme_path_id *tmp_trid; 5671 5672 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5673 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5674 return -ENOTSUP; 5675 } 5676 5677 /* Currently we only support failover to the same transport type. */ 5678 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5679 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5680 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5681 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5682 return -EINVAL; 5683 } 5684 5685 5686 /* Currently we only support failover to the same NQN. 
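	 * i.e. an alternate path must use the same transport type and subsystem
	 * NQN; typically only the target address or port differs.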
 */
	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
			     nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr,
				     trid->subnqn);
			return -EEXIST;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid; it is not replaced until it fails. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is regarded as failed if its last failed timestamp is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This handles the case where a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking that it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
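 * The spare trid is only recorded on nvme_ctrlr->trids here; the temporary
 * controller used for the comparison is detached right away, and a new
 * connection is established only when failover actually selects this path.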
5767 */ 5768 static int 5769 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5770 struct spdk_nvme_ctrlr *new_ctrlr, 5771 struct spdk_nvme_transport_id *trid) 5772 { 5773 int rc; 5774 5775 assert(nvme_ctrlr != NULL); 5776 5777 pthread_mutex_lock(&nvme_ctrlr->mutex); 5778 5779 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5780 if (rc != 0) { 5781 goto exit; 5782 } 5783 5784 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5785 if (rc != 0) { 5786 goto exit; 5787 } 5788 5789 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5790 5791 exit: 5792 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5793 5794 spdk_nvme_detach(new_ctrlr); 5795 5796 return rc; 5797 } 5798 5799 static void 5800 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5801 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5802 { 5803 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5804 struct nvme_async_probe_ctx *ctx; 5805 int rc; 5806 5807 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5808 ctx->ctrlr_attached = true; 5809 5810 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5811 if (rc != 0) { 5812 ctx->reported_bdevs = 0; 5813 populate_namespaces_cb(ctx, rc); 5814 } 5815 } 5816 5817 static void 5818 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5819 struct spdk_nvme_ctrlr *ctrlr, 5820 const struct spdk_nvme_ctrlr_opts *opts) 5821 { 5822 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5823 struct nvme_ctrlr *nvme_ctrlr; 5824 struct nvme_async_probe_ctx *ctx; 5825 int rc; 5826 5827 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5828 ctx->ctrlr_attached = true; 5829 5830 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5831 if (nvme_ctrlr) { 5832 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5833 } else { 5834 rc = -ENODEV; 5835 } 5836 5837 ctx->reported_bdevs = 0; 5838 populate_namespaces_cb(ctx, rc); 5839 } 5840 5841 static int 5842 bdev_nvme_async_poll(void *arg) 5843 { 5844 struct nvme_async_probe_ctx *ctx = arg; 5845 int rc; 5846 5847 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5848 if (spdk_unlikely(rc != -EAGAIN)) { 5849 ctx->probe_done = true; 5850 spdk_poller_unregister(&ctx->poller); 5851 if (!ctx->ctrlr_attached) { 5852 /* The probe is done, but no controller was attached. 5853 * That means we had a failure, so report -EIO back to 5854 * the caller (usually the RPC). populate_namespaces_cb() 5855 * will take care of freeing the nvme_async_probe_ctx. 5856 */ 5857 ctx->reported_bdevs = 0; 5858 populate_namespaces_cb(ctx, -EIO); 5859 } else if (ctx->namespaces_populated) { 5860 /* The namespaces for the attached controller were all 5861 * populated and the response was already sent to the 5862 * caller (usually the RPC). So free the context here. 
5863 */ 5864 free(ctx); 5865 } 5866 } 5867 5868 return SPDK_POLLER_BUSY; 5869 } 5870 5871 static bool 5872 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5873 uint32_t reconnect_delay_sec, 5874 uint32_t fast_io_fail_timeout_sec) 5875 { 5876 if (ctrlr_loss_timeout_sec < -1) { 5877 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5878 return false; 5879 } else if (ctrlr_loss_timeout_sec == -1) { 5880 if (reconnect_delay_sec == 0) { 5881 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5882 return false; 5883 } else if (fast_io_fail_timeout_sec != 0 && 5884 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5885 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 5886 return false; 5887 } 5888 } else if (ctrlr_loss_timeout_sec != 0) { 5889 if (reconnect_delay_sec == 0) { 5890 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5891 return false; 5892 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5893 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5894 return false; 5895 } else if (fast_io_fail_timeout_sec != 0) { 5896 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5897 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5898 return false; 5899 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5900 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5901 return false; 5902 } 5903 } 5904 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5905 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5906 return false; 5907 } 5908 5909 return true; 5910 } 5911 5912 int 5913 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5914 const char *base_name, 5915 const char **names, 5916 uint32_t count, 5917 spdk_bdev_create_nvme_fn cb_fn, 5918 void *cb_ctx, 5919 struct spdk_nvme_ctrlr_opts *drv_opts, 5920 struct nvme_ctrlr_opts *bdev_opts, 5921 bool multipath) 5922 { 5923 struct nvme_probe_skip_entry *entry, *tmp; 5924 struct nvme_async_probe_ctx *ctx; 5925 spdk_nvme_attach_cb attach_cb; 5926 5927 /* TODO expand this check to include both the host and target TRIDs. 5928 * Only if both are the same should we fail. 
5929 */ 5930 if (nvme_ctrlr_get(trid) != NULL) { 5931 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5932 return -EEXIST; 5933 } 5934 5935 if (bdev_opts != NULL && 5936 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5937 bdev_opts->reconnect_delay_sec, 5938 bdev_opts->fast_io_fail_timeout_sec)) { 5939 return -EINVAL; 5940 } 5941 5942 ctx = calloc(1, sizeof(*ctx)); 5943 if (!ctx) { 5944 return -ENOMEM; 5945 } 5946 ctx->base_name = base_name; 5947 ctx->names = names; 5948 ctx->max_bdevs = count; 5949 ctx->cb_fn = cb_fn; 5950 ctx->cb_ctx = cb_ctx; 5951 ctx->trid = *trid; 5952 5953 if (bdev_opts) { 5954 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5955 } else { 5956 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5957 } 5958 5959 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5960 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5961 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5962 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5963 free(entry); 5964 break; 5965 } 5966 } 5967 } 5968 5969 if (drv_opts) { 5970 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5971 } else { 5972 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5973 } 5974 5975 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5976 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5977 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5978 ctx->drv_opts.disable_read_ana_log_page = true; 5979 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5980 5981 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5982 attach_cb = connect_attach_cb; 5983 } else { 5984 attach_cb = connect_set_failover_cb; 5985 } 5986 5987 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5988 if (ctx->probe_ctx == NULL) { 5989 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5990 free(ctx); 5991 return -ENODEV; 5992 } 5993 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5994 5995 return 0; 5996 } 5997 5998 struct bdev_nvme_delete_ctx { 5999 char *name; 6000 struct nvme_path_id path_id; 6001 bdev_nvme_delete_done_fn delete_done; 6002 void *delete_done_ctx; 6003 uint64_t timeout_ticks; 6004 struct spdk_poller *poller; 6005 }; 6006 6007 static void 6008 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6009 { 6010 if (ctx != NULL) { 6011 free(ctx->name); 6012 free(ctx); 6013 } 6014 } 6015 6016 static bool 6017 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6018 { 6019 if (path_id->trid.trtype != 0) { 6020 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6021 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6022 return false; 6023 } 6024 } else { 6025 if (path_id->trid.trtype != p->trid.trtype) { 6026 return false; 6027 } 6028 } 6029 } 6030 6031 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6032 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6033 return false; 6034 } 6035 } 6036 6037 if (path_id->trid.adrfam != 0) { 6038 if (path_id->trid.adrfam != p->trid.adrfam) { 6039 return false; 6040 } 6041 } 6042 6043 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6044 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6045 return false; 6046 } 6047 } 6048 6049 if 
(!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6050 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6051 return false; 6052 } 6053 } 6054 6055 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6056 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6057 return false; 6058 } 6059 } 6060 6061 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6062 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6063 return false; 6064 } 6065 } 6066 6067 return true; 6068 } 6069 6070 static bool 6071 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6072 { 6073 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6074 struct nvme_ctrlr *ctrlr; 6075 struct nvme_path_id *p; 6076 6077 pthread_mutex_lock(&g_bdev_nvme_mutex); 6078 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6079 if (!nbdev_ctrlr) { 6080 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6081 return false; 6082 } 6083 6084 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6085 pthread_mutex_lock(&ctrlr->mutex); 6086 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6087 if (nvme_path_id_compare(p, path_id)) { 6088 pthread_mutex_unlock(&ctrlr->mutex); 6089 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6090 return true; 6091 } 6092 } 6093 pthread_mutex_unlock(&ctrlr->mutex); 6094 } 6095 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6096 6097 return false; 6098 } 6099 6100 static int 6101 bdev_nvme_delete_complete_poll(void *arg) 6102 { 6103 struct bdev_nvme_delete_ctx *ctx = arg; 6104 int rc = 0; 6105 6106 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6107 if (ctx->timeout_ticks > spdk_get_ticks()) { 6108 return SPDK_POLLER_BUSY; 6109 } 6110 6111 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6112 rc = -ETIMEDOUT; 6113 } 6114 6115 spdk_poller_unregister(&ctx->poller); 6116 6117 ctx->delete_done(ctx->delete_done_ctx, rc); 6118 free_bdev_nvme_delete_ctx(ctx); 6119 6120 return SPDK_POLLER_BUSY; 6121 } 6122 6123 static int 6124 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6125 { 6126 struct nvme_path_id *p, *t; 6127 spdk_msg_fn msg_fn; 6128 int rc = -ENXIO; 6129 6130 pthread_mutex_lock(&nvme_ctrlr->mutex); 6131 6132 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6133 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6134 break; 6135 } 6136 6137 if (!nvme_path_id_compare(p, path_id)) { 6138 continue; 6139 } 6140 6141 /* We are not using the specified path. */ 6142 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6143 free(p); 6144 rc = 0; 6145 } 6146 6147 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6148 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6149 return rc; 6150 } 6151 6152 /* If we made it here, then this path is a match! Now we need to remove it. */ 6153 6154 /* This is the active path in use right now. The active path is always the first in the list. */ 6155 assert(p == nvme_ctrlr->active_path_id); 6156 6157 if (!TAILQ_NEXT(p, link)) { 6158 /* The current path is the only path. */ 6159 msg_fn = _nvme_ctrlr_destruct; 6160 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6161 } else { 6162 /* There is an alternative path. 
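 * Fail over to it instead of destructing the whole nvme_ctrlr, so I/O can
 * continue on the remaining path once the reset completes.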
*/ 6163 msg_fn = _bdev_nvme_reset_ctrlr; 6164 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6165 } 6166 6167 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6168 6169 if (rc == 0) { 6170 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6171 } else if (rc == -EALREADY) { 6172 rc = 0; 6173 } 6174 6175 return rc; 6176 } 6177 6178 int 6179 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6180 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6181 { 6182 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6183 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6184 struct bdev_nvme_delete_ctx *ctx = NULL; 6185 int rc = -ENXIO, _rc; 6186 6187 if (name == NULL || path_id == NULL) { 6188 rc = -EINVAL; 6189 goto exit; 6190 } 6191 6192 pthread_mutex_lock(&g_bdev_nvme_mutex); 6193 6194 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6195 if (nbdev_ctrlr == NULL) { 6196 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6197 6198 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6199 rc = -ENODEV; 6200 goto exit; 6201 } 6202 6203 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6204 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6205 if (_rc < 0 && _rc != -ENXIO) { 6206 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6207 rc = _rc; 6208 goto exit; 6209 } else if (_rc == 0) { 6210 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6211 * was deleted successfully. To remember the successful deletion, 6212 * overwrite rc only if _rc is zero. 6213 */ 6214 rc = 0; 6215 } 6216 } 6217 6218 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6219 6220 if (rc != 0 || delete_done == NULL) { 6221 goto exit; 6222 } 6223 6224 ctx = calloc(1, sizeof(*ctx)); 6225 if (ctx == NULL) { 6226 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6227 rc = -ENOMEM; 6228 goto exit; 6229 } 6230 6231 ctx->name = strdup(name); 6232 if (ctx->name == NULL) { 6233 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6234 rc = -ENOMEM; 6235 goto exit; 6236 } 6237 6238 ctx->delete_done = delete_done; 6239 ctx->delete_done_ctx = delete_done_ctx; 6240 ctx->path_id = *path_id; 6241 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6242 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6243 if (ctx->poller == NULL) { 6244 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6245 rc = -ENOMEM; 6246 goto exit; 6247 } 6248 6249 exit: 6250 if (rc != 0) { 6251 free_bdev_nvme_delete_ctx(ctx); 6252 } 6253 6254 return rc; 6255 } 6256 6257 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6258 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6259 6260 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6261 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6262 6263 struct discovery_entry_ctx { 6264 char name[128]; 6265 struct spdk_nvme_transport_id trid; 6266 struct spdk_nvme_ctrlr_opts drv_opts; 6267 struct spdk_nvmf_discovery_log_page_entry entry; 6268 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6269 struct discovery_ctx *ctx; 6270 }; 6271 6272 struct discovery_ctx { 6273 char *name; 6274 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6275 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6276 void *cb_ctx; 6277 struct spdk_nvme_probe_ctx *probe_ctx; 6278 struct spdk_nvme_detach_ctx *detach_ctx; 6279 struct spdk_nvme_ctrlr *ctrlr; 6280 struct spdk_nvme_transport_id trid; 6281 struct discovery_entry_ctx *entry_ctx_in_use; 6282 struct spdk_poller *poller; 6283 struct spdk_nvme_ctrlr_opts drv_opts; 6284 struct nvme_ctrlr_opts bdev_opts; 6285 struct spdk_nvmf_discovery_log_page *log_page; 6286 TAILQ_ENTRY(discovery_ctx) tailq; 6287 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6288 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6289 int rc; 6290 bool wait_for_attach; 6291 uint64_t timeout_ticks; 6292 /* Denotes that the discovery service is being started. We're waiting 6293 * for the initial connection to the discovery controller to be 6294 * established and attach discovered NVM ctrlrs. 6295 */ 6296 bool initializing; 6297 /* Denotes if a discovery is currently in progress for this context. 6298 * That includes connecting to newly discovered subsystems. Used to 6299 * ensure we do not start a new discovery until an existing one is 6300 * complete. 6301 */ 6302 bool in_progress; 6303 6304 /* Denotes if another discovery is needed after the one in progress 6305 * completes. Set when we receive an AER completion while a discovery 6306 * is already in progress. 6307 */ 6308 bool pending; 6309 6310 /* Signal to the discovery context poller that it should stop the 6311 * discovery service, including detaching from the current discovery 6312 * controller. 6313 */ 6314 bool stop; 6315 6316 struct spdk_thread *calling_thread; 6317 uint32_t index; 6318 uint32_t attach_in_progress; 6319 char *hostnqn; 6320 6321 /* Denotes if the discovery service was started by the mdns discovery. 
6322 */ 6323 bool from_mdns_discovery_service; 6324 }; 6325 6326 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6327 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6328 6329 static void get_discovery_log_page(struct discovery_ctx *ctx); 6330 6331 static void 6332 free_discovery_ctx(struct discovery_ctx *ctx) 6333 { 6334 free(ctx->log_page); 6335 free(ctx->hostnqn); 6336 free(ctx->name); 6337 free(ctx); 6338 } 6339 6340 static void 6341 discovery_complete(struct discovery_ctx *ctx) 6342 { 6343 ctx->initializing = false; 6344 ctx->in_progress = false; 6345 if (ctx->pending) { 6346 ctx->pending = false; 6347 get_discovery_log_page(ctx); 6348 } 6349 } 6350 6351 static void 6352 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6353 struct spdk_nvmf_discovery_log_page_entry *entry) 6354 { 6355 char *space; 6356 6357 trid->trtype = entry->trtype; 6358 trid->adrfam = entry->adrfam; 6359 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6360 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6361 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6362 * before call to this function trid->subnqn is zeroed out, we need 6363 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6364 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6365 */ 6366 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6367 6368 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6369 * But the log page entries typically pad them with spaces, not zeroes. 6370 * So add a NULL terminator to each of these fields at the appropriate 6371 * location. 6372 */ 6373 space = strchr(trid->traddr, ' '); 6374 if (space) { 6375 *space = 0; 6376 } 6377 space = strchr(trid->trsvcid, ' '); 6378 if (space) { 6379 *space = 0; 6380 } 6381 space = strchr(trid->subnqn, ' '); 6382 if (space) { 6383 *space = 0; 6384 } 6385 } 6386 6387 static void 6388 _stop_discovery(void *_ctx) 6389 { 6390 struct discovery_ctx *ctx = _ctx; 6391 6392 if (ctx->attach_in_progress > 0) { 6393 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6394 return; 6395 } 6396 6397 ctx->stop = true; 6398 6399 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6400 struct discovery_entry_ctx *entry_ctx; 6401 struct nvme_path_id path = {}; 6402 6403 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6404 path.trid = entry_ctx->trid; 6405 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6406 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6407 free(entry_ctx); 6408 } 6409 6410 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6411 struct discovery_entry_ctx *entry_ctx; 6412 6413 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6414 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6415 free(entry_ctx); 6416 } 6417 6418 free(ctx->entry_ctx_in_use); 6419 ctx->entry_ctx_in_use = NULL; 6420 } 6421 6422 static void 6423 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6424 { 6425 ctx->stop_cb_fn = cb_fn; 6426 ctx->cb_ctx = cb_ctx; 6427 6428 if (ctx->attach_in_progress > 0) { 6429 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6430 ctx->attach_in_progress); 6431 } 6432 6433 _stop_discovery(ctx); 6434 } 6435 6436 static void 6437 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6438 { 6439 struct discovery_ctx *d_ctx; 6440 struct nvme_path_id *path_id; 6441 struct spdk_nvme_transport_id 
trid = {}; 6442 struct discovery_entry_ctx *entry_ctx, *tmp; 6443 6444 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6445 6446 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6447 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6448 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6449 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6450 continue; 6451 } 6452 6453 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6454 free(entry_ctx); 6455 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6456 trid.subnqn, trid.traddr, trid.trsvcid); 6457 6458 /* Fail discovery ctrlr to force reattach attempt */ 6459 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6460 } 6461 } 6462 } 6463 6464 static void 6465 discovery_remove_controllers(struct discovery_ctx *ctx) 6466 { 6467 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6468 struct discovery_entry_ctx *entry_ctx, *tmp; 6469 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6470 struct spdk_nvme_transport_id old_trid = {}; 6471 uint64_t numrec, i; 6472 bool found; 6473 6474 numrec = from_le64(&log_page->numrec); 6475 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6476 found = false; 6477 old_entry = &entry_ctx->entry; 6478 build_trid_from_log_page_entry(&old_trid, old_entry); 6479 for (i = 0; i < numrec; i++) { 6480 new_entry = &log_page->entries[i]; 6481 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6482 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6483 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6484 found = true; 6485 break; 6486 } 6487 } 6488 if (!found) { 6489 struct nvme_path_id path = {}; 6490 6491 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6492 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6493 6494 path.trid = entry_ctx->trid; 6495 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6496 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6497 free(entry_ctx); 6498 } 6499 } 6500 free(log_page); 6501 ctx->log_page = NULL; 6502 discovery_complete(ctx); 6503 } 6504 6505 static void 6506 complete_discovery_start(struct discovery_ctx *ctx, int status) 6507 { 6508 ctx->timeout_ticks = 0; 6509 ctx->rc = status; 6510 if (ctx->start_cb_fn) { 6511 ctx->start_cb_fn(ctx->cb_ctx, status); 6512 ctx->start_cb_fn = NULL; 6513 ctx->cb_ctx = NULL; 6514 } 6515 } 6516 6517 static void 6518 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6519 { 6520 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6521 struct discovery_ctx *ctx = entry_ctx->ctx; 6522 6523 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6524 ctx->attach_in_progress--; 6525 if (ctx->attach_in_progress == 0) { 6526 complete_discovery_start(ctx, ctx->rc); 6527 if (ctx->initializing && ctx->rc != 0) { 6528 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6529 stop_discovery(ctx, NULL, ctx->cb_ctx); 6530 } else { 6531 discovery_remove_controllers(ctx); 6532 } 6533 } 6534 } 6535 6536 static struct discovery_entry_ctx * 6537 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6538 { 6539 struct discovery_entry_ctx *new_ctx; 6540 6541 new_ctx = calloc(1, sizeof(*new_ctx)); 6542 if (new_ctx == NULL) { 6543 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6544 return NULL; 6545 } 6546 6547 new_ctx->ctx = ctx; 6548 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6549 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
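/* Each entry keeps its own copy of the ctrlr opts with the discovery service's hostnqn filled in. */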
6550 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6551 return new_ctx; 6552 } 6553 6554 static void 6555 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6556 struct spdk_nvmf_discovery_log_page *log_page) 6557 { 6558 struct discovery_ctx *ctx = cb_arg; 6559 struct discovery_entry_ctx *entry_ctx, *tmp; 6560 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6561 uint64_t numrec, i; 6562 bool found; 6563 6564 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6565 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6566 return; 6567 } 6568 6569 ctx->log_page = log_page; 6570 assert(ctx->attach_in_progress == 0); 6571 numrec = from_le64(&log_page->numrec); 6572 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6573 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6574 free(entry_ctx); 6575 } 6576 for (i = 0; i < numrec; i++) { 6577 found = false; 6578 new_entry = &log_page->entries[i]; 6579 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6580 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6581 struct discovery_entry_ctx *new_ctx; 6582 struct spdk_nvme_transport_id trid = {}; 6583 6584 build_trid_from_log_page_entry(&trid, new_entry); 6585 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6586 if (new_ctx == NULL) { 6587 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6588 break; 6589 } 6590 6591 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6592 continue; 6593 } 6594 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6595 old_entry = &entry_ctx->entry; 6596 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6597 found = true; 6598 break; 6599 } 6600 } 6601 if (!found) { 6602 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6603 struct discovery_ctx *d_ctx; 6604 6605 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6606 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6607 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6608 sizeof(new_entry->subnqn))) { 6609 break; 6610 } 6611 } 6612 if (subnqn_ctx) { 6613 break; 6614 } 6615 } 6616 6617 new_ctx = calloc(1, sizeof(*new_ctx)); 6618 if (new_ctx == NULL) { 6619 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6620 break; 6621 } 6622 6623 new_ctx->ctx = ctx; 6624 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6625 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6626 if (subnqn_ctx) { 6627 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6628 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6629 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6630 new_ctx->name); 6631 } else { 6632 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6633 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6634 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6635 new_ctx->name); 6636 } 6637 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6638 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6639 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6640 discovery_attach_controller_done, new_ctx, 6641 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6642 if (rc == 0) { 6643 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6644 ctx->attach_in_progress++; 6645 } else { 6646 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6647 } 6648 } 6649 } 6650 6651 if (ctx->attach_in_progress == 0) { 6652 discovery_remove_controllers(ctx); 6653 } 6654 } 6655 6656 static void 6657 get_discovery_log_page(struct discovery_ctx *ctx) 6658 { 6659 int rc; 6660 6661 assert(ctx->in_progress == false); 6662 ctx->in_progress = true; 6663 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6664 if (rc != 0) { 6665 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6666 } 6667 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6668 } 6669 6670 static void 6671 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6672 { 6673 struct discovery_ctx *ctx = arg; 6674 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6675 6676 if (spdk_nvme_cpl_is_error(cpl)) { 6677 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6678 return; 6679 } 6680 6681 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6682 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6683 return; 6684 } 6685 6686 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6687 if (ctx->in_progress) { 6688 ctx->pending = true; 6689 return; 6690 } 6691 6692 get_discovery_log_page(ctx); 6693 } 6694 6695 static void 6696 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6697 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6698 { 6699 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6700 struct discovery_ctx *ctx; 6701 6702 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6703 6704 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6705 ctx->probe_ctx = NULL; 6706 ctx->ctrlr = ctrlr; 6707 6708 if (ctx->rc != 0) { 6709 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6710 ctx->rc); 6711 return; 6712 } 6713 6714 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6715 } 6716 6717 static int 6718 discovery_poller(void *arg) 6719 { 6720 struct discovery_ctx *ctx = arg; 6721 struct spdk_nvme_transport_id *trid; 6722 int rc; 6723 6724 if (ctx->detach_ctx) { 6725 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6726 if (rc != -EAGAIN) { 6727 ctx->detach_ctx = NULL; 6728 ctx->ctrlr = NULL; 6729 } 6730 } else if (ctx->stop) { 6731 if (ctx->ctrlr != NULL) { 6732 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6733 if (rc == 0) { 6734 return SPDK_POLLER_BUSY; 6735 } 6736 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6737 } 6738 spdk_poller_unregister(&ctx->poller); 6739 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6740 assert(ctx->start_cb_fn == NULL); 6741 if (ctx->stop_cb_fn != NULL) { 6742 ctx->stop_cb_fn(ctx->cb_ctx); 6743 } 6744 free_discovery_ctx(ctx); 6745 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6746 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6747 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6748 assert(ctx->initializing); 6749 spdk_poller_unregister(&ctx->poller); 6750 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6751 complete_discovery_start(ctx, -ETIMEDOUT); 6752 stop_discovery(ctx, NULL, NULL); 6753 free_discovery_ctx(ctx); 6754 return SPDK_POLLER_BUSY; 6755 } 6756 6757 assert(ctx->entry_ctx_in_use == NULL); 6758 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6759 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6760 trid = &ctx->entry_ctx_in_use->trid; 6761 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6762 if 
(ctx->probe_ctx) { 6763 spdk_poller_unregister(&ctx->poller); 6764 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6765 } else { 6766 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6767 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6768 ctx->entry_ctx_in_use = NULL; 6769 } 6770 } else if (ctx->probe_ctx) { 6771 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6772 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6773 complete_discovery_start(ctx, -ETIMEDOUT); 6774 return SPDK_POLLER_BUSY; 6775 } 6776 6777 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6778 if (rc != -EAGAIN) { 6779 if (ctx->rc != 0) { 6780 assert(ctx->initializing); 6781 stop_discovery(ctx, NULL, ctx->cb_ctx); 6782 } else { 6783 assert(rc == 0); 6784 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6785 ctx->rc = rc; 6786 get_discovery_log_page(ctx); 6787 } 6788 } 6789 } else { 6790 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6791 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6792 complete_discovery_start(ctx, -ETIMEDOUT); 6793 /* We need to wait until all NVM ctrlrs are attached before we stop the 6794 * discovery service to make sure we don't detach a ctrlr that is still 6795 * being attached. 6796 */ 6797 if (ctx->attach_in_progress == 0) { 6798 stop_discovery(ctx, NULL, ctx->cb_ctx); 6799 return SPDK_POLLER_BUSY; 6800 } 6801 } 6802 6803 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6804 if (rc < 0) { 6805 spdk_poller_unregister(&ctx->poller); 6806 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6807 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6808 ctx->entry_ctx_in_use = NULL; 6809 6810 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6811 if (rc != 0) { 6812 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6813 ctx->ctrlr = NULL; 6814 } 6815 } 6816 } 6817 6818 return SPDK_POLLER_BUSY; 6819 } 6820 6821 static void 6822 start_discovery_poller(void *arg) 6823 { 6824 struct discovery_ctx *ctx = arg; 6825 6826 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6827 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6828 } 6829 6830 int 6831 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6832 const char *base_name, 6833 struct spdk_nvme_ctrlr_opts *drv_opts, 6834 struct nvme_ctrlr_opts *bdev_opts, 6835 uint64_t attach_timeout, 6836 bool from_mdns, 6837 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6838 { 6839 struct discovery_ctx *ctx; 6840 struct discovery_entry_ctx *discovery_entry_ctx; 6841 6842 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6843 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6844 if (strcmp(ctx->name, base_name) == 0) { 6845 return -EEXIST; 6846 } 6847 6848 if (ctx->entry_ctx_in_use != NULL) { 6849 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6850 return -EEXIST; 6851 } 6852 } 6853 6854 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6855 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6856 return -EEXIST; 6857 } 6858 } 6859 } 6860 6861 ctx = calloc(1, sizeof(*ctx)); 6862 if (ctx == NULL) { 6863 return -ENOMEM; 6864 } 6865 6866 ctx->name = strdup(base_name); 6867 if (ctx->name == NULL) { 6868 free_discovery_ctx(ctx); 6869 return -ENOMEM; 6870 } 6871 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
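/* bdev_opts is copied as well; from_discovery_service is forced to true below so that controllers attached through this context are flagged as coming from a discovery service. */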
6872 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6873 ctx->from_mdns_discovery_service = from_mdns; 6874 ctx->bdev_opts.from_discovery_service = true; 6875 ctx->calling_thread = spdk_get_thread(); 6876 ctx->start_cb_fn = cb_fn; 6877 ctx->cb_ctx = cb_ctx; 6878 ctx->initializing = true; 6879 if (ctx->start_cb_fn) { 6880 /* We can use this when dumping json to denote if this RPC parameter 6881 * was specified or not. 6882 */ 6883 ctx->wait_for_attach = true; 6884 } 6885 if (attach_timeout != 0) { 6886 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6887 spdk_get_ticks_hz() / 1000ull; 6888 } 6889 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6890 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6891 memcpy(&ctx->trid, trid, sizeof(*trid)); 6892 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6893 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6894 if (ctx->hostnqn == NULL) { 6895 free_discovery_ctx(ctx); 6896 return -ENOMEM; 6897 } 6898 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6899 if (discovery_entry_ctx == NULL) { 6900 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6901 free_discovery_ctx(ctx); 6902 return -ENOMEM; 6903 } 6904 6905 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6906 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6907 return 0; 6908 } 6909 6910 int 6911 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6912 { 6913 struct discovery_ctx *ctx; 6914 6915 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6916 if (strcmp(name, ctx->name) == 0) { 6917 if (ctx->stop) { 6918 return -EALREADY; 6919 } 6920 /* If we're still starting the discovery service and ->rc is non-zero, we're 6921 * going to stop it as soon as we can 6922 */ 6923 if (ctx->initializing && ctx->rc != 0) { 6924 return -EALREADY; 6925 } 6926 stop_discovery(ctx, cb_fn, cb_ctx); 6927 return 0; 6928 } 6929 } 6930 6931 return -ENOENT; 6932 } 6933 6934 static int 6935 bdev_nvme_library_init(void) 6936 { 6937 g_bdev_nvme_init_thread = spdk_get_thread(); 6938 6939 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6940 bdev_nvme_destroy_poll_group_cb, 6941 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6942 6943 return 0; 6944 } 6945 6946 static void 6947 bdev_nvme_fini_destruct_ctrlrs(void) 6948 { 6949 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6950 struct nvme_ctrlr *nvme_ctrlr; 6951 6952 pthread_mutex_lock(&g_bdev_nvme_mutex); 6953 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6954 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6955 pthread_mutex_lock(&nvme_ctrlr->mutex); 6956 if (nvme_ctrlr->destruct) { 6957 /* This controller's destruction was already started 6958 * before the application started shutting down 6959 */ 6960 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6961 continue; 6962 } 6963 nvme_ctrlr->destruct = true; 6964 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6965 6966 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6967 nvme_ctrlr); 6968 } 6969 } 6970 6971 g_bdev_nvme_module_finish = true; 6972 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6973 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6974 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6975 spdk_bdev_module_fini_done(); 6976 return; 6977 } 6978 6979 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6980 } 6981 6982 static void 6983 check_discovery_fini(void *arg) 6984 { 6985 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6986 bdev_nvme_fini_destruct_ctrlrs(); 
6987 } 6988 } 6989 6990 static void 6991 bdev_nvme_library_fini(void) 6992 { 6993 struct nvme_probe_skip_entry *entry, *entry_tmp; 6994 struct discovery_ctx *ctx; 6995 6996 spdk_poller_unregister(&g_hotplug_poller); 6997 free(g_hotplug_probe_ctx); 6998 g_hotplug_probe_ctx = NULL; 6999 7000 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7001 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7002 free(entry); 7003 } 7004 7005 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7006 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7007 bdev_nvme_fini_destruct_ctrlrs(); 7008 } else { 7009 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7010 stop_discovery(ctx, check_discovery_fini, NULL); 7011 } 7012 } 7013 } 7014 7015 static void 7016 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7017 { 7018 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7019 struct spdk_bdev *bdev = bdev_io->bdev; 7020 struct spdk_dif_ctx dif_ctx; 7021 struct spdk_dif_error err_blk = {}; 7022 int rc; 7023 struct spdk_dif_ctx_init_ext_opts dif_opts; 7024 7025 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7026 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7027 rc = spdk_dif_ctx_init(&dif_ctx, 7028 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7029 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 7030 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7031 if (rc != 0) { 7032 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7033 return; 7034 } 7035 7036 if (bdev->md_interleave) { 7037 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7038 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7039 } else { 7040 struct iovec md_iov = { 7041 .iov_base = bdev_io->u.bdev.md_buf, 7042 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7043 }; 7044 7045 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7046 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7047 } 7048 7049 if (rc != 0) { 7050 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7051 err_blk.err_type, err_blk.err_offset); 7052 } else { 7053 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7054 } 7055 } 7056 7057 static void 7058 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7059 { 7060 struct nvme_bdev_io *bio = ref; 7061 7062 if (spdk_nvme_cpl_is_success(cpl)) { 7063 /* Run PI verification for read data buffer. */ 7064 bdev_nvme_verify_pi_error(bio); 7065 } 7066 7067 /* Return original completion status */ 7068 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7069 } 7070 7071 static void 7072 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7073 { 7074 struct nvme_bdev_io *bio = ref; 7075 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7076 int ret; 7077 7078 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7079 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7080 cpl->status.sct, cpl->status.sc); 7081 7082 /* Save completion status to use after verifying PI error. */ 7083 bio->cpl = *cpl; 7084 7085 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7086 /* Read without PI checking to verify PI error. 
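 * The failing completion was saved in bio->cpl above; bdev_nvme_no_pi_readv_done()
 * re-runs the PI verification and then completes the I/O with that original status.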
*/ 7087 ret = bdev_nvme_no_pi_readv(bio, 7088 bdev_io->u.bdev.iovs, 7089 bdev_io->u.bdev.iovcnt, 7090 bdev_io->u.bdev.md_buf, 7091 bdev_io->u.bdev.num_blocks, 7092 bdev_io->u.bdev.offset_blocks); 7093 if (ret == 0) { 7094 return; 7095 } 7096 } 7097 } 7098 7099 bdev_nvme_io_complete_nvme_status(bio, cpl); 7100 } 7101 7102 static void 7103 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7104 { 7105 struct nvme_bdev_io *bio = ref; 7106 7107 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7108 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7109 cpl->status.sct, cpl->status.sc); 7110 /* Run PI verification for write data buffer if PI error is detected. */ 7111 bdev_nvme_verify_pi_error(bio); 7112 } 7113 7114 bdev_nvme_io_complete_nvme_status(bio, cpl); 7115 } 7116 7117 static void 7118 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7119 { 7120 struct nvme_bdev_io *bio = ref; 7121 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7122 7123 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7124 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7125 */ 7126 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7127 7128 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7129 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7130 cpl->status.sct, cpl->status.sc); 7131 /* Run PI verification for zone append data buffer if PI error is detected. */ 7132 bdev_nvme_verify_pi_error(bio); 7133 } 7134 7135 bdev_nvme_io_complete_nvme_status(bio, cpl); 7136 } 7137 7138 static void 7139 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7140 { 7141 struct nvme_bdev_io *bio = ref; 7142 7143 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7144 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7145 cpl->status.sct, cpl->status.sc); 7146 /* Run PI verification for compare data buffer if PI error is detected. */ 7147 bdev_nvme_verify_pi_error(bio); 7148 } 7149 7150 bdev_nvme_io_complete_nvme_status(bio, cpl); 7151 } 7152 7153 static void 7154 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7155 { 7156 struct nvme_bdev_io *bio = ref; 7157 7158 /* Compare operation completion */ 7159 if (!bio->first_fused_completed) { 7160 /* Save compare result for write callback */ 7161 bio->cpl = *cpl; 7162 bio->first_fused_completed = true; 7163 return; 7164 } 7165 7166 /* Write operation completion */ 7167 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7168 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7169 * complete the IO with the compare operation's status. 
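 * bio->cpl was captured when the compare (the first fused command) completed.
 * A successful write after a failed compare is unexpected and is only logged.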
7170 */ 7171 if (!spdk_nvme_cpl_is_error(cpl)) { 7172 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7173 } 7174 7175 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7176 } else { 7177 bdev_nvme_io_complete_nvme_status(bio, cpl); 7178 } 7179 } 7180 7181 static void 7182 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7183 { 7184 struct nvme_bdev_io *bio = ref; 7185 7186 bdev_nvme_io_complete_nvme_status(bio, cpl); 7187 } 7188 7189 static int 7190 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7191 { 7192 switch (desc->zt) { 7193 case SPDK_NVME_ZONE_TYPE_SEQWR: 7194 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7195 break; 7196 default: 7197 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7198 return -EIO; 7199 } 7200 7201 switch (desc->zs) { 7202 case SPDK_NVME_ZONE_STATE_EMPTY: 7203 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7204 break; 7205 case SPDK_NVME_ZONE_STATE_IOPEN: 7206 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7207 break; 7208 case SPDK_NVME_ZONE_STATE_EOPEN: 7209 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7210 break; 7211 case SPDK_NVME_ZONE_STATE_CLOSED: 7212 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7213 break; 7214 case SPDK_NVME_ZONE_STATE_RONLY: 7215 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7216 break; 7217 case SPDK_NVME_ZONE_STATE_FULL: 7218 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7219 break; 7220 case SPDK_NVME_ZONE_STATE_OFFLINE: 7221 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7222 break; 7223 default: 7224 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7225 return -EIO; 7226 } 7227 7228 info->zone_id = desc->zslba; 7229 info->write_pointer = desc->wp; 7230 info->capacity = desc->zcap; 7231 7232 return 0; 7233 } 7234 7235 static void 7236 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7237 { 7238 struct nvme_bdev_io *bio = ref; 7239 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7240 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7241 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7242 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7243 uint64_t max_zones_per_buf, i; 7244 uint32_t zone_report_bufsize; 7245 struct spdk_nvme_ns *ns; 7246 struct spdk_nvme_qpair *qpair; 7247 int ret; 7248 7249 if (spdk_nvme_cpl_is_error(cpl)) { 7250 goto out_complete_io_nvme_cpl; 7251 } 7252 7253 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7254 ret = -ENXIO; 7255 goto out_complete_io_ret; 7256 } 7257 7258 ns = bio->io_path->nvme_ns->ns; 7259 qpair = bio->io_path->qpair->qpair; 7260 7261 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7262 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7263 sizeof(bio->zone_report_buf->descs[0]); 7264 7265 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7266 ret = -EINVAL; 7267 goto out_complete_io_ret; 7268 } 7269 7270 if (!bio->zone_report_buf->nr_zones) { 7271 ret = -EINVAL; 7272 goto out_complete_io_ret; 7273 } 7274 7275 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7276 ret = fill_zone_from_report(&info[bio->handled_zones], 7277 &bio->zone_report_buf->descs[i]); 7278 if (ret) { 7279 goto out_complete_io_ret; 7280 } 7281 bio->handled_zones++; 7282 } 7283 7284 if (bio->handled_zones < zones_to_copy) { 7285 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7286 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7287 
7288 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7289 ret = spdk_nvme_zns_report_zones(ns, qpair, 7290 bio->zone_report_buf, zone_report_bufsize, 7291 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7292 bdev_nvme_get_zone_info_done, bio); 7293 if (!ret) { 7294 return; 7295 } else { 7296 goto out_complete_io_ret; 7297 } 7298 } 7299 7300 out_complete_io_nvme_cpl: 7301 free(bio->zone_report_buf); 7302 bio->zone_report_buf = NULL; 7303 bdev_nvme_io_complete_nvme_status(bio, cpl); 7304 return; 7305 7306 out_complete_io_ret: 7307 free(bio->zone_report_buf); 7308 bio->zone_report_buf = NULL; 7309 bdev_nvme_io_complete(bio, ret); 7310 } 7311 7312 static void 7313 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7314 { 7315 struct nvme_bdev_io *bio = ref; 7316 7317 bdev_nvme_io_complete_nvme_status(bio, cpl); 7318 } 7319 7320 static void 7321 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7322 { 7323 struct nvme_bdev_io *bio = ctx; 7324 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7325 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7326 7327 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7328 7329 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7330 } 7331 7332 static void 7333 bdev_nvme_abort_complete(void *ctx) 7334 { 7335 struct nvme_bdev_io *bio = ctx; 7336 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7337 7338 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7339 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7340 } else { 7341 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7342 } 7343 } 7344 7345 static void 7346 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7347 { 7348 struct nvme_bdev_io *bio = ref; 7349 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7350 7351 bio->cpl = *cpl; 7352 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7353 } 7354 7355 static void 7356 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7357 { 7358 struct nvme_bdev_io *bio = ref; 7359 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7360 7361 bio->cpl = *cpl; 7362 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7363 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7364 } 7365 7366 static void 7367 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7368 { 7369 struct nvme_bdev_io *bio = ref; 7370 struct iovec *iov; 7371 7372 bio->iov_offset = sgl_offset; 7373 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7374 iov = &bio->iovs[bio->iovpos]; 7375 if (bio->iov_offset < iov->iov_len) { 7376 break; 7377 } 7378 7379 bio->iov_offset -= iov->iov_len; 7380 } 7381 } 7382 7383 static int 7384 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7385 { 7386 struct nvme_bdev_io *bio = ref; 7387 struct iovec *iov; 7388 7389 assert(bio->iovpos < bio->iovcnt); 7390 7391 iov = &bio->iovs[bio->iovpos]; 7392 7393 *address = iov->iov_base; 7394 *length = iov->iov_len; 7395 7396 if (bio->iov_offset) { 7397 assert(bio->iov_offset <= iov->iov_len); 7398 *address += bio->iov_offset; 7399 *length -= bio->iov_offset; 7400 } 7401 7402 bio->iov_offset += *length; 7403 if (bio->iov_offset == iov->iov_len) { 7404 bio->iovpos++; 7405 bio->iov_offset = 0; 7406 } 7407 7408 return 0; 7409 } 7410 7411 static void 7412 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7413 { 7414 struct nvme_bdev_io *bio = ref; 7415 struct iovec *iov; 7416 7417 bio->fused_iov_offset = sgl_offset; 
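/* Walk the fused (write) iovec array until the requested SGL offset falls inside the current element; the remainder becomes the offset within that iovec. */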
7418 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7419 iov = &bio->fused_iovs[bio->fused_iovpos]; 7420 if (bio->fused_iov_offset < iov->iov_len) { 7421 break; 7422 } 7423 7424 bio->fused_iov_offset -= iov->iov_len; 7425 } 7426 } 7427 7428 static int 7429 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7430 { 7431 struct nvme_bdev_io *bio = ref; 7432 struct iovec *iov; 7433 7434 assert(bio->fused_iovpos < bio->fused_iovcnt); 7435 7436 iov = &bio->fused_iovs[bio->fused_iovpos]; 7437 7438 *address = iov->iov_base; 7439 *length = iov->iov_len; 7440 7441 if (bio->fused_iov_offset) { 7442 assert(bio->fused_iov_offset <= iov->iov_len); 7443 *address += bio->fused_iov_offset; 7444 *length -= bio->fused_iov_offset; 7445 } 7446 7447 bio->fused_iov_offset += *length; 7448 if (bio->fused_iov_offset == iov->iov_len) { 7449 bio->fused_iovpos++; 7450 bio->fused_iov_offset = 0; 7451 } 7452 7453 return 0; 7454 } 7455 7456 static int 7457 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7458 void *md, uint64_t lba_count, uint64_t lba) 7459 { 7460 int rc; 7461 7462 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7463 lba_count, lba); 7464 7465 bio->iovs = iov; 7466 bio->iovcnt = iovcnt; 7467 bio->iovpos = 0; 7468 bio->iov_offset = 0; 7469 7470 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7471 bio->io_path->qpair->qpair, 7472 lba, lba_count, 7473 bdev_nvme_no_pi_readv_done, bio, 0, 7474 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7475 md, 0, 0); 7476 7477 if (rc != 0 && rc != -ENOMEM) { 7478 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7479 } 7480 return rc; 7481 } 7482 7483 static int 7484 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7485 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7486 struct spdk_memory_domain *domain, void *domain_ctx, 7487 struct spdk_accel_sequence *seq) 7488 { 7489 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7490 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7491 int rc; 7492 7493 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7494 lba_count, lba); 7495 7496 bio->iovs = iov; 7497 bio->iovcnt = iovcnt; 7498 bio->iovpos = 0; 7499 bio->iov_offset = 0; 7500 7501 if (domain != NULL || seq != NULL) { 7502 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7503 bio->ext_opts.memory_domain = domain; 7504 bio->ext_opts.memory_domain_ctx = domain_ctx; 7505 bio->ext_opts.io_flags = flags; 7506 bio->ext_opts.metadata = md; 7507 bio->ext_opts.accel_sequence = seq; 7508 7509 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7510 bdev_nvme_readv_done, bio, 7511 bdev_nvme_queued_reset_sgl, 7512 bdev_nvme_queued_next_sge, 7513 &bio->ext_opts); 7514 } else if (iovcnt == 1) { 7515 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7516 md, lba, lba_count, bdev_nvme_readv_done, 7517 bio, flags, 0, 0); 7518 } else { 7519 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7520 bdev_nvme_readv_done, bio, flags, 7521 bdev_nvme_queued_reset_sgl, 7522 bdev_nvme_queued_next_sge, md, 0, 0); 7523 } 7524 7525 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7526 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7527 } 7528 return rc; 7529 } 7530 7531 static int 7532 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7533 void *md, uint64_t lba_count, uint64_t lba, uint32_t 
flags, 7534 struct spdk_memory_domain *domain, void *domain_ctx, 7535 struct spdk_accel_sequence *seq) 7536 { 7537 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7538 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7539 int rc; 7540 7541 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7542 lba_count, lba); 7543 7544 bio->iovs = iov; 7545 bio->iovcnt = iovcnt; 7546 bio->iovpos = 0; 7547 bio->iov_offset = 0; 7548 7549 if (domain != NULL || seq != NULL) { 7550 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7551 bio->ext_opts.memory_domain = domain; 7552 bio->ext_opts.memory_domain_ctx = domain_ctx; 7553 bio->ext_opts.io_flags = flags; 7554 bio->ext_opts.metadata = md; 7555 bio->ext_opts.accel_sequence = seq; 7556 7557 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7558 bdev_nvme_writev_done, bio, 7559 bdev_nvme_queued_reset_sgl, 7560 bdev_nvme_queued_next_sge, 7561 &bio->ext_opts); 7562 } else if (iovcnt == 1) { 7563 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7564 md, lba, lba_count, bdev_nvme_writev_done, 7565 bio, flags, 0, 0); 7566 } else { 7567 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7568 bdev_nvme_writev_done, bio, flags, 7569 bdev_nvme_queued_reset_sgl, 7570 bdev_nvme_queued_next_sge, md, 0, 0); 7571 } 7572 7573 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7574 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7575 } 7576 return rc; 7577 } 7578 7579 static int 7580 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7581 void *md, uint64_t lba_count, uint64_t zslba, 7582 uint32_t flags) 7583 { 7584 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7585 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7586 int rc; 7587 7588 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7589 lba_count, zslba); 7590 7591 bio->iovs = iov; 7592 bio->iovcnt = iovcnt; 7593 bio->iovpos = 0; 7594 bio->iov_offset = 0; 7595 7596 if (iovcnt == 1) { 7597 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7598 lba_count, 7599 bdev_nvme_zone_appendv_done, bio, 7600 flags, 7601 0, 0); 7602 } else { 7603 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7604 bdev_nvme_zone_appendv_done, bio, flags, 7605 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7606 md, 0, 0); 7607 } 7608 7609 if (rc != 0 && rc != -ENOMEM) { 7610 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7611 } 7612 return rc; 7613 } 7614 7615 static int 7616 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7617 void *md, uint64_t lba_count, uint64_t lba, 7618 uint32_t flags) 7619 { 7620 int rc; 7621 7622 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7623 lba_count, lba); 7624 7625 bio->iovs = iov; 7626 bio->iovcnt = iovcnt; 7627 bio->iovpos = 0; 7628 bio->iov_offset = 0; 7629 7630 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7631 bio->io_path->qpair->qpair, 7632 lba, lba_count, 7633 bdev_nvme_comparev_done, bio, flags, 7634 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7635 md, 0, 0); 7636 7637 if (rc != 0 && rc != -ENOMEM) { 7638 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7639 } 7640 return rc; 7641 } 7642 7643 static int 7644 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7645 struct iovec *write_iov, int write_iovcnt, 7646 void *md, uint64_t 
lba_count, uint64_t lba, uint32_t flags) 7647 { 7648 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7649 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7650 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7651 int rc; 7652 7653 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7654 lba_count, lba); 7655 7656 bio->iovs = cmp_iov; 7657 bio->iovcnt = cmp_iovcnt; 7658 bio->iovpos = 0; 7659 bio->iov_offset = 0; 7660 bio->fused_iovs = write_iov; 7661 bio->fused_iovcnt = write_iovcnt; 7662 bio->fused_iovpos = 0; 7663 bio->fused_iov_offset = 0; 7664 7665 if (bdev_io->num_retries == 0) { 7666 bio->first_fused_submitted = false; 7667 bio->first_fused_completed = false; 7668 } 7669 7670 if (!bio->first_fused_submitted) { 7671 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7672 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7673 7674 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7675 bdev_nvme_comparev_and_writev_done, bio, flags, 7676 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7677 if (rc == 0) { 7678 bio->first_fused_submitted = true; 7679 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7680 } else { 7681 if (rc != -ENOMEM) { 7682 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7683 } 7684 return rc; 7685 } 7686 } 7687 7688 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7689 7690 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7691 bdev_nvme_comparev_and_writev_done, bio, flags, 7692 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7693 if (rc != 0 && rc != -ENOMEM) { 7694 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7695 rc = 0; 7696 } 7697 7698 return rc; 7699 } 7700 7701 static int 7702 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7703 { 7704 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7705 struct spdk_nvme_dsm_range *range; 7706 uint64_t offset, remaining; 7707 uint64_t num_ranges_u64; 7708 uint16_t num_ranges; 7709 int rc; 7710 7711 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7712 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7713 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7714 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7715 return -EINVAL; 7716 } 7717 num_ranges = (uint16_t)num_ranges_u64; 7718 7719 offset = offset_blocks; 7720 remaining = num_blocks; 7721 range = &dsm_ranges[0]; 7722 7723 /* Fill max-size ranges until the remaining blocks fit into one range */ 7724 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7725 range->attributes.raw = 0; 7726 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7727 range->starting_lba = offset; 7728 7729 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7730 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7731 range++; 7732 } 7733 7734 /* Final range describes the remaining blocks */ 7735 range->attributes.raw = 0; 7736 range->length = remaining; 7737 range->starting_lba = offset; 7738 7739 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7740 bio->io_path->qpair->qpair, 7741 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7742 dsm_ranges, num_ranges, 7743 bdev_nvme_queued_done, bio); 7744 7745 return rc; 7746 } 7747 7748 static int 7749 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7750 { 7751 if (num_blocks > UINT16_MAX + 1) { 7752 SPDK_ERRLOG("NVMe write zeroes is 
static int
bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64;
	uint16_t num_ranges;
	int rc;

	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;
	range = &dsm_ranges[0];

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
		range->attributes.raw = 0;
		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range->starting_lba = offset;

		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
						 bio->io_path->qpair->qpair,
						 SPDK_NVME_DSM_ATTR_DEALLOCATE,
						 dsm_ranges, num_ranges,
						 bdev_nvme_queued_done, bio);

	return rc;
}
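/* The Write Zeroes NLB field is a 0-based 16-bit value, so a single command can
 * cover at most UINT16_MAX + 1 blocks; larger requests are rejected with -EINVAL.
 */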
static int
bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	if (num_blocks > UINT16_MAX + 1) {
		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
					     bio->io_path->qpair->qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;

	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}
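/* Admin commands are not tied to a namespace, so walk the channel's I/O path list
 * and submit on the first controller that is currently available. If no controller
 * is available, or submission fails, the request is completed with the error.
 */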
static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first ctrlr which is not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
			  struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
			  size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
	 * require a nsid, so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
		       ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
		       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
}
static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}
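/* Simple Copy with a single source range. The nlb field of the source range
 * descriptor is 0-based, hence num_blocks - 1.
 */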
static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}
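/* Emit the current global options as a "bdev_nvme_set_options" RPC object so that a
 * saved configuration can restore them. Illustrative shape only; values depend on
 * the settings in effect and the parameter list is abbreviated:
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": { "action_on_timeout": "none", "timeout_us": 0, ... }
 *   }
 */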
static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat);
	spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
	spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size);
	spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

#ifdef SPDK_CONFIG_NVME_CUSE
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
					  cuse_name, &cuse_name_size) != 0) {
		return;
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
#endif

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);
	if (nvme_ctrlr->opts.psk_path[0] != '\0') {
		spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk_path);
	}

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);
	if (opts->src_addr[0] != '\0') {
		spdk_json_write_named_string(w, "hostaddr", opts->src_addr);
	}
	if (opts->src_svcid[0] != '\0') {
		spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);

#ifdef SPDK_CONFIG_NVME_CUSE
			nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
#endif
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump this last to give all NVMe bdevs a chance to be constructed
	 * before enabling the hotplug poller.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}
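/* Dump the state of one I/O path as a JSON object. Illustrative shape only; field
 * values depend on the path being dumped:
 *
 *   { "bdev_name": "...", "cntlid": 1, "current": true, "connected": true,
 *     "accessible": true, "transport": { "trtype": "...", "traddr": "...", ... } }
 */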
void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}