1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/likely.h" 18 #include "spdk/nvme.h" 19 #include "spdk/nvme_ocssd.h" 20 #include "spdk/nvme_zns.h" 21 #include "spdk/opal.h" 22 #include "spdk/thread.h" 23 #include "spdk/trace.h" 24 #include "spdk/string.h" 25 #include "spdk/util.h" 26 #include "spdk/uuid.h" 27 28 #include "spdk/bdev_module.h" 29 #include "spdk/log.h" 30 31 #include "spdk_internal/usdt.h" 32 #include "spdk_internal/trace_defs.h" 33 34 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 35 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 36 37 #define NSID_STR_LEN 10 38 39 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 40 41 struct nvme_bdev_io { 42 /** array of iovecs to transfer. */ 43 struct iovec *iovs; 44 45 /** Number of iovecs in iovs array. */ 46 int iovcnt; 47 48 /** Current iovec position. */ 49 int iovpos; 50 51 /** Offset in current iovec. */ 52 uint32_t iov_offset; 53 54 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 55 * being reset in a reset I/O. 56 */ 57 struct nvme_io_path *io_path; 58 59 /** array of iovecs to transfer. */ 60 struct iovec *fused_iovs; 61 62 /** Number of iovecs in iovs array. */ 63 int fused_iovcnt; 64 65 /** Current iovec position. */ 66 int fused_iovpos; 67 68 /** Offset in current iovec. */ 69 uint32_t fused_iov_offset; 70 71 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 72 struct spdk_nvme_cpl cpl; 73 74 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 75 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 76 77 /** Keeps track if first of fused commands was submitted */ 78 bool first_fused_submitted; 79 80 /** Keeps track if first of fused commands was completed */ 81 bool first_fused_completed; 82 83 /** Temporary pointer to zone report buffer */ 84 struct spdk_nvme_zns_zone_report *zone_report_buf; 85 86 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 87 uint64_t handled_zones; 88 89 /** Expiration value in ticks to retry the current I/O. */ 90 uint64_t retry_ticks; 91 92 /* How many times the current I/O was retried. */ 93 int32_t retry_count; 94 95 /* Current tsc at submit time. 
*/ 96 uint64_t submit_tsc; 97 }; 98 99 struct nvme_probe_skip_entry { 100 struct spdk_nvme_transport_id trid; 101 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 102 }; 103 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 104 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 105 g_skipped_nvme_ctrlrs); 106 107 static struct spdk_bdev_nvme_opts g_opts = { 108 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 109 .timeout_us = 0, 110 .timeout_admin_us = 0, 111 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 112 .transport_retry_count = 4, 113 .arbitration_burst = 0, 114 .low_priority_weight = 0, 115 .medium_priority_weight = 0, 116 .high_priority_weight = 0, 117 .nvme_adminq_poll_period_us = 10000ULL, 118 .nvme_ioq_poll_period_us = 0, 119 .io_queue_requests = 0, 120 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 121 .bdev_retry_count = 3, 122 .transport_ack_timeout = 0, 123 .ctrlr_loss_timeout_sec = 0, 124 .reconnect_delay_sec = 0, 125 .fast_io_fail_timeout_sec = 0, 126 .disable_auto_failback = false, 127 .generate_uuids = false, 128 .transport_tos = 0, 129 .nvme_error_stat = false, 130 .io_path_stat = false, 131 }; 132 133 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 134 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 135 136 static int g_hot_insert_nvme_controller_index = 0; 137 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 138 static bool g_nvme_hotplug_enabled = false; 139 struct spdk_thread *g_bdev_nvme_init_thread; 140 static struct spdk_poller *g_hotplug_poller; 141 static struct spdk_poller *g_hotplug_probe_poller; 142 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 143 144 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 145 struct nvme_async_probe_ctx *ctx); 146 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 147 struct nvme_async_probe_ctx *ctx); 148 static int bdev_nvme_library_init(void); 149 static void bdev_nvme_library_fini(void); 150 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 151 struct spdk_bdev_io *bdev_io); 152 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 153 struct spdk_bdev_io *bdev_io); 154 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 155 void *md, uint64_t lba_count, uint64_t lba, 156 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx); 157 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 158 void *md, uint64_t lba_count, uint64_t lba); 159 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 160 void *md, uint64_t lba_count, uint64_t lba, 161 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx); 162 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 163 void *md, uint64_t lba_count, 164 uint64_t zslba, uint32_t flags); 165 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 166 void *md, uint64_t lba_count, uint64_t lba, 167 uint32_t flags); 168 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 169 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 170 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 171 uint32_t flags); 172 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 173 uint32_t num_zones, struct 
spdk_bdev_zone_info *info); 174 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 175 enum spdk_bdev_zone_action action); 176 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 177 struct nvme_bdev_io *bio, 178 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 179 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 180 void *buf, size_t nbytes); 181 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 182 void *buf, size_t nbytes, void *md_buf, size_t md_len); 183 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 184 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 185 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 186 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 187 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove); 188 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 189 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 190 191 static struct nvme_ns *nvme_ns_alloc(void); 192 static void nvme_ns_free(struct nvme_ns *ns); 193 194 static int 195 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 196 { 197 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 198 } 199 200 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 201 202 struct spdk_nvme_qpair * 203 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 204 { 205 struct nvme_ctrlr_channel *ctrlr_ch; 206 207 assert(ctrlr_io_ch != NULL); 208 209 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 210 211 return ctrlr_ch->qpair->qpair; 212 } 213 214 static int 215 bdev_nvme_get_ctx_size(void) 216 { 217 return sizeof(struct nvme_bdev_io); 218 } 219 220 static struct spdk_bdev_module nvme_if = { 221 .name = "nvme", 222 .async_fini = true, 223 .module_init = bdev_nvme_library_init, 224 .module_fini = bdev_nvme_library_fini, 225 .config_json = bdev_nvme_config_json, 226 .get_ctx_size = bdev_nvme_get_ctx_size, 227 228 }; 229 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 230 231 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 232 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 233 bool g_bdev_nvme_module_finish; 234 235 struct nvme_bdev_ctrlr * 236 nvme_bdev_ctrlr_get_by_name(const char *name) 237 { 238 struct nvme_bdev_ctrlr *nbdev_ctrlr; 239 240 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 241 if (strcmp(name, nbdev_ctrlr->name) == 0) { 242 break; 243 } 244 } 245 246 return nbdev_ctrlr; 247 } 248 249 static struct nvme_ctrlr * 250 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 251 const struct spdk_nvme_transport_id *trid) 252 { 253 struct nvme_ctrlr *nvme_ctrlr; 254 255 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 256 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 257 break; 258 } 259 } 260 261 return nvme_ctrlr; 262 } 263 264 struct nvme_ctrlr * 265 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 266 uint16_t cntlid) 267 { 268 struct nvme_ctrlr *nvme_ctrlr; 269 const struct spdk_nvme_ctrlr_data *cdata; 270 271 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 272 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 273 if (cdata->cntlid == cntlid) { 274 break; 275 } 276 } 277 278 return nvme_ctrlr; 279 } 280 281 static struct nvme_bdev * 282 nvme_bdev_ctrlr_get_bdev(struct 
nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 283 { 284 struct nvme_bdev *bdev; 285 286 pthread_mutex_lock(&g_bdev_nvme_mutex); 287 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 288 if (bdev->nsid == nsid) { 289 break; 290 } 291 } 292 pthread_mutex_unlock(&g_bdev_nvme_mutex); 293 294 return bdev; 295 } 296 297 struct nvme_ns * 298 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 299 { 300 struct nvme_ns ns; 301 302 assert(nsid > 0); 303 304 ns.id = nsid; 305 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 306 } 307 308 struct nvme_ns * 309 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 310 { 311 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 312 } 313 314 struct nvme_ns * 315 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 316 { 317 if (ns == NULL) { 318 return NULL; 319 } 320 321 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 322 } 323 324 static struct nvme_ctrlr * 325 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 326 { 327 struct nvme_bdev_ctrlr *nbdev_ctrlr; 328 struct nvme_ctrlr *nvme_ctrlr = NULL; 329 330 pthread_mutex_lock(&g_bdev_nvme_mutex); 331 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 332 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 333 if (nvme_ctrlr != NULL) { 334 break; 335 } 336 } 337 pthread_mutex_unlock(&g_bdev_nvme_mutex); 338 339 return nvme_ctrlr; 340 } 341 342 struct nvme_ctrlr * 343 nvme_ctrlr_get_by_name(const char *name) 344 { 345 struct nvme_bdev_ctrlr *nbdev_ctrlr; 346 struct nvme_ctrlr *nvme_ctrlr = NULL; 347 348 if (name == NULL) { 349 return NULL; 350 } 351 352 pthread_mutex_lock(&g_bdev_nvme_mutex); 353 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 354 if (nbdev_ctrlr != NULL) { 355 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 356 } 357 pthread_mutex_unlock(&g_bdev_nvme_mutex); 358 359 return nvme_ctrlr; 360 } 361 362 void 363 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 364 { 365 struct nvme_bdev_ctrlr *nbdev_ctrlr; 366 367 pthread_mutex_lock(&g_bdev_nvme_mutex); 368 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 369 fn(nbdev_ctrlr, ctx); 370 } 371 pthread_mutex_unlock(&g_bdev_nvme_mutex); 372 } 373 374 void 375 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 376 { 377 const char *trtype_str; 378 const char *adrfam_str; 379 380 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 381 if (trtype_str) { 382 spdk_json_write_named_string(w, "trtype", trtype_str); 383 } 384 385 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 386 if (adrfam_str) { 387 spdk_json_write_named_string(w, "adrfam", adrfam_str); 388 } 389 390 if (trid->traddr[0] != '\0') { 391 spdk_json_write_named_string(w, "traddr", trid->traddr); 392 } 393 394 if (trid->trsvcid[0] != '\0') { 395 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 396 } 397 398 if (trid->subnqn[0] != '\0') { 399 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 400 } 401 } 402 403 static void 404 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 405 struct nvme_ctrlr *nvme_ctrlr) 406 { 407 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 408 pthread_mutex_lock(&g_bdev_nvme_mutex); 409 410 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 411 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 412 pthread_mutex_unlock(&g_bdev_nvme_mutex); 413 414 return; 415 } 416 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 417 418 
pthread_mutex_unlock(&g_bdev_nvme_mutex); 419 420 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 421 422 free(nbdev_ctrlr->name); 423 free(nbdev_ctrlr); 424 } 425 426 static void 427 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 428 { 429 struct nvme_path_id *path_id, *tmp_path; 430 struct nvme_ns *ns, *tmp_ns; 431 432 free(nvme_ctrlr->copied_ana_desc); 433 spdk_free(nvme_ctrlr->ana_log_page); 434 435 if (nvme_ctrlr->opal_dev) { 436 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 437 nvme_ctrlr->opal_dev = NULL; 438 } 439 440 if (nvme_ctrlr->nbdev_ctrlr) { 441 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 442 } 443 444 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 445 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 446 nvme_ns_free(ns); 447 } 448 449 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 450 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 451 free(path_id); 452 } 453 454 pthread_mutex_destroy(&nvme_ctrlr->mutex); 455 456 free(nvme_ctrlr); 457 458 pthread_mutex_lock(&g_bdev_nvme_mutex); 459 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 460 pthread_mutex_unlock(&g_bdev_nvme_mutex); 461 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 462 spdk_bdev_module_fini_done(); 463 return; 464 } 465 pthread_mutex_unlock(&g_bdev_nvme_mutex); 466 } 467 468 static int 469 nvme_detach_poller(void *arg) 470 { 471 struct nvme_ctrlr *nvme_ctrlr = arg; 472 int rc; 473 474 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 475 if (rc != -EAGAIN) { 476 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 477 _nvme_ctrlr_delete(nvme_ctrlr); 478 } 479 480 return SPDK_POLLER_BUSY; 481 } 482 483 static void 484 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 485 { 486 int rc; 487 488 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 489 490 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 491 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 492 493 /* If we got here, the reset/detach poller cannot be active */ 494 assert(nvme_ctrlr->reset_detach_poller == NULL); 495 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 496 nvme_ctrlr, 1000); 497 if (nvme_ctrlr->reset_detach_poller == NULL) { 498 SPDK_ERRLOG("Failed to register detach poller\n"); 499 goto error; 500 } 501 502 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 503 if (rc != 0) { 504 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 505 goto error; 506 } 507 508 return; 509 error: 510 /* We don't have a good way to handle errors here, so just do what we can and delete the 511 * controller without detaching the underlying NVMe device. 
512 */ 513 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 514 _nvme_ctrlr_delete(nvme_ctrlr); 515 } 516 517 static void 518 nvme_ctrlr_unregister_cb(void *io_device) 519 { 520 struct nvme_ctrlr *nvme_ctrlr = io_device; 521 522 nvme_ctrlr_delete(nvme_ctrlr); 523 } 524 525 static void 526 nvme_ctrlr_unregister(void *ctx) 527 { 528 struct nvme_ctrlr *nvme_ctrlr = ctx; 529 530 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 531 } 532 533 static bool 534 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 535 { 536 if (!nvme_ctrlr->destruct) { 537 return false; 538 } 539 540 if (nvme_ctrlr->ref > 0) { 541 return false; 542 } 543 544 if (nvme_ctrlr->resetting) { 545 return false; 546 } 547 548 if (nvme_ctrlr->ana_log_page_updating) { 549 return false; 550 } 551 552 if (nvme_ctrlr->io_path_cache_clearing) { 553 return false; 554 } 555 556 return true; 557 } 558 559 static void 560 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 561 { 562 pthread_mutex_lock(&nvme_ctrlr->mutex); 563 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 564 565 assert(nvme_ctrlr->ref > 0); 566 nvme_ctrlr->ref--; 567 568 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 569 pthread_mutex_unlock(&nvme_ctrlr->mutex); 570 return; 571 } 572 573 pthread_mutex_unlock(&nvme_ctrlr->mutex); 574 575 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 576 } 577 578 static void 579 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 580 { 581 nbdev_ch->current_io_path = NULL; 582 nbdev_ch->rr_counter = 0; 583 } 584 585 static struct nvme_io_path * 586 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 587 { 588 struct nvme_io_path *io_path; 589 590 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 591 if (io_path->nvme_ns == nvme_ns) { 592 break; 593 } 594 } 595 596 return io_path; 597 } 598 599 static struct nvme_io_path * 600 nvme_io_path_alloc(void) 601 { 602 struct nvme_io_path *io_path; 603 604 io_path = calloc(1, sizeof(*io_path)); 605 if (io_path == NULL) { 606 SPDK_ERRLOG("Failed to alloc io_path.\n"); 607 return NULL; 608 } 609 610 if (g_opts.io_path_stat) { 611 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 612 if (io_path->stat == NULL) { 613 free(io_path); 614 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 615 return NULL; 616 } 617 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 618 } 619 620 return io_path; 621 } 622 623 static void 624 nvme_io_path_free(struct nvme_io_path *io_path) 625 { 626 free(io_path->stat); 627 free(io_path); 628 } 629 630 static int 631 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 632 { 633 struct nvme_io_path *io_path; 634 struct spdk_io_channel *ch; 635 struct nvme_ctrlr_channel *ctrlr_ch; 636 struct nvme_qpair *nvme_qpair; 637 638 io_path = nvme_io_path_alloc(); 639 if (io_path == NULL) { 640 return -ENOMEM; 641 } 642 643 io_path->nvme_ns = nvme_ns; 644 645 ch = spdk_get_io_channel(nvme_ns->ctrlr); 646 if (ch == NULL) { 647 nvme_io_path_free(io_path); 648 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 649 return -ENOMEM; 650 } 651 652 ctrlr_ch = spdk_io_channel_get_ctx(ch); 653 654 nvme_qpair = ctrlr_ch->qpair; 655 assert(nvme_qpair != NULL); 656 657 io_path->qpair = nvme_qpair; 658 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 659 660 io_path->nbdev_ch = nbdev_ch; 661 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 662 663 
bdev_nvme_clear_current_io_path(nbdev_ch); 664 665 return 0; 666 } 667 668 static void 669 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 670 struct nvme_io_path *io_path) 671 { 672 struct spdk_bdev_io *bdev_io; 673 struct nvme_bdev_io *bio; 674 675 TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) { 676 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 677 if (bio->io_path == io_path) { 678 bio->io_path = NULL; 679 } 680 } 681 } 682 683 static void 684 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 685 { 686 struct spdk_io_channel *ch; 687 struct nvme_qpair *nvme_qpair; 688 struct nvme_ctrlr_channel *ctrlr_ch; 689 struct nvme_bdev *nbdev; 690 691 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 692 693 /* Add the statistics to nvme_ns before this path is destroyed. */ 694 pthread_mutex_lock(&nbdev->mutex); 695 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 696 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 697 } 698 pthread_mutex_unlock(&nbdev->mutex); 699 700 bdev_nvme_clear_current_io_path(nbdev_ch); 701 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 702 703 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 704 io_path->nbdev_ch = NULL; 705 706 nvme_qpair = io_path->qpair; 707 assert(nvme_qpair != NULL); 708 709 ctrlr_ch = nvme_qpair->ctrlr_ch; 710 assert(ctrlr_ch != NULL); 711 712 ch = spdk_io_channel_from_ctx(ctrlr_ch); 713 spdk_put_io_channel(ch); 714 715 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 716 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 717 * io_path here but free the io_path when the associated qpair is freed. It is ensured 718 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 719 */ 720 } 721 722 static void 723 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 724 { 725 struct nvme_io_path *io_path, *tmp_io_path; 726 727 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 728 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 729 } 730 } 731 732 static int 733 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 734 { 735 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 736 struct nvme_bdev *nbdev = io_device; 737 struct nvme_ns *nvme_ns; 738 int rc; 739 740 STAILQ_INIT(&nbdev_ch->io_path_list); 741 TAILQ_INIT(&nbdev_ch->retry_io_list); 742 743 pthread_mutex_lock(&nbdev->mutex); 744 745 nbdev_ch->mp_policy = nbdev->mp_policy; 746 nbdev_ch->mp_selector = nbdev->mp_selector; 747 nbdev_ch->rr_min_io = nbdev->rr_min_io; 748 749 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 750 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 751 if (rc != 0) { 752 pthread_mutex_unlock(&nbdev->mutex); 753 754 _bdev_nvme_delete_io_paths(nbdev_ch); 755 return rc; 756 } 757 } 758 pthread_mutex_unlock(&nbdev->mutex); 759 760 return 0; 761 } 762 763 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 764 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
765 */ 766 static inline void 767 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 768 const struct spdk_nvme_cpl *cpl) 769 { 770 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 771 (uintptr_t)bdev_io); 772 if (cpl) { 773 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 774 } else { 775 spdk_bdev_io_complete(bdev_io, status); 776 } 777 } 778 779 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 780 781 static void 782 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 783 { 784 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 785 786 bdev_nvme_abort_retry_ios(nbdev_ch); 787 _bdev_nvme_delete_io_paths(nbdev_ch); 788 } 789 790 static inline bool 791 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 792 { 793 switch (io_type) { 794 case SPDK_BDEV_IO_TYPE_RESET: 795 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 796 case SPDK_BDEV_IO_TYPE_ABORT: 797 return true; 798 default: 799 break; 800 } 801 802 return false; 803 } 804 805 static inline bool 806 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 807 { 808 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 809 return false; 810 } 811 812 switch (nvme_ns->ana_state) { 813 case SPDK_NVME_ANA_OPTIMIZED_STATE: 814 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 815 return true; 816 default: 817 break; 818 } 819 820 return false; 821 } 822 823 static inline bool 824 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 825 { 826 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 827 return false; 828 } 829 830 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 831 SPDK_NVME_QPAIR_FAILURE_NONE)) { 832 return false; 833 } 834 835 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 836 return false; 837 } 838 839 if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_qpair->ctrlr->ctrlr) != 840 SPDK_NVME_QPAIR_FAILURE_NONE) { 841 return false; 842 } 843 844 return true; 845 } 846 847 static inline bool 848 nvme_io_path_is_available(struct nvme_io_path *io_path) 849 { 850 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 851 return false; 852 } 853 854 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 855 return false; 856 } 857 858 return true; 859 } 860 861 static inline bool 862 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 863 { 864 if (nvme_ctrlr->destruct) { 865 return true; 866 } 867 868 if (nvme_ctrlr->fast_io_fail_timedout) { 869 return true; 870 } 871 872 if (nvme_ctrlr->resetting) { 873 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 874 return false; 875 } else { 876 return true; 877 } 878 } 879 880 if (nvme_ctrlr->reconnect_is_delayed) { 881 return false; 882 } 883 884 if (nvme_ctrlr->disabled) { 885 return true; 886 } 887 888 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 889 return true; 890 } else { 891 return false; 892 } 893 } 894 895 static bool 896 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 897 { 898 if (nvme_ctrlr->destruct) { 899 return false; 900 } 901 902 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 903 return false; 904 } 905 906 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 907 return false; 908 } 909 910 if (nvme_ctrlr->disabled) { 911 return false; 912 } 913 914 return true; 915 } 916 917 /* Simulate circular linked list. 
*/ 918 static inline struct nvme_io_path * 919 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 920 { 921 struct nvme_io_path *next_path; 922 923 if (prev_path != NULL) { 924 next_path = STAILQ_NEXT(prev_path, stailq); 925 if (next_path != NULL) { 926 return next_path; 927 } 928 } 929 930 return STAILQ_FIRST(&nbdev_ch->io_path_list); 931 } 932 933 static struct nvme_io_path * 934 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 935 { 936 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 937 938 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 939 940 io_path = start; 941 do { 942 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 943 !io_path->nvme_ns->ana_state_updating)) { 944 switch (io_path->nvme_ns->ana_state) { 945 case SPDK_NVME_ANA_OPTIMIZED_STATE: 946 nbdev_ch->current_io_path = io_path; 947 return io_path; 948 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 949 if (non_optimized == NULL) { 950 non_optimized = io_path; 951 } 952 break; 953 default: 954 break; 955 } 956 } 957 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 958 } while (io_path != start); 959 960 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 961 /* We come here only if there is no optimized path. Cache even non_optimized 962 * path for load balance across multiple non_optimized paths. 963 */ 964 nbdev_ch->current_io_path = non_optimized; 965 } 966 967 return non_optimized; 968 } 969 970 static struct nvme_io_path * 971 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 972 { 973 struct nvme_io_path *io_path; 974 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 975 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 976 uint32_t num_outstanding_reqs; 977 978 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 979 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 980 /* The device is currently resetting. 
*/ 981 continue; 982 } 983 984 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 985 continue; 986 } 987 988 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 989 switch (io_path->nvme_ns->ana_state) { 990 case SPDK_NVME_ANA_OPTIMIZED_STATE: 991 if (num_outstanding_reqs < opt_min_qd) { 992 opt_min_qd = num_outstanding_reqs; 993 optimized = io_path; 994 } 995 break; 996 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 997 if (num_outstanding_reqs < non_opt_min_qd) { 998 non_opt_min_qd = num_outstanding_reqs; 999 non_optimized = io_path; 1000 } 1001 break; 1002 default: 1003 break; 1004 } 1005 } 1006 1007 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1008 if (optimized != NULL) { 1009 return optimized; 1010 } 1011 1012 return non_optimized; 1013 } 1014 1015 static inline struct nvme_io_path * 1016 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1017 { 1018 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1019 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1020 return nbdev_ch->current_io_path; 1021 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1022 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1023 return nbdev_ch->current_io_path; 1024 } 1025 nbdev_ch->rr_counter = 0; 1026 } 1027 } 1028 1029 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1030 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1031 return _bdev_nvme_find_io_path(nbdev_ch); 1032 } else { 1033 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1034 } 1035 } 1036 1037 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1038 * or false otherwise. 1039 * 1040 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1041 * is likely to be non-accessible now but may become accessible. 1042 * 1043 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1044 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1045 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1046 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1047 */ 1048 static bool 1049 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1050 { 1051 struct nvme_io_path *io_path; 1052 1053 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1054 if (io_path->nvme_ns->ana_transition_timedout) { 1055 continue; 1056 } 1057 1058 if (nvme_qpair_is_connected(io_path->qpair) || 1059 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1060 return true; 1061 } 1062 } 1063 1064 return false; 1065 } 1066 1067 static void 1068 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1069 { 1070 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1071 struct spdk_io_channel *ch; 1072 1073 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1074 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1075 } else { 1076 ch = spdk_io_channel_from_ctx(nbdev_ch); 1077 bdev_nvme_submit_request(ch, bdev_io); 1078 } 1079 } 1080 1081 static int 1082 bdev_nvme_retry_ios(void *arg) 1083 { 1084 struct nvme_bdev_channel *nbdev_ch = arg; 1085 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1086 struct nvme_bdev_io *bio; 1087 uint64_t now, delay_us; 1088 1089 now = spdk_get_ticks(); 1090 1091 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1092 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1093 if (bio->retry_ticks > now) { 1094 break; 1095 } 1096 1097 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1098 1099 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1100 } 1101 1102 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1103 1104 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1105 if (bdev_io != NULL) { 1106 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1107 1108 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1109 1110 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1111 delay_us); 1112 } 1113 1114 return SPDK_POLLER_BUSY; 1115 } 1116 1117 static void 1118 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1119 struct nvme_bdev_io *bio, uint64_t delay_ms) 1120 { 1121 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1122 struct spdk_bdev_io *tmp_bdev_io; 1123 struct nvme_bdev_io *tmp_bio; 1124 1125 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1126 1127 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1128 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1129 1130 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1131 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1132 module_link); 1133 return; 1134 } 1135 } 1136 1137 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1138 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1139 1140 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1141 1142 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1143 delay_ms * 1000ULL); 1144 } 1145 1146 static void 1147 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1148 { 1149 struct spdk_bdev_io *bdev_io, *tmp_io; 1150 1151 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1152 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1153 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1154 } 1155 1156 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1157 } 1158 1159 static int 1160 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1161 struct nvme_bdev_io *bio_to_abort) 1162 { 1163 struct spdk_bdev_io *bdev_io_to_abort; 1164 1165 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1166 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1167 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1168 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1169 return 0; 1170 } 1171 } 1172 1173 return -ENOENT; 1174 } 1175 1176 static void 1177 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1178 { 1179 struct nvme_bdev *nbdev; 1180 uint16_t sct, sc; 1181 1182 assert(spdk_nvme_cpl_is_error(cpl)); 1183 1184 nbdev = bdev_io->bdev->ctxt; 1185 1186 if (nbdev->err_stat == NULL) { 1187 return; 1188 } 1189 1190 sct = cpl->status.sct; 1191 sc = cpl->status.sc; 1192 1193 pthread_mutex_lock(&nbdev->mutex); 1194 1195 nbdev->err_stat->status_type[sct]++; 1196 switch (sct) { 1197 case SPDK_NVME_SCT_GENERIC: 1198 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1199 case SPDK_NVME_SCT_MEDIA_ERROR: 1200 case SPDK_NVME_SCT_PATH: 1201 nbdev->err_stat->status[sct][sc]++; 1202 break; 1203 default: 1204 break; 1205 } 1206 1207 pthread_mutex_unlock(&nbdev->mutex); 1208 } 1209 1210 static inline void 1211 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1212 { 1213 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1214 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1215 uint32_t blocklen = bdev_io->bdev->blocklen; 1216 struct spdk_bdev_io_stat *stat; 1217 uint64_t tsc_diff; 1218 1219 if (bio->io_path->stat == NULL) { 1220 return; 1221 } 1222 1223 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1224 stat = bio->io_path->stat; 1225 1226 switch (bdev_io->type) { 1227 case SPDK_BDEV_IO_TYPE_READ: 1228 stat->bytes_read += num_blocks * blocklen; 1229 stat->num_read_ops++; 1230 stat->read_latency_ticks += tsc_diff; 1231 if (stat->max_read_latency_ticks < tsc_diff) { 1232 stat->max_read_latency_ticks = tsc_diff; 1233 } 1234 if (stat->min_read_latency_ticks > tsc_diff) { 1235 stat->min_read_latency_ticks = tsc_diff; 1236 } 1237 break; 1238 case SPDK_BDEV_IO_TYPE_WRITE: 1239 stat->bytes_written += num_blocks * blocklen; 1240 stat->num_write_ops++; 1241 stat->write_latency_ticks += tsc_diff; 1242 if (stat->max_write_latency_ticks < tsc_diff) { 1243 stat->max_write_latency_ticks = tsc_diff; 1244 } 1245 if (stat->min_write_latency_ticks > tsc_diff) { 1246 stat->min_write_latency_ticks = tsc_diff; 1247 } 1248 break; 1249 case SPDK_BDEV_IO_TYPE_UNMAP: 1250 stat->bytes_unmapped += num_blocks * blocklen; 1251 stat->num_unmap_ops++; 1252 stat->unmap_latency_ticks += tsc_diff; 1253 if (stat->max_unmap_latency_ticks < tsc_diff) { 1254 
stat->max_unmap_latency_ticks = tsc_diff; 1255 } 1256 if (stat->min_unmap_latency_ticks > tsc_diff) { 1257 stat->min_unmap_latency_ticks = tsc_diff; 1258 } 1259 break; 1260 case SPDK_BDEV_IO_TYPE_ZCOPY: 1261 /* Track the data in the start phase only */ 1262 if (!bdev_io->u.bdev.zcopy.start) { 1263 break; 1264 } 1265 if (bdev_io->u.bdev.zcopy.populate) { 1266 stat->bytes_read += num_blocks * blocklen; 1267 stat->num_read_ops++; 1268 stat->read_latency_ticks += tsc_diff; 1269 if (stat->max_read_latency_ticks < tsc_diff) { 1270 stat->max_read_latency_ticks = tsc_diff; 1271 } 1272 if (stat->min_read_latency_ticks > tsc_diff) { 1273 stat->min_read_latency_ticks = tsc_diff; 1274 } 1275 } else { 1276 stat->bytes_written += num_blocks * blocklen; 1277 stat->num_write_ops++; 1278 stat->write_latency_ticks += tsc_diff; 1279 if (stat->max_write_latency_ticks < tsc_diff) { 1280 stat->max_write_latency_ticks = tsc_diff; 1281 } 1282 if (stat->min_write_latency_ticks > tsc_diff) { 1283 stat->min_write_latency_ticks = tsc_diff; 1284 } 1285 } 1286 break; 1287 case SPDK_BDEV_IO_TYPE_COPY: 1288 stat->bytes_copied += num_blocks * blocklen; 1289 stat->num_copy_ops++; 1290 stat->copy_latency_ticks += tsc_diff; 1291 if (stat->max_copy_latency_ticks < tsc_diff) { 1292 stat->max_copy_latency_ticks = tsc_diff; 1293 } 1294 if (stat->min_copy_latency_ticks > tsc_diff) { 1295 stat->min_copy_latency_ticks = tsc_diff; 1296 } 1297 break; 1298 default: 1299 break; 1300 } 1301 } 1302 1303 static bool 1304 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1305 const struct spdk_nvme_cpl *cpl, 1306 struct nvme_bdev_channel *nbdev_ch, 1307 uint64_t *_delay_ms) 1308 { 1309 struct nvme_io_path *io_path = bio->io_path; 1310 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1311 const struct spdk_nvme_ctrlr_data *cdata; 1312 1313 if (spdk_nvme_cpl_is_path_error(cpl) || 1314 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1315 !nvme_io_path_is_available(io_path) || 1316 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1317 bdev_nvme_clear_current_io_path(nbdev_ch); 1318 bio->io_path = NULL; 1319 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1320 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1321 io_path->nvme_ns->ana_state_updating = true; 1322 } 1323 } 1324 if (!any_io_path_may_become_available(nbdev_ch)) { 1325 return false; 1326 } 1327 *_delay_ms = 0; 1328 } else { 1329 bio->retry_count++; 1330 1331 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1332 1333 if (cpl->status.crd != 0) { 1334 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1335 } else { 1336 *_delay_ms = 0; 1337 } 1338 } 1339 1340 return true; 1341 } 1342 1343 static inline void 1344 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1345 const struct spdk_nvme_cpl *cpl) 1346 { 1347 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1348 struct nvme_bdev_channel *nbdev_ch; 1349 uint64_t delay_ms; 1350 1351 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1352 1353 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1354 bdev_nvme_update_io_path_stat(bio); 1355 goto complete; 1356 } 1357 1358 /* Update error counts before deciding if retry is needed. 1359 * Hence, error counts may be more than the number of I/O errors. 
1360 */ 1361 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1362 1363 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1364 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1365 goto complete; 1366 } 1367 1368 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1369 1370 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1371 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1372 return; 1373 } 1374 1375 complete: 1376 bio->retry_count = 0; 1377 bio->submit_tsc = 0; 1378 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1379 } 1380 1381 static inline void 1382 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1383 { 1384 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1385 struct nvme_bdev_channel *nbdev_ch; 1386 enum spdk_bdev_io_status io_status; 1387 1388 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1389 1390 switch (rc) { 1391 case 0: 1392 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1393 break; 1394 case -ENOMEM: 1395 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1396 break; 1397 case -ENXIO: 1398 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1399 1400 bdev_nvme_clear_current_io_path(nbdev_ch); 1401 bio->io_path = NULL; 1402 1403 if (any_io_path_may_become_available(nbdev_ch)) { 1404 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1405 return; 1406 } 1407 1408 /* fallthrough */ 1409 default: 1410 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1411 break; 1412 } 1413 1414 bio->retry_count = 0; 1415 bio->submit_tsc = 0; 1416 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1417 } 1418 1419 static inline void 1420 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1421 { 1422 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1423 enum spdk_bdev_io_status io_status; 1424 1425 switch (rc) { 1426 case 0: 1427 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1428 break; 1429 case -ENOMEM: 1430 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1431 break; 1432 case -ENXIO: 1433 /* fallthrough */ 1434 default: 1435 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1436 break; 1437 } 1438 1439 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1440 } 1441 1442 static void 1443 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1444 { 1445 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1446 1447 pthread_mutex_lock(&nvme_ctrlr->mutex); 1448 1449 assert(nvme_ctrlr->io_path_cache_clearing == true); 1450 nvme_ctrlr->io_path_cache_clearing = false; 1451 1452 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1453 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1454 return; 1455 } 1456 1457 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1458 1459 nvme_ctrlr_unregister(nvme_ctrlr); 1460 } 1461 1462 static void 1463 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1464 { 1465 struct nvme_io_path *io_path; 1466 1467 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1468 if (io_path->nbdev_ch == NULL) { 1469 continue; 1470 } 1471 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1472 } 1473 } 1474 1475 static void 1476 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1477 { 1478 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1479 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1480 1481 assert(ctrlr_ch->qpair != NULL); 1482 1483 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1484 1485 spdk_for_each_channel_continue(i, 0); 1486 } 1487 1488 static void 1489 
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1490 { 1491 pthread_mutex_lock(&nvme_ctrlr->mutex); 1492 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1493 nvme_ctrlr->io_path_cache_clearing) { 1494 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1495 return; 1496 } 1497 1498 nvme_ctrlr->io_path_cache_clearing = true; 1499 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1500 1501 spdk_for_each_channel(nvme_ctrlr, 1502 bdev_nvme_clear_io_path_cache, 1503 NULL, 1504 bdev_nvme_clear_io_path_caches_done); 1505 } 1506 1507 static struct nvme_qpair * 1508 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1509 { 1510 struct nvme_qpair *nvme_qpair; 1511 1512 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1513 if (nvme_qpair->qpair == qpair) { 1514 break; 1515 } 1516 } 1517 1518 return nvme_qpair; 1519 } 1520 1521 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1522 1523 static void 1524 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1525 { 1526 struct nvme_poll_group *group = poll_group_ctx; 1527 struct nvme_qpair *nvme_qpair; 1528 struct nvme_ctrlr_channel *ctrlr_ch; 1529 int status; 1530 1531 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1532 if (nvme_qpair == NULL) { 1533 return; 1534 } 1535 1536 if (nvme_qpair->qpair != NULL) { 1537 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1538 nvme_qpair->qpair = NULL; 1539 } 1540 1541 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1542 1543 ctrlr_ch = nvme_qpair->ctrlr_ch; 1544 1545 if (ctrlr_ch != NULL) { 1546 if (ctrlr_ch->reset_iter != NULL) { 1547 /* We are in a full reset sequence. */ 1548 if (ctrlr_ch->connect_poller != NULL) { 1549 /* qpair was failed to connect. Abort the reset sequence. */ 1550 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1551 qpair); 1552 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1553 status = -1; 1554 } else { 1555 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1556 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1557 qpair); 1558 status = 0; 1559 } 1560 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1561 ctrlr_ch->reset_iter = NULL; 1562 } else { 1563 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1564 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1565 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr, false); 1566 } 1567 } else { 1568 /* In this case, ctrlr_channel is already deleted. */ 1569 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1570 nvme_qpair_delete(nvme_qpair); 1571 } 1572 } 1573 1574 static void 1575 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1576 { 1577 struct nvme_qpair *nvme_qpair; 1578 1579 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1580 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1581 continue; 1582 } 1583 1584 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1585 SPDK_NVME_QPAIR_FAILURE_NONE) { 1586 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1587 } 1588 } 1589 } 1590 1591 static int 1592 bdev_nvme_poll(void *arg) 1593 { 1594 struct nvme_poll_group *group = arg; 1595 int64_t num_completions; 1596 1597 if (group->collect_spin_stat && group->start_ticks == 0) { 1598 group->start_ticks = spdk_get_ticks(); 1599 } 1600 1601 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1602 bdev_nvme_disconnected_qpair_cb); 1603 if (group->collect_spin_stat) { 1604 if (num_completions > 0) { 1605 if (group->end_ticks != 0) { 1606 group->spin_ticks += (group->end_ticks - group->start_ticks); 1607 group->end_ticks = 0; 1608 } 1609 group->start_ticks = 0; 1610 } else { 1611 group->end_ticks = spdk_get_ticks(); 1612 } 1613 } 1614 1615 if (spdk_unlikely(num_completions < 0)) { 1616 bdev_nvme_check_io_qpairs(group); 1617 } 1618 1619 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1620 } 1621 1622 static int bdev_nvme_poll_adminq(void *arg); 1623 1624 static void 1625 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1626 { 1627 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1628 1629 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1630 nvme_ctrlr, new_period_us); 1631 } 1632 1633 static int 1634 bdev_nvme_poll_adminq(void *arg) 1635 { 1636 int32_t rc; 1637 struct nvme_ctrlr *nvme_ctrlr = arg; 1638 nvme_ctrlr_disconnected_cb disconnected_cb; 1639 1640 assert(nvme_ctrlr != NULL); 1641 1642 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1643 if (rc < 0) { 1644 disconnected_cb = nvme_ctrlr->disconnected_cb; 1645 nvme_ctrlr->disconnected_cb = NULL; 1646 1647 if (disconnected_cb != NULL) { 1648 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1649 g_opts.nvme_adminq_poll_period_us); 1650 disconnected_cb(nvme_ctrlr); 1651 } else { 1652 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 1653 } 1654 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1655 SPDK_NVME_QPAIR_FAILURE_NONE) { 1656 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1657 } 1658 1659 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1660 } 1661 1662 static void 1663 nvme_bdev_free(void *io_device) 1664 { 1665 struct nvme_bdev *nvme_disk = io_device; 1666 1667 pthread_mutex_destroy(&nvme_disk->mutex); 1668 free(nvme_disk->disk.name); 1669 free(nvme_disk->err_stat); 1670 free(nvme_disk); 1671 } 1672 1673 static int 1674 bdev_nvme_destruct(void *ctx) 1675 { 1676 struct nvme_bdev *nvme_disk = ctx; 1677 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1678 1679 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1680 1681 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1682 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1683 1684 nvme_ns->bdev = NULL; 1685 1686 assert(nvme_ns->id > 0); 1687 1688 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1689 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1690 1691 nvme_ctrlr_release(nvme_ns->ctrlr); 1692 nvme_ns_free(nvme_ns); 1693 } else { 1694 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1695 } 1696 } 1697 1698 pthread_mutex_lock(&g_bdev_nvme_mutex); 1699 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1700 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1701 1702 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1703 1704 return 0; 1705 } 1706 1707 static int 1708 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1709 { 1710 struct nvme_ctrlr *nvme_ctrlr; 1711 struct spdk_nvme_io_qpair_opts opts; 1712 struct spdk_nvme_qpair *qpair; 1713 int rc; 1714 1715 nvme_ctrlr = nvme_qpair->ctrlr; 1716 1717 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1718 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1719 opts.create_only = true; 1720 opts.async_mode = true; 1721 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1722 g_opts.io_queue_requests = opts.io_queue_requests; 1723 1724 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1725 if (qpair == NULL) { 1726 return -1; 1727 } 1728 1729 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1730 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1731 1732 assert(nvme_qpair->group != NULL); 1733 1734 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1735 if (rc != 0) { 1736 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1737 goto err; 1738 } 1739 1740 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1741 if (rc != 0) { 1742 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1743 goto err; 1744 } 1745 1746 nvme_qpair->qpair = qpair; 1747 1748 if (!g_opts.disable_auto_failback) { 1749 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1750 } 1751 1752 return 0; 1753 1754 err: 1755 spdk_nvme_ctrlr_free_io_qpair(qpair); 1756 1757 return rc; 1758 } 1759 1760 static void 1761 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1762 { 1763 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1764 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1765 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1766 struct spdk_bdev_io *bdev_io; 1767 1768 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1769 status = SPDK_BDEV_IO_STATUS_FAILED; 1770 } 1771 1772 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1773 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1774 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1775 __bdev_nvme_io_complete(bdev_io, status, NULL); 1776 } 1777 1778 
spdk_for_each_channel_continue(i, 0); 1779 } 1780 1781 /* This function marks the current trid as failed by storing the current ticks 1782 * and then sets the next trid to the active trid within a controller if exists. 1783 * 1784 * The purpose of the boolean return value is to request the caller to disconnect 1785 * the current trid now to try connecting the next trid. 1786 */ 1787 static bool 1788 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1789 { 1790 struct nvme_path_id *path_id, *next_path; 1791 int rc __attribute__((unused)); 1792 1793 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1794 assert(path_id); 1795 assert(path_id == nvme_ctrlr->active_path_id); 1796 next_path = TAILQ_NEXT(path_id, link); 1797 1798 /* Update the last failed time. It means the trid is failed if its last 1799 * failed time is non-zero. 1800 */ 1801 path_id->last_failed_tsc = spdk_get_ticks(); 1802 1803 if (next_path == NULL) { 1804 /* There is no alternate trid within a controller. */ 1805 return false; 1806 } 1807 1808 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1809 /* Connect is not retried in a controller reset sequence. Connecting 1810 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1811 */ 1812 return false; 1813 } 1814 1815 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1816 1817 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1818 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1819 1820 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1821 nvme_ctrlr->active_path_id = next_path; 1822 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1823 assert(rc == 0); 1824 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1825 if (!remove) { 1826 /** Shuffle the old trid to the end of the list and use the new one. 1827 * Allows for round robin through multiple connections. 1828 */ 1829 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1830 } else { 1831 free(path_id); 1832 } 1833 1834 if (start || next_path->last_failed_tsc == 0) { 1835 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1836 * or used yet. Try the next trid now. 1837 */ 1838 return true; 1839 } 1840 1841 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1842 nvme_ctrlr->opts.reconnect_delay_sec) { 1843 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1844 return true; 1845 } 1846 1847 /* The next trid will be tried after reconnect_delay_sec seconds. 
*/ 1848 return false; 1849 } 1850 1851 static bool 1852 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1853 { 1854 int32_t elapsed; 1855 1856 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1857 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1858 return false; 1859 } 1860 1861 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1862 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1863 return true; 1864 } else { 1865 return false; 1866 } 1867 } 1868 1869 static bool 1870 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1871 { 1872 uint32_t elapsed; 1873 1874 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1875 return false; 1876 } 1877 1878 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1879 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1880 return true; 1881 } else { 1882 return false; 1883 } 1884 } 1885 1886 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1887 1888 static void 1889 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1890 { 1891 int rc; 1892 1893 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1894 if (rc != 0) { 1895 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1896 * fail the reset sequence immediately. 1897 */ 1898 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1899 return; 1900 } 1901 1902 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1903 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1904 */ 1905 assert(nvme_ctrlr->disconnected_cb == NULL); 1906 nvme_ctrlr->disconnected_cb = cb_fn; 1907 1908 /* During disconnection, reduce the period to poll adminq more often. */ 1909 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1910 } 1911 1912 enum bdev_nvme_op_after_reset { 1913 OP_NONE, 1914 OP_COMPLETE_PENDING_DESTRUCT, 1915 OP_DESTRUCT, 1916 OP_DELAYED_RECONNECT, 1917 OP_FAILOVER, 1918 }; 1919 1920 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1921 1922 static _bdev_nvme_op_after_reset 1923 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1924 { 1925 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1926 /* Complete pending destruct after reset completes. 
*/ 1927 return OP_COMPLETE_PENDING_DESTRUCT; 1928 } else if (nvme_ctrlr->pending_failover) { 1929 nvme_ctrlr->pending_failover = false; 1930 nvme_ctrlr->reset_start_tsc = 0; 1931 return OP_FAILOVER; 1932 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1933 nvme_ctrlr->reset_start_tsc = 0; 1934 return OP_NONE; 1935 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1936 return OP_DESTRUCT; 1937 } else { 1938 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1939 nvme_ctrlr->fast_io_fail_timedout = true; 1940 } 1941 return OP_DELAYED_RECONNECT; 1942 } 1943 } 1944 1945 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1946 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1947 1948 static int 1949 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1950 { 1951 struct nvme_ctrlr *nvme_ctrlr = ctx; 1952 1953 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1954 pthread_mutex_lock(&nvme_ctrlr->mutex); 1955 1956 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1957 1958 if (!nvme_ctrlr->reconnect_is_delayed) { 1959 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1960 return SPDK_POLLER_BUSY; 1961 } 1962 1963 nvme_ctrlr->reconnect_is_delayed = false; 1964 1965 if (nvme_ctrlr->destruct) { 1966 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1967 return SPDK_POLLER_BUSY; 1968 } 1969 1970 assert(nvme_ctrlr->resetting == false); 1971 nvme_ctrlr->resetting = true; 1972 1973 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1974 1975 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1976 1977 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1978 return SPDK_POLLER_BUSY; 1979 } 1980 1981 static void 1982 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1983 { 1984 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1985 1986 assert(nvme_ctrlr->reconnect_is_delayed == false); 1987 nvme_ctrlr->reconnect_is_delayed = true; 1988 1989 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1990 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1991 nvme_ctrlr, 1992 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 1993 } 1994 1995 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 1996 1997 static void 1998 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 1999 { 2000 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2001 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2002 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2003 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2004 enum bdev_nvme_op_after_reset op_after_reset; 2005 2006 assert(nvme_ctrlr->thread == spdk_get_thread()); 2007 2008 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2009 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2010 2011 if (!success) { 2012 SPDK_ERRLOG("Resetting controller failed.\n"); 2013 } else { 2014 SPDK_NOTICELOG("Resetting controller successful.\n"); 2015 } 2016 2017 pthread_mutex_lock(&nvme_ctrlr->mutex); 2018 nvme_ctrlr->resetting = false; 2019 nvme_ctrlr->dont_retry = false; 2020 nvme_ctrlr->in_failover = false; 2021 2022 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2023 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2024 2025 if (ctrlr_op_cb_fn) { 2026 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2027 } 2028 2029 switch (op_after_reset) { 2030 case OP_COMPLETE_PENDING_DESTRUCT: 2031 nvme_ctrlr_unregister(nvme_ctrlr); 2032 break; 2033 case OP_DESTRUCT: 2034 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2035 remove_discovery_entry(nvme_ctrlr); 2036 break; 2037 case OP_DELAYED_RECONNECT: 2038 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2039 break; 2040 case OP_FAILOVER: 2041 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 2042 break; 2043 default: 2044 break; 2045 } 2046 } 2047 2048 static void 2049 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2050 { 2051 pthread_mutex_lock(&nvme_ctrlr->mutex); 2052 if (!success) { 2053 /* Connecting the active trid failed. Set the next alternate trid to the 2054 * active trid if it exists. 2055 */ 2056 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2057 /* The next alternate trid exists and is ready to try. Try it now. */ 2058 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2059 2060 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2061 return; 2062 } 2063 2064 /* We came here if there is no alternate trid or if the next trid exists but 2065 * is not ready to try. We will try the active trid after reconnect_delay_sec 2066 * seconds if it is non-zero or at the next reset call otherwise. 2067 */ 2068 } else { 2069 /* Connecting the active trid succeeded. Clear the last failed time because it 2070 * means the trid is failed if its last failed time is non-zero. 2071 */ 2072 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2073 } 2074 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2075 2076 /* Make sure we clear any pending resets before returning. */ 2077 spdk_for_each_channel(nvme_ctrlr, 2078 bdev_nvme_complete_pending_resets, 2079 success ? NULL : (void *)0x1, 2080 _bdev_nvme_reset_ctrlr_complete); 2081 } 2082 2083 static void 2084 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2085 { 2086 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2087 2088 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2089 } 2090 2091 static void 2092 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2093 { 2094 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2095 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2096 struct nvme_qpair *nvme_qpair; 2097 2098 nvme_qpair = ctrlr_ch->qpair; 2099 assert(nvme_qpair != NULL); 2100 2101 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2102 2103 if (nvme_qpair->qpair != NULL) { 2104 if (nvme_qpair->ctrlr->dont_retry) { 2105 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2106 } 2107 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2108 2109 /* The current full reset sequence will move to the next 2110 * ctrlr_channel after the qpair is actually disconnected. 2111 */ 2112 assert(ctrlr_ch->reset_iter == NULL); 2113 ctrlr_ch->reset_iter = i; 2114 } else { 2115 spdk_for_each_channel_continue(i, 0); 2116 } 2117 } 2118 2119 static void 2120 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2121 { 2122 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2123 2124 if (status == 0) { 2125 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2126 } else { 2127 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 2128 spdk_for_each_channel(nvme_ctrlr, 2129 bdev_nvme_reset_destroy_qpair, 2130 NULL, 2131 bdev_nvme_reset_create_qpairs_failed); 2132 } 2133 } 2134 2135 static int 2136 bdev_nvme_reset_check_qpair_connected(void *ctx) 2137 { 2138 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2139 2140 if (ctrlr_ch->reset_iter == NULL) { 2141 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2142 assert(ctrlr_ch->connect_poller == NULL); 2143 assert(ctrlr_ch->qpair->qpair == NULL); 2144 return SPDK_POLLER_BUSY; 2145 } 2146 2147 assert(ctrlr_ch->qpair->qpair != NULL); 2148 2149 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2150 return SPDK_POLLER_BUSY; 2151 } 2152 2153 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2154 2155 /* The qpair completed connecting. Move to the next ctrlr_channel. */ 2156 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2157 ctrlr_ch->reset_iter = NULL; 2158 2159 return SPDK_POLLER_BUSY; 2160 } 2161 2162 static void 2163 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2164 { 2165 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2166 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2167 int rc; 2168 2169 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2170 if (rc == 0) { 2171 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2172 ctrlr_ch, 0); 2173 2174 /* The current full reset sequence will move to the next 2175 * ctrlr_channel after the qpair is actually connected. 2176 */ 2177 assert(ctrlr_ch->reset_iter == NULL); 2178 ctrlr_ch->reset_iter = i; 2179 } else { 2180 spdk_for_each_channel_continue(i, rc); 2181 } 2182 } 2183 2184 static int 2185 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2186 { 2187 struct nvme_ctrlr *nvme_ctrlr = arg; 2188 int rc = -ETIMEDOUT; 2189 2190 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2191 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2192 if (rc == -EAGAIN) { 2193 return SPDK_POLLER_BUSY; 2194 } 2195 } 2196 2197 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2198 if (rc == 0) { 2199 /* Recreate all of the I/O queue pairs */ 2200 spdk_for_each_channel(nvme_ctrlr, 2201 bdev_nvme_reset_create_qpair, 2202 NULL, 2203 bdev_nvme_reset_create_qpairs_done); 2204 } else { 2205 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2206 } 2207 return SPDK_POLLER_BUSY; 2208 } 2209 2210 static void 2211 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2212 { 2213 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2214 2215 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2216 assert(nvme_ctrlr->reset_detach_poller == NULL); 2217 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2218 nvme_ctrlr, 0); 2219 } 2220 2221 static void 2222 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2223 { 2224 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2225 2226 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2227 assert(status == 0); 2228 2229 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2230 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2231 } else { 2232 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2233 } 2234 } 2235 2236 static void 2237 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2238 { 2239 spdk_for_each_channel(nvme_ctrlr, 2240 bdev_nvme_reset_destroy_qpair, 2241 NULL, 2242 bdev_nvme_reset_destroy_qpair_done); 
2243 } 2244 2245 static void 2246 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2247 { 2248 struct nvme_ctrlr *nvme_ctrlr = ctx; 2249 2250 assert(nvme_ctrlr->resetting == true); 2251 assert(nvme_ctrlr->thread == spdk_get_thread()); 2252 2253 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2254 2255 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2256 2257 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2258 } 2259 2260 static void 2261 _bdev_nvme_reset_ctrlr(void *ctx) 2262 { 2263 struct nvme_ctrlr *nvme_ctrlr = ctx; 2264 2265 assert(nvme_ctrlr->resetting == true); 2266 assert(nvme_ctrlr->thread == spdk_get_thread()); 2267 2268 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2269 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2270 } else { 2271 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2272 } 2273 } 2274 2275 static int 2276 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2277 { 2278 spdk_msg_fn msg_fn; 2279 2280 pthread_mutex_lock(&nvme_ctrlr->mutex); 2281 if (nvme_ctrlr->destruct) { 2282 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2283 return -ENXIO; 2284 } 2285 2286 if (nvme_ctrlr->resetting) { 2287 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2288 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2289 return -EBUSY; 2290 } 2291 2292 if (nvme_ctrlr->disabled) { 2293 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2294 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2295 return -EALREADY; 2296 } 2297 2298 nvme_ctrlr->resetting = true; 2299 nvme_ctrlr->dont_retry = true; 2300 2301 if (nvme_ctrlr->reconnect_is_delayed) { 2302 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2303 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2304 nvme_ctrlr->reconnect_is_delayed = false; 2305 } else { 2306 msg_fn = _bdev_nvme_reset_ctrlr; 2307 assert(nvme_ctrlr->reset_start_tsc == 0); 2308 } 2309 2310 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2311 2312 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2313 2314 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2315 return 0; 2316 } 2317 2318 static int 2319 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2320 { 2321 pthread_mutex_lock(&nvme_ctrlr->mutex); 2322 if (nvme_ctrlr->destruct) { 2323 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2324 return -ENXIO; 2325 } 2326 2327 if (nvme_ctrlr->resetting) { 2328 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2329 return -EBUSY; 2330 } 2331 2332 if (!nvme_ctrlr->disabled) { 2333 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2334 return -EALREADY; 2335 } 2336 2337 nvme_ctrlr->disabled = false; 2338 nvme_ctrlr->resetting = true; 2339 2340 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2341 2342 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2343 2344 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2345 return 0; 2346 } 2347 2348 static void 2349 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2350 { 2351 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2352 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2353 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2354 enum bdev_nvme_op_after_reset op_after_disable; 2355 2356 assert(nvme_ctrlr->thread == spdk_get_thread()); 2357 2358 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2359 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2360 2361 pthread_mutex_lock(&nvme_ctrlr->mutex); 2362 2363 nvme_ctrlr->resetting = false; 2364 nvme_ctrlr->dont_retry = false; 2365 2366 op_after_disable = 
bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2367 2368 nvme_ctrlr->disabled = true; 2369 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2370 2371 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2372 2373 if (ctrlr_op_cb_fn) { 2374 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2375 } 2376 2377 switch (op_after_disable) { 2378 case OP_COMPLETE_PENDING_DESTRUCT: 2379 nvme_ctrlr_unregister(nvme_ctrlr); 2380 break; 2381 default: 2382 break; 2383 } 2384 2385 } 2386 2387 static void 2388 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2389 { 2390 /* Make sure we clear any pending resets before returning. */ 2391 spdk_for_each_channel(nvme_ctrlr, 2392 bdev_nvme_complete_pending_resets, 2393 NULL, 2394 _bdev_nvme_disable_ctrlr_complete); 2395 } 2396 2397 static void 2398 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2399 { 2400 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2401 2402 assert(status == 0); 2403 2404 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2405 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2406 } else { 2407 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2408 } 2409 } 2410 2411 static void 2412 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2413 { 2414 spdk_for_each_channel(nvme_ctrlr, 2415 bdev_nvme_reset_destroy_qpair, 2416 NULL, 2417 bdev_nvme_disable_destroy_qpairs_done); 2418 } 2419 2420 static void 2421 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2422 { 2423 struct nvme_ctrlr *nvme_ctrlr = ctx; 2424 2425 assert(nvme_ctrlr->resetting == true); 2426 assert(nvme_ctrlr->thread == spdk_get_thread()); 2427 2428 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2429 2430 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2431 } 2432 2433 static void 2434 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2435 { 2436 struct nvme_ctrlr *nvme_ctrlr = ctx; 2437 2438 assert(nvme_ctrlr->resetting == true); 2439 assert(nvme_ctrlr->thread == spdk_get_thread()); 2440 2441 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2442 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2443 } else { 2444 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2445 } 2446 } 2447 2448 static int 2449 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2450 { 2451 spdk_msg_fn msg_fn; 2452 2453 pthread_mutex_lock(&nvme_ctrlr->mutex); 2454 if (nvme_ctrlr->destruct) { 2455 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2456 return -ENXIO; 2457 } 2458 2459 if (nvme_ctrlr->resetting) { 2460 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2461 return -EBUSY; 2462 } 2463 2464 if (nvme_ctrlr->disabled) { 2465 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2466 return -EALREADY; 2467 } 2468 2469 nvme_ctrlr->resetting = true; 2470 nvme_ctrlr->dont_retry = true; 2471 2472 if (nvme_ctrlr->reconnect_is_delayed) { 2473 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2474 nvme_ctrlr->reconnect_is_delayed = false; 2475 } else { 2476 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2477 } 2478 2479 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2480 2481 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2482 2483 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2484 return 0; 2485 } 2486 2487 static int 2488 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2489 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2490 { 2491 int rc; 2492 2493 switch (op) { 2494 case NVME_CTRLR_OP_RESET: 2495 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2496 break; 2497 case 
NVME_CTRLR_OP_ENABLE: 2498 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2499 break; 2500 case NVME_CTRLR_OP_DISABLE: 2501 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2502 break; 2503 default: 2504 rc = -EINVAL; 2505 break; 2506 } 2507 2508 if (rc == 0) { 2509 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2510 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2511 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2512 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2513 } 2514 return rc; 2515 } 2516 2517 struct nvme_ctrlr_op_rpc_ctx { 2518 struct nvme_ctrlr *nvme_ctrlr; 2519 struct spdk_thread *orig_thread; 2520 enum nvme_ctrlr_op op; 2521 int rc; 2522 bdev_nvme_ctrlr_op_cb cb_fn; 2523 void *cb_arg; 2524 }; 2525 2526 static void 2527 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2528 { 2529 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2530 2531 assert(ctx != NULL); 2532 assert(ctx->cb_fn != NULL); 2533 2534 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2535 2536 free(ctx); 2537 } 2538 2539 static void 2540 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2541 { 2542 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2543 2544 ctx->rc = rc; 2545 2546 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2547 } 2548 2549 void 2550 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2551 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2552 { 2553 struct nvme_ctrlr_op_rpc_ctx *ctx; 2554 int rc; 2555 2556 assert(cb_fn != NULL); 2557 2558 ctx = calloc(1, sizeof(*ctx)); 2559 if (ctx == NULL) { 2560 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2561 cb_fn(cb_arg, -ENOMEM); 2562 return; 2563 } 2564 2565 ctx->orig_thread = spdk_get_thread(); 2566 ctx->cb_fn = cb_fn; 2567 ctx->cb_arg = cb_arg; 2568 2569 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2570 if (rc == 0) { 2571 return; 2572 } else if (rc == -EALREADY) { 2573 rc = 0; 2574 } 2575 2576 nvme_ctrlr_op_rpc_complete(ctx, rc); 2577 } 2578 2579 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2580 2581 static void 2582 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2583 { 2584 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2585 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2586 int rc; 2587 2588 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2589 ctx->nvme_ctrlr = NULL; 2590 2591 if (ctx->rc != 0) { 2592 goto complete; 2593 } 2594 2595 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2596 if (next_nvme_ctrlr == NULL) { 2597 goto complete; 2598 } 2599 2600 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2601 if (rc == 0) { 2602 ctx->nvme_ctrlr = next_nvme_ctrlr; 2603 return; 2604 } else if (rc == -EALREADY) { 2605 ctx->nvme_ctrlr = next_nvme_ctrlr; 2606 rc = 0; 2607 } 2608 2609 ctx->rc = rc; 2610 2611 complete: 2612 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2613 free(ctx); 2614 } 2615 2616 static void 2617 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2618 { 2619 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2620 2621 ctx->rc = rc; 2622 2623 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2624 } 2625 2626 void 2627 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2628 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2629 { 2630 struct nvme_ctrlr_op_rpc_ctx *ctx; 2631 struct nvme_ctrlr *nvme_ctrlr; 2632 int rc; 2633 2634 assert(cb_fn != NULL); 2635 2636 ctx = calloc(1, sizeof(*ctx)); 2637 if (ctx == NULL) { 2638 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2639 cb_fn(cb_arg, -ENOMEM); 2640 return; 2641 } 2642 2643 
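/* Added explanatory note (editorial, not original author text): nvme_bdev_ctrlr_op_rpc()
 * applies the requested operation to each nvme_ctrlr of the nbdev_ctrlr sequentially.
 * ctx->orig_thread records the calling thread so that each per-controller completion is
 * bounced back to it via spdk_thread_send_msg() (see nvme_bdev_ctrlr_op_rpc_continue()
 * above) before the next controller in the list is handled.
 */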
ctx->orig_thread = spdk_get_thread(); 2644 ctx->op = op; 2645 ctx->cb_fn = cb_fn; 2646 ctx->cb_arg = cb_arg; 2647 2648 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2649 assert(nvme_ctrlr != NULL); 2650 2651 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2652 if (rc == 0) { 2653 ctx->nvme_ctrlr = nvme_ctrlr; 2654 return; 2655 } else if (rc == -EALREADY) { 2656 ctx->nvme_ctrlr = nvme_ctrlr; 2657 rc = 0; 2658 } 2659 2660 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2661 } 2662 2663 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2664 2665 static void 2666 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2667 { 2668 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2669 enum spdk_bdev_io_status io_status; 2670 2671 if (bio->cpl.cdw0 == 0) { 2672 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2673 } else { 2674 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2675 } 2676 2677 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2678 } 2679 2680 static void 2681 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2682 { 2683 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2684 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2685 2686 bdev_nvme_abort_retry_ios(nbdev_ch); 2687 2688 spdk_for_each_channel_continue(i, 0); 2689 } 2690 2691 static void 2692 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2693 { 2694 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2695 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2696 2697 /* Abort all queued I/Os for retry. */ 2698 spdk_for_each_channel(nbdev, 2699 bdev_nvme_abort_bdev_channel, 2700 bio, 2701 _bdev_nvme_reset_io_complete); 2702 } 2703 2704 static void 2705 _bdev_nvme_reset_io_continue(void *ctx) 2706 { 2707 struct nvme_bdev_io *bio = ctx; 2708 struct nvme_io_path *prev_io_path, *next_io_path; 2709 int rc; 2710 2711 prev_io_path = bio->io_path; 2712 bio->io_path = NULL; 2713 2714 if (bio->cpl.cdw0 != 0) { 2715 goto complete; 2716 } 2717 2718 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2719 if (next_io_path == NULL) { 2720 goto complete; 2721 } 2722 2723 rc = _bdev_nvme_reset_io(next_io_path, bio); 2724 if (rc == 0) { 2725 return; 2726 } 2727 2728 bio->cpl.cdw0 = 1; 2729 2730 complete: 2731 bdev_nvme_reset_io_complete(bio); 2732 } 2733 2734 static void 2735 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2736 { 2737 struct nvme_bdev_io *bio = cb_arg; 2738 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2739 2740 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2741 2742 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2743 } 2744 2745 static int 2746 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2747 { 2748 struct nvme_ctrlr_channel *ctrlr_ch; 2749 struct spdk_bdev_io *bdev_io; 2750 int rc; 2751 2752 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2753 bdev_nvme_reset_io_continue, bio); 2754 if (rc == 0) { 2755 assert(bio->io_path == NULL); 2756 bio->io_path = io_path; 2757 } else if (rc == -EBUSY) { 2758 ctrlr_ch = io_path->qpair->ctrlr_ch; 2759 assert(ctrlr_ch != NULL); 2760 /* 2761 * Reset call is queued only if it is from the app framework. This is on purpose so that 2762 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2763 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
2764 */ 2765 bdev_io = spdk_bdev_io_from_ctx(bio); 2766 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2767 rc = 0; 2768 } 2769 2770 return rc; 2771 } 2772 2773 static void 2774 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2775 { 2776 struct nvme_io_path *io_path; 2777 int rc; 2778 2779 bio->cpl.cdw0 = 0; 2780 2781 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2782 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2783 assert(io_path != NULL); 2784 2785 rc = _bdev_nvme_reset_io(io_path, bio); 2786 if (rc != 0) { 2787 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2788 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2789 } 2790 } 2791 2792 static int 2793 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2794 { 2795 if (nvme_ctrlr->destruct) { 2796 /* Don't bother resetting if the controller is in the process of being destructed. */ 2797 return -ENXIO; 2798 } 2799 2800 if (nvme_ctrlr->resetting) { 2801 if (!nvme_ctrlr->in_failover) { 2802 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2803 2804 /* Defer failover until reset completes. */ 2805 nvme_ctrlr->pending_failover = true; 2806 return -EINPROGRESS; 2807 } else { 2808 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2809 return -EBUSY; 2810 } 2811 } 2812 2813 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2814 2815 if (nvme_ctrlr->reconnect_is_delayed) { 2816 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2817 2818 /* We rely on the next reconnect for the failover. */ 2819 return -EALREADY; 2820 } 2821 2822 if (nvme_ctrlr->disabled) { 2823 SPDK_NOTICELOG("Controller is disabled.\n"); 2824 2825 /* We rely on the enablement for the failover. 
*/ 2826 return -EALREADY; 2827 } 2828 2829 nvme_ctrlr->resetting = true; 2830 nvme_ctrlr->in_failover = true; 2831 2832 assert(nvme_ctrlr->reset_start_tsc == 0); 2833 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2834 2835 return 0; 2836 } 2837 2838 static int 2839 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2840 { 2841 int rc; 2842 2843 pthread_mutex_lock(&nvme_ctrlr->mutex); 2844 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, remove); 2845 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2846 2847 if (rc == 0) { 2848 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2849 } else if (rc == -EALREADY) { 2850 rc = 0; 2851 } 2852 2853 return rc; 2854 } 2855 2856 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2857 uint64_t num_blocks); 2858 2859 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2860 uint64_t num_blocks); 2861 2862 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2863 uint64_t src_offset_blocks, 2864 uint64_t num_blocks); 2865 2866 static void 2867 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2868 bool success) 2869 { 2870 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2871 struct spdk_bdev *bdev = bdev_io->bdev; 2872 int ret; 2873 2874 if (!success) { 2875 ret = -EINVAL; 2876 goto exit; 2877 } 2878 2879 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2880 ret = -ENXIO; 2881 goto exit; 2882 } 2883 2884 ret = bdev_nvme_readv(bio, 2885 bdev_io->u.bdev.iovs, 2886 bdev_io->u.bdev.iovcnt, 2887 bdev_io->u.bdev.md_buf, 2888 bdev_io->u.bdev.num_blocks, 2889 bdev_io->u.bdev.offset_blocks, 2890 bdev->dif_check_flags, 2891 bdev_io->u.bdev.memory_domain, 2892 bdev_io->u.bdev.memory_domain_ctx); 2893 2894 exit: 2895 if (spdk_unlikely(ret != 0)) { 2896 bdev_nvme_io_complete(bio, ret); 2897 } 2898 } 2899 2900 static inline void 2901 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2902 { 2903 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2904 struct spdk_bdev *bdev = bdev_io->bdev; 2905 struct nvme_bdev_io *nbdev_io_to_abort; 2906 int rc = 0; 2907 2908 switch (bdev_io->type) { 2909 case SPDK_BDEV_IO_TYPE_READ: 2910 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2911 rc = bdev_nvme_readv(nbdev_io, 2912 bdev_io->u.bdev.iovs, 2913 bdev_io->u.bdev.iovcnt, 2914 bdev_io->u.bdev.md_buf, 2915 bdev_io->u.bdev.num_blocks, 2916 bdev_io->u.bdev.offset_blocks, 2917 bdev->dif_check_flags, 2918 bdev_io->u.bdev.memory_domain, 2919 bdev_io->u.bdev.memory_domain_ctx); 2920 } else { 2921 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2922 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2923 rc = 0; 2924 } 2925 break; 2926 case SPDK_BDEV_IO_TYPE_WRITE: 2927 rc = bdev_nvme_writev(nbdev_io, 2928 bdev_io->u.bdev.iovs, 2929 bdev_io->u.bdev.iovcnt, 2930 bdev_io->u.bdev.md_buf, 2931 bdev_io->u.bdev.num_blocks, 2932 bdev_io->u.bdev.offset_blocks, 2933 bdev->dif_check_flags, 2934 bdev_io->u.bdev.memory_domain, 2935 bdev_io->u.bdev.memory_domain_ctx); 2936 break; 2937 case SPDK_BDEV_IO_TYPE_COMPARE: 2938 rc = bdev_nvme_comparev(nbdev_io, 2939 bdev_io->u.bdev.iovs, 2940 bdev_io->u.bdev.iovcnt, 2941 bdev_io->u.bdev.md_buf, 2942 bdev_io->u.bdev.num_blocks, 2943 bdev_io->u.bdev.offset_blocks, 2944 bdev->dif_check_flags); 2945 break; 2946 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2947 rc = 
bdev_nvme_comparev_and_writev(nbdev_io, 2948 bdev_io->u.bdev.iovs, 2949 bdev_io->u.bdev.iovcnt, 2950 bdev_io->u.bdev.fused_iovs, 2951 bdev_io->u.bdev.fused_iovcnt, 2952 bdev_io->u.bdev.md_buf, 2953 bdev_io->u.bdev.num_blocks, 2954 bdev_io->u.bdev.offset_blocks, 2955 bdev->dif_check_flags); 2956 break; 2957 case SPDK_BDEV_IO_TYPE_UNMAP: 2958 rc = bdev_nvme_unmap(nbdev_io, 2959 bdev_io->u.bdev.offset_blocks, 2960 bdev_io->u.bdev.num_blocks); 2961 break; 2962 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2963 rc = bdev_nvme_write_zeroes(nbdev_io, 2964 bdev_io->u.bdev.offset_blocks, 2965 bdev_io->u.bdev.num_blocks); 2966 break; 2967 case SPDK_BDEV_IO_TYPE_RESET: 2968 nbdev_io->io_path = NULL; 2969 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2970 return; 2971 2972 case SPDK_BDEV_IO_TYPE_FLUSH: 2973 bdev_nvme_io_complete(nbdev_io, 0); 2974 return; 2975 2976 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2977 rc = bdev_nvme_zone_appendv(nbdev_io, 2978 bdev_io->u.bdev.iovs, 2979 bdev_io->u.bdev.iovcnt, 2980 bdev_io->u.bdev.md_buf, 2981 bdev_io->u.bdev.num_blocks, 2982 bdev_io->u.bdev.offset_blocks, 2983 bdev->dif_check_flags); 2984 break; 2985 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 2986 rc = bdev_nvme_get_zone_info(nbdev_io, 2987 bdev_io->u.zone_mgmt.zone_id, 2988 bdev_io->u.zone_mgmt.num_zones, 2989 bdev_io->u.zone_mgmt.buf); 2990 break; 2991 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 2992 rc = bdev_nvme_zone_management(nbdev_io, 2993 bdev_io->u.zone_mgmt.zone_id, 2994 bdev_io->u.zone_mgmt.zone_action); 2995 break; 2996 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 2997 nbdev_io->io_path = NULL; 2998 bdev_nvme_admin_passthru(nbdev_ch, 2999 nbdev_io, 3000 &bdev_io->u.nvme_passthru.cmd, 3001 bdev_io->u.nvme_passthru.buf, 3002 bdev_io->u.nvme_passthru.nbytes); 3003 return; 3004 3005 case SPDK_BDEV_IO_TYPE_NVME_IO: 3006 rc = bdev_nvme_io_passthru(nbdev_io, 3007 &bdev_io->u.nvme_passthru.cmd, 3008 bdev_io->u.nvme_passthru.buf, 3009 bdev_io->u.nvme_passthru.nbytes); 3010 break; 3011 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3012 rc = bdev_nvme_io_passthru_md(nbdev_io, 3013 &bdev_io->u.nvme_passthru.cmd, 3014 bdev_io->u.nvme_passthru.buf, 3015 bdev_io->u.nvme_passthru.nbytes, 3016 bdev_io->u.nvme_passthru.md_buf, 3017 bdev_io->u.nvme_passthru.md_len); 3018 break; 3019 case SPDK_BDEV_IO_TYPE_ABORT: 3020 nbdev_io->io_path = NULL; 3021 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3022 bdev_nvme_abort(nbdev_ch, 3023 nbdev_io, 3024 nbdev_io_to_abort); 3025 return; 3026 3027 case SPDK_BDEV_IO_TYPE_COPY: 3028 rc = bdev_nvme_copy(nbdev_io, 3029 bdev_io->u.bdev.offset_blocks, 3030 bdev_io->u.bdev.copy.src_offset_blocks, 3031 bdev_io->u.bdev.num_blocks); 3032 break; 3033 default: 3034 rc = -EINVAL; 3035 break; 3036 } 3037 3038 if (spdk_unlikely(rc != 0)) { 3039 bdev_nvme_io_complete(nbdev_io, rc); 3040 } 3041 } 3042 3043 static void 3044 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3045 { 3046 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3047 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3048 3049 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3050 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3051 } else { 3052 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3053 * We need to update submit_tsc here. 
3054 */ 3055 nbdev_io->submit_tsc = spdk_get_ticks(); 3056 } 3057 3058 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3059 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3060 if (spdk_unlikely(!nbdev_io->io_path)) { 3061 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3062 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3063 return; 3064 } 3065 3066 /* Admin commands do not use the optimal I/O path. 3067 * Simply fall through even if it is not found. 3068 */ 3069 } 3070 3071 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3072 } 3073 3074 static bool 3075 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3076 { 3077 struct nvme_bdev *nbdev = ctx; 3078 struct nvme_ns *nvme_ns; 3079 struct spdk_nvme_ns *ns; 3080 struct spdk_nvme_ctrlr *ctrlr; 3081 const struct spdk_nvme_ctrlr_data *cdata; 3082 3083 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3084 assert(nvme_ns != NULL); 3085 ns = nvme_ns->ns; 3086 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3087 3088 switch (io_type) { 3089 case SPDK_BDEV_IO_TYPE_READ: 3090 case SPDK_BDEV_IO_TYPE_WRITE: 3091 case SPDK_BDEV_IO_TYPE_RESET: 3092 case SPDK_BDEV_IO_TYPE_FLUSH: 3093 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3094 case SPDK_BDEV_IO_TYPE_NVME_IO: 3095 case SPDK_BDEV_IO_TYPE_ABORT: 3096 return true; 3097 3098 case SPDK_BDEV_IO_TYPE_COMPARE: 3099 return spdk_nvme_ns_supports_compare(ns); 3100 3101 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3102 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3103 3104 case SPDK_BDEV_IO_TYPE_UNMAP: 3105 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3106 return cdata->oncs.dsm; 3107 3108 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3109 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3110 return cdata->oncs.write_zeroes; 3111 3112 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3113 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3114 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3115 return true; 3116 } 3117 return false; 3118 3119 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3120 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3121 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3122 3123 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3124 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3125 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3126 3127 case SPDK_BDEV_IO_TYPE_COPY: 3128 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3129 return cdata->oncs.copy; 3130 3131 default: 3132 return false; 3133 } 3134 } 3135 3136 static int 3137 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3138 { 3139 struct nvme_qpair *nvme_qpair; 3140 struct spdk_io_channel *pg_ch; 3141 int rc; 3142 3143 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3144 if (!nvme_qpair) { 3145 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3146 return -1; 3147 } 3148 3149 TAILQ_INIT(&nvme_qpair->io_path_list); 3150 3151 nvme_qpair->ctrlr = nvme_ctrlr; 3152 nvme_qpair->ctrlr_ch = ctrlr_ch; 3153 3154 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3155 if (!pg_ch) { 3156 free(nvme_qpair); 3157 return -1; 3158 } 3159 3160 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3161 3162 #ifdef SPDK_CONFIG_VTUNE 3163 nvme_qpair->group->collect_spin_stat = true; 3164 #else 3165 nvme_qpair->group->collect_spin_stat = false; 3166 #endif 3167 3168 rc = bdev_nvme_create_qpair(nvme_qpair); 3169 if (rc != 0) { 3170 /* nvme_ctrlr can't create IO qpair if connection is down. 3171 * 3172 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3173 * after reconnect_delay_sec seconds. 
If bdev_retry_count is non-zero, 3174 * submitted IO will be queued until IO qpair is successfully created. 3175 * 3176 * Hence, if both are satisfied, ignore the failure. 3177 */ 3178 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3179 spdk_put_io_channel(pg_ch); 3180 free(nvme_qpair); 3181 return rc; 3182 } 3183 } 3184 3185 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3186 3187 ctrlr_ch->qpair = nvme_qpair; 3188 3189 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3190 nvme_qpair->ctrlr->ref++; 3191 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3192 3193 return 0; 3194 } 3195 3196 static int 3197 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3198 { 3199 struct nvme_ctrlr *nvme_ctrlr = io_device; 3200 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3201 3202 TAILQ_INIT(&ctrlr_ch->pending_resets); 3203 3204 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3205 } 3206 3207 static void 3208 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3209 { 3210 struct nvme_io_path *io_path, *next; 3211 3212 assert(nvme_qpair->group != NULL); 3213 3214 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3215 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3216 nvme_io_path_free(io_path); 3217 } 3218 3219 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3220 3221 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3222 3223 nvme_ctrlr_release(nvme_qpair->ctrlr); 3224 3225 free(nvme_qpair); 3226 } 3227 3228 static void 3229 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3230 { 3231 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3232 struct nvme_qpair *nvme_qpair; 3233 3234 nvme_qpair = ctrlr_ch->qpair; 3235 assert(nvme_qpair != NULL); 3236 3237 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3238 3239 if (nvme_qpair->qpair != NULL) { 3240 if (ctrlr_ch->reset_iter == NULL) { 3241 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3242 } else { 3243 /* Skip current ctrlr_channel in a full reset sequence because 3244 * it is being deleted now. The qpair is already being disconnected. 3245 * We do not have to restart disconnecting it. 3246 */ 3247 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3248 } 3249 3250 /* We cannot release a reference to the poll group now. 3251 * The qpair may be disconnected asynchronously later. 3252 * We need to poll it until it is actually disconnected. 3253 * Just detach the qpair from the deleting ctrlr_channel. 
3254 */ 3255 nvme_qpair->ctrlr_ch = NULL; 3256 } else { 3257 assert(ctrlr_ch->reset_iter == NULL); 3258 3259 nvme_qpair_delete(nvme_qpair); 3260 } 3261 } 3262 3263 static void 3264 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3265 uint32_t iov_cnt, uint32_t seed, 3266 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3267 { 3268 struct nvme_poll_group *group = ctx; 3269 int rc; 3270 3271 assert(cb_fn != NULL); 3272 3273 if (spdk_unlikely(!group->accel_channel)) { 3274 group->accel_channel = spdk_accel_get_io_channel(); 3275 if (!group->accel_channel) { 3276 cb_fn(cb_arg, -ENOMEM); 3277 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3278 group); 3279 return; 3280 } 3281 } 3282 3283 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3284 if (rc) { 3285 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3286 if (rc == -ENOMEM || rc == -EINVAL) { 3287 cb_fn(cb_arg, rc); 3288 } 3289 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3290 } 3291 } 3292 3293 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3294 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3295 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3296 }; 3297 3298 static int 3299 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3300 { 3301 struct nvme_poll_group *group = ctx_buf; 3302 3303 TAILQ_INIT(&group->qpair_list); 3304 3305 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3306 if (group->group == NULL) { 3307 return -1; 3308 } 3309 3310 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3311 3312 if (group->poller == NULL) { 3313 spdk_nvme_poll_group_destroy(group->group); 3314 return -1; 3315 } 3316 3317 return 0; 3318 } 3319 3320 static void 3321 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3322 { 3323 struct nvme_poll_group *group = ctx_buf; 3324 3325 assert(TAILQ_EMPTY(&group->qpair_list)); 3326 3327 if (group->accel_channel) { 3328 spdk_put_io_channel(group->accel_channel); 3329 } 3330 3331 spdk_poller_unregister(&group->poller); 3332 if (spdk_nvme_poll_group_destroy(group->group)) { 3333 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3334 assert(false); 3335 } 3336 } 3337 3338 static struct spdk_io_channel * 3339 bdev_nvme_get_io_channel(void *ctx) 3340 { 3341 struct nvme_bdev *nvme_bdev = ctx; 3342 3343 return spdk_get_io_channel(nvme_bdev); 3344 } 3345 3346 static void * 3347 bdev_nvme_get_module_ctx(void *ctx) 3348 { 3349 struct nvme_bdev *nvme_bdev = ctx; 3350 struct nvme_ns *nvme_ns; 3351 3352 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3353 return NULL; 3354 } 3355 3356 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3357 if (!nvme_ns) { 3358 return NULL; 3359 } 3360 3361 return nvme_ns->ns; 3362 } 3363 3364 static const char * 3365 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3366 { 3367 switch (ana_state) { 3368 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3369 return "optimized"; 3370 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3371 return "non_optimized"; 3372 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3373 return "inaccessible"; 3374 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3375 return "persistent_loss"; 3376 case SPDK_NVME_ANA_CHANGE_STATE: 3377 return "change"; 3378 default: 3379 return NULL; 3380 } 3381 } 3382 3383 static int 3384 bdev_nvme_get_memory_domains(void 
*ctx, struct spdk_memory_domain **domains, int array_size) 3385 { 3386 struct spdk_memory_domain **_domains = NULL; 3387 struct nvme_bdev *nbdev = ctx; 3388 struct nvme_ns *nvme_ns; 3389 int i = 0, _array_size = array_size; 3390 int rc = 0; 3391 3392 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3393 if (domains && array_size >= i) { 3394 _domains = &domains[i]; 3395 } else { 3396 _domains = NULL; 3397 } 3398 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3399 if (rc > 0) { 3400 i += rc; 3401 if (_array_size >= rc) { 3402 _array_size -= rc; 3403 } else { 3404 _array_size = 0; 3405 } 3406 } else if (rc < 0) { 3407 return rc; 3408 } 3409 } 3410 3411 return i; 3412 } 3413 3414 static const char * 3415 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3416 { 3417 if (nvme_ctrlr->destruct) { 3418 return "deleting"; 3419 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3420 return "failed"; 3421 } else if (nvme_ctrlr->resetting) { 3422 return "resetting"; 3423 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3424 return "reconnect_is_delayed"; 3425 } else if (nvme_ctrlr->disabled) { 3426 return "disabled"; 3427 } else { 3428 return "enabled"; 3429 } 3430 } 3431 3432 void 3433 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3434 { 3435 struct spdk_nvme_transport_id *trid; 3436 const struct spdk_nvme_ctrlr_opts *opts; 3437 const struct spdk_nvme_ctrlr_data *cdata; 3438 struct nvme_path_id *path_id; 3439 3440 spdk_json_write_object_begin(w); 3441 3442 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3443 3444 #ifdef SPDK_CONFIG_NVME_CUSE 3445 size_t cuse_name_size = 128; 3446 char cuse_name[cuse_name_size]; 3447 3448 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3449 if (rc == 0) { 3450 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3451 } 3452 #endif 3453 trid = &nvme_ctrlr->active_path_id->trid; 3454 spdk_json_write_named_object_begin(w, "trid"); 3455 nvme_bdev_dump_trid_json(trid, w); 3456 spdk_json_write_object_end(w); 3457 3458 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3459 if (path_id != NULL) { 3460 spdk_json_write_named_array_begin(w, "alternate_trids"); 3461 do { 3462 trid = &path_id->trid; 3463 spdk_json_write_object_begin(w); 3464 nvme_bdev_dump_trid_json(trid, w); 3465 spdk_json_write_object_end(w); 3466 3467 path_id = TAILQ_NEXT(path_id, link); 3468 } while (path_id != NULL); 3469 spdk_json_write_array_end(w); 3470 } 3471 3472 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3473 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3474 3475 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3476 spdk_json_write_named_object_begin(w, "host"); 3477 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3478 spdk_json_write_named_string(w, "addr", opts->src_addr); 3479 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3480 spdk_json_write_object_end(w); 3481 3482 spdk_json_write_object_end(w); 3483 } 3484 3485 static void 3486 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3487 struct nvme_ns *nvme_ns) 3488 { 3489 struct spdk_nvme_ns *ns; 3490 struct spdk_nvme_ctrlr *ctrlr; 3491 const struct spdk_nvme_ctrlr_data *cdata; 3492 const struct spdk_nvme_transport_id *trid; 3493 union spdk_nvme_vs_register vs; 3494 const struct spdk_nvme_ns_data *nsdata; 3495 char buf[128]; 3496 3497 ns = nvme_ns->ns; 3498 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3499 3500 cdata = 
spdk_nvme_ctrlr_get_data(ctrlr); 3501 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3502 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3503 3504 spdk_json_write_object_begin(w); 3505 3506 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3507 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3508 } 3509 3510 spdk_json_write_named_object_begin(w, "trid"); 3511 3512 nvme_bdev_dump_trid_json(trid, w); 3513 3514 spdk_json_write_object_end(w); 3515 3516 #ifdef SPDK_CONFIG_NVME_CUSE 3517 size_t cuse_name_size = 128; 3518 char cuse_name[cuse_name_size]; 3519 3520 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3521 cuse_name, &cuse_name_size); 3522 if (rc == 0) { 3523 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3524 } 3525 #endif 3526 3527 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3528 3529 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3530 3531 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3532 3533 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3534 spdk_str_trim(buf); 3535 spdk_json_write_named_string(w, "model_number", buf); 3536 3537 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3538 spdk_str_trim(buf); 3539 spdk_json_write_named_string(w, "serial_number", buf); 3540 3541 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3542 spdk_str_trim(buf); 3543 spdk_json_write_named_string(w, "firmware_revision", buf); 3544 3545 if (cdata->subnqn[0] != '\0') { 3546 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3547 } 3548 3549 spdk_json_write_named_object_begin(w, "oacs"); 3550 3551 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3552 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3553 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3554 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3555 3556 spdk_json_write_object_end(w); 3557 3558 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3559 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3560 3561 spdk_json_write_object_end(w); 3562 3563 spdk_json_write_named_object_begin(w, "vs"); 3564 3565 spdk_json_write_name(w, "nvme_version"); 3566 if (vs.bits.ter) { 3567 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3568 } else { 3569 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3570 } 3571 3572 spdk_json_write_object_end(w); 3573 3574 nsdata = spdk_nvme_ns_get_data(ns); 3575 3576 spdk_json_write_named_object_begin(w, "ns_data"); 3577 3578 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3579 3580 if (cdata->cmic.ana_reporting) { 3581 spdk_json_write_named_string(w, "ana_state", 3582 _nvme_ana_state_str(nvme_ns->ana_state)); 3583 } 3584 3585 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3586 3587 spdk_json_write_object_end(w); 3588 3589 if (cdata->oacs.security) { 3590 spdk_json_write_named_object_begin(w, "security"); 3591 3592 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3593 3594 spdk_json_write_object_end(w); 3595 } 3596 3597 spdk_json_write_object_end(w); 3598 } 3599 3600 static const char * 3601 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3602 { 3603 switch (nbdev->mp_policy) { 3604 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3605 return "active_passive"; 3606 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3607 return "active_active"; 3608 default: 3609 assert(false); 3610 return "invalid"; 3611 } 3612 } 3613 
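/* Added explanatory note (editorial, illustrative only): bdev_nvme_dump_info_json()
 * below walks nvme_ns_list under the bdev mutex and calls nvme_namespace_info_json()
 * above for each namespace path. The resulting per-bdev info is shaped roughly like
 *
 *   "nvme": [
 *     {
 *       "trid": { ... },
 *       "ctrlr_data": { "cntlid": ..., "vendor_id": "0x....", ... },
 *       "vs": { "nvme_version": "..." },
 *       "ns_data": { "id": ..., "can_share": ... }
 *     }
 *   ],
 *   "mp_policy": "active_passive" | "active_active"
 *
 * The exact keys depend on transport and controller capabilities (e.g. "pci_address"
 * only for PCIe, "ana_state" only when ANA reporting is enabled).
 */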
3614 static int 3615 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3616 { 3617 struct nvme_bdev *nvme_bdev = ctx; 3618 struct nvme_ns *nvme_ns; 3619 3620 pthread_mutex_lock(&nvme_bdev->mutex); 3621 spdk_json_write_named_array_begin(w, "nvme"); 3622 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3623 nvme_namespace_info_json(w, nvme_ns); 3624 } 3625 spdk_json_write_array_end(w); 3626 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3627 pthread_mutex_unlock(&nvme_bdev->mutex); 3628 3629 return 0; 3630 } 3631 3632 static void 3633 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3634 { 3635 /* No config per bdev needed */ 3636 } 3637 3638 static uint64_t 3639 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3640 { 3641 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3642 struct nvme_io_path *io_path; 3643 struct nvme_poll_group *group; 3644 uint64_t spin_time = 0; 3645 3646 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3647 group = io_path->qpair->group; 3648 3649 if (!group || !group->collect_spin_stat) { 3650 continue; 3651 } 3652 3653 if (group->end_ticks != 0) { 3654 group->spin_ticks += (group->end_ticks - group->start_ticks); 3655 group->end_ticks = 0; 3656 } 3657 3658 spin_time += group->spin_ticks; 3659 group->start_ticks = 0; 3660 group->spin_ticks = 0; 3661 } 3662 3663 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3664 } 3665 3666 static void 3667 bdev_nvme_reset_device_stat(void *ctx) 3668 { 3669 struct nvme_bdev *nbdev = ctx; 3670 3671 if (nbdev->err_stat != NULL) { 3672 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3673 } 3674 } 3675 3676 /* The JSON string should be lowercase and underscore-delimited. 
*/ 3677 static void 3678 bdev_nvme_format_nvme_status(char *dst, const char *src) 3679 { 3680 char tmp[256]; 3681 3682 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3683 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3684 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3685 spdk_strlwr(dst); 3686 } 3687 3688 static void 3689 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3690 { 3691 struct nvme_bdev *nbdev = ctx; 3692 struct spdk_nvme_status status = {}; 3693 uint16_t sct, sc; 3694 char status_json[256]; 3695 const char *status_str; 3696 3697 if (nbdev->err_stat == NULL) { 3698 return; 3699 } 3700 3701 spdk_json_write_named_object_begin(w, "nvme_error"); 3702 3703 spdk_json_write_named_object_begin(w, "status_type"); 3704 for (sct = 0; sct < 8; sct++) { 3705 if (nbdev->err_stat->status_type[sct] == 0) { 3706 continue; 3707 } 3708 status.sct = sct; 3709 3710 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3711 assert(status_str != NULL); 3712 bdev_nvme_format_nvme_status(status_json, status_str); 3713 3714 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3715 } 3716 spdk_json_write_object_end(w); 3717 3718 spdk_json_write_named_object_begin(w, "status_code"); 3719 for (sct = 0; sct < 4; sct++) { 3720 status.sct = sct; 3721 for (sc = 0; sc < 256; sc++) { 3722 if (nbdev->err_stat->status[sct][sc] == 0) { 3723 continue; 3724 } 3725 status.sc = sc; 3726 3727 status_str = spdk_nvme_cpl_get_status_string(&status); 3728 assert(status_str != NULL); 3729 bdev_nvme_format_nvme_status(status_json, status_str); 3730 3731 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3732 } 3733 } 3734 spdk_json_write_object_end(w); 3735 3736 spdk_json_write_object_end(w); 3737 } 3738 3739 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3740 .destruct = bdev_nvme_destruct, 3741 .submit_request = bdev_nvme_submit_request, 3742 .io_type_supported = bdev_nvme_io_type_supported, 3743 .get_io_channel = bdev_nvme_get_io_channel, 3744 .dump_info_json = bdev_nvme_dump_info_json, 3745 .write_config_json = bdev_nvme_write_config_json, 3746 .get_spin_time = bdev_nvme_get_spin_time, 3747 .get_module_ctx = bdev_nvme_get_module_ctx, 3748 .get_memory_domains = bdev_nvme_get_memory_domains, 3749 .reset_device_stat = bdev_nvme_reset_device_stat, 3750 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3751 }; 3752 3753 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3754 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3755 3756 static int 3757 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3758 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3759 { 3760 struct spdk_nvme_ana_group_descriptor *copied_desc; 3761 uint8_t *orig_desc; 3762 uint32_t i, desc_size, copy_len; 3763 int rc = 0; 3764 3765 if (nvme_ctrlr->ana_log_page == NULL) { 3766 return -EINVAL; 3767 } 3768 3769 copied_desc = nvme_ctrlr->copied_ana_desc; 3770 3771 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3772 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3773 3774 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3775 memcpy(copied_desc, orig_desc, copy_len); 3776 3777 rc = cb_fn(copied_desc, cb_arg); 3778 if (rc != 0) { 3779 break; 3780 } 3781 3782 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3783 copied_desc->num_of_nsid * sizeof(uint32_t); 3784 orig_desc += desc_size; 3785 copy_len -= desc_size; 3786 } 3787 
3788 return rc; 3789 } 3790 3791 static int 3792 nvme_ns_ana_transition_timedout(void *ctx) 3793 { 3794 struct nvme_ns *nvme_ns = ctx; 3795 3796 spdk_poller_unregister(&nvme_ns->anatt_timer); 3797 nvme_ns->ana_transition_timedout = true; 3798 3799 return SPDK_POLLER_BUSY; 3800 } 3801 3802 static void 3803 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3804 const struct spdk_nvme_ana_group_descriptor *desc) 3805 { 3806 const struct spdk_nvme_ctrlr_data *cdata; 3807 3808 nvme_ns->ana_group_id = desc->ana_group_id; 3809 nvme_ns->ana_state = desc->ana_state; 3810 nvme_ns->ana_state_updating = false; 3811 3812 switch (nvme_ns->ana_state) { 3813 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3814 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3815 nvme_ns->ana_transition_timedout = false; 3816 spdk_poller_unregister(&nvme_ns->anatt_timer); 3817 break; 3818 3819 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3820 case SPDK_NVME_ANA_CHANGE_STATE: 3821 if (nvme_ns->anatt_timer != NULL) { 3822 break; 3823 } 3824 3825 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3826 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3827 nvme_ns, 3828 cdata->anatt * SPDK_SEC_TO_USEC); 3829 break; 3830 default: 3831 break; 3832 } 3833 } 3834 3835 static int 3836 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3837 { 3838 struct nvme_ns *nvme_ns = cb_arg; 3839 uint32_t i; 3840 3841 for (i = 0; i < desc->num_of_nsid; i++) { 3842 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3843 continue; 3844 } 3845 3846 _nvme_ns_set_ana_state(nvme_ns, desc); 3847 return 1; 3848 } 3849 3850 return 0; 3851 } 3852 3853 static struct spdk_uuid 3854 nvme_generate_uuid(const char *sn, uint32_t nsid) 3855 { 3856 struct spdk_uuid new_uuid, namespace_uuid; 3857 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3858 /* This namespace UUID was generated using uuid_generate() method. 
*/ 3859 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3860 int size; 3861 3862 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3863 3864 spdk_uuid_set_null(&new_uuid); 3865 spdk_uuid_set_null(&namespace_uuid); 3866 3867 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3868 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3869 3870 spdk_uuid_parse(&namespace_uuid, namespace_str); 3871 3872 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3873 3874 return new_uuid; 3875 } 3876 3877 static int 3878 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3879 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3880 uint32_t prchk_flags, void *ctx) 3881 { 3882 const struct spdk_uuid *uuid; 3883 const uint8_t *nguid; 3884 const struct spdk_nvme_ctrlr_data *cdata; 3885 const struct spdk_nvme_ns_data *nsdata; 3886 const struct spdk_nvme_ctrlr_opts *opts; 3887 enum spdk_nvme_csi csi; 3888 uint32_t atomic_bs, phys_bs, bs; 3889 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3890 3891 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3892 csi = spdk_nvme_ns_get_csi(ns); 3893 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3894 3895 switch (csi) { 3896 case SPDK_NVME_CSI_NVM: 3897 disk->product_name = "NVMe disk"; 3898 break; 3899 case SPDK_NVME_CSI_ZNS: 3900 disk->product_name = "NVMe ZNS disk"; 3901 disk->zoned = true; 3902 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3903 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 3904 spdk_nvme_ns_get_extended_sector_size(ns); 3905 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 3906 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 3907 break; 3908 default: 3909 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 3910 return -ENOTSUP; 3911 } 3912 3913 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 3914 if (!disk->name) { 3915 return -ENOMEM; 3916 } 3917 3918 disk->write_cache = 0; 3919 if (cdata->vwc.present) { 3920 /* Enable if the Volatile Write Cache exists */ 3921 disk->write_cache = 1; 3922 } 3923 if (cdata->oncs.write_zeroes) { 3924 disk->max_write_zeroes = UINT16_MAX + 1; 3925 } 3926 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 3927 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 3928 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 3929 /* NVMe driver will split one request into multiple requests 3930 * based on MDTS and stripe boundary, the bdev layer will use 3931 * max_segment_size and max_num_segments to split one big IO 3932 * into multiple requests, then small request can't run out 3933 * of NVMe internal requests data structure. 
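	 * For example, if io_queue_requests is 1024, max_num_segments below becomes 512,
	 * so a single bdev I/O carries at most 512 iovecs and the children produced by
	 * MDTS/stripe splitting still leave headroom in the queue pair's request pool.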
3934 */ 3935 if (opts && opts->io_queue_requests) { 3936 disk->max_num_segments = opts->io_queue_requests / 2; 3937 } 3938 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 3939 3940 nguid = spdk_nvme_ns_get_nguid(ns); 3941 if (!nguid) { 3942 uuid = spdk_nvme_ns_get_uuid(ns); 3943 if (uuid) { 3944 disk->uuid = *uuid; 3945 } else if (g_opts.generate_uuids) { 3946 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 3947 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 3948 } 3949 } else { 3950 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 3951 } 3952 3953 nsdata = spdk_nvme_ns_get_data(ns); 3954 bs = spdk_nvme_ns_get_sector_size(ns); 3955 atomic_bs = bs; 3956 phys_bs = bs; 3957 if (nsdata->nabo == 0) { 3958 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 3959 atomic_bs = bs * (1 + nsdata->nawupf); 3960 } else { 3961 atomic_bs = bs * (1 + cdata->awupf); 3962 } 3963 } 3964 if (nsdata->nsfeat.optperf) { 3965 phys_bs = bs * (1 + nsdata->npwg); 3966 } 3967 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 3968 3969 disk->md_len = spdk_nvme_ns_get_md_size(ns); 3970 if (disk->md_len != 0) { 3971 disk->md_interleave = nsdata->flbas.extended; 3972 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 3973 if (disk->dif_type != SPDK_DIF_DISABLE) { 3974 disk->dif_is_head_of_md = nsdata->dps.md_start; 3975 disk->dif_check_flags = prchk_flags; 3976 } 3977 } 3978 3979 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 3980 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 3981 disk->acwu = 0; 3982 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 3983 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 3984 } else { 3985 disk->acwu = cdata->acwu + 1; /* 0-based */ 3986 } 3987 3988 if (cdata->oncs.copy) { 3989 /* For now bdev interface allows only single segment copy */ 3990 disk->max_copy = nsdata->mssrl; 3991 } 3992 3993 disk->ctxt = ctx; 3994 disk->fn_table = &nvmelib_fn_table; 3995 disk->module = &nvme_if; 3996 3997 return 0; 3998 } 3999 4000 static struct nvme_bdev * 4001 nvme_bdev_alloc(void) 4002 { 4003 struct nvme_bdev *bdev; 4004 int rc; 4005 4006 bdev = calloc(1, sizeof(*bdev)); 4007 if (!bdev) { 4008 SPDK_ERRLOG("bdev calloc() failed\n"); 4009 return NULL; 4010 } 4011 4012 if (g_opts.nvme_error_stat) { 4013 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4014 if (!bdev->err_stat) { 4015 SPDK_ERRLOG("err_stat calloc() failed\n"); 4016 free(bdev); 4017 return NULL; 4018 } 4019 } 4020 4021 rc = pthread_mutex_init(&bdev->mutex, NULL); 4022 if (rc != 0) { 4023 free(bdev->err_stat); 4024 free(bdev); 4025 return NULL; 4026 } 4027 4028 bdev->ref = 1; 4029 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4030 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4031 bdev->rr_min_io = UINT32_MAX; 4032 TAILQ_INIT(&bdev->nvme_ns_list); 4033 4034 return bdev; 4035 } 4036 4037 static int 4038 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4039 { 4040 struct nvme_bdev *bdev; 4041 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4042 int rc; 4043 4044 bdev = nvme_bdev_alloc(); 4045 if (bdev == NULL) { 4046 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4047 return -ENOMEM; 4048 } 4049 4050 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4051 4052 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4053 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4054 if (rc != 0) { 4055 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4056 nvme_bdev_free(bdev); 4057 return rc; 4058 } 4059 
4060 spdk_io_device_register(bdev, 4061 bdev_nvme_create_bdev_channel_cb, 4062 bdev_nvme_destroy_bdev_channel_cb, 4063 sizeof(struct nvme_bdev_channel), 4064 bdev->disk.name); 4065 4066 nvme_ns->bdev = bdev; 4067 bdev->nsid = nvme_ns->id; 4068 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4069 4070 bdev->nbdev_ctrlr = nbdev_ctrlr; 4071 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4072 4073 rc = spdk_bdev_register(&bdev->disk); 4074 if (rc != 0) { 4075 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4076 spdk_io_device_unregister(bdev, NULL); 4077 nvme_ns->bdev = NULL; 4078 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4079 nvme_bdev_free(bdev); 4080 return rc; 4081 } 4082 4083 return 0; 4084 } 4085 4086 static bool 4087 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4088 { 4089 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4090 const struct spdk_uuid *uuid1, *uuid2; 4091 4092 nsdata1 = spdk_nvme_ns_get_data(ns1); 4093 nsdata2 = spdk_nvme_ns_get_data(ns2); 4094 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4095 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4096 4097 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4098 nsdata1->eui64 == nsdata2->eui64 && 4099 ((uuid1 == NULL && uuid2 == NULL) || 4100 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4101 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4102 } 4103 4104 static bool 4105 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4106 struct spdk_nvme_ctrlr_opts *opts) 4107 { 4108 struct nvme_probe_skip_entry *entry; 4109 4110 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4111 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4112 return false; 4113 } 4114 } 4115 4116 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4117 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4118 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4119 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4120 opts->disable_read_ana_log_page = true; 4121 4122 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4123 4124 return true; 4125 } 4126 4127 static void 4128 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4129 { 4130 struct nvme_ctrlr *nvme_ctrlr = ctx; 4131 4132 if (spdk_nvme_cpl_is_error(cpl)) { 4133 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4134 cpl->status.sct); 4135 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4136 } else if (cpl->cdw0 & 0x1) { 4137 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4138 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4139 } 4140 } 4141 4142 static void 4143 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4144 struct spdk_nvme_qpair *qpair, uint16_t cid) 4145 { 4146 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4147 union spdk_nvme_csts_register csts; 4148 int rc; 4149 4150 assert(nvme_ctrlr->ctrlr == ctrlr); 4151 4152 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4153 4154 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4155 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4156 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4157 * completion recursively. 
4158 */ 4159 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4160 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4161 if (csts.bits.cfs) { 4162 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4163 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4164 return; 4165 } 4166 } 4167 4168 switch (g_opts.action_on_timeout) { 4169 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4170 if (qpair) { 4171 /* Don't send abort to ctrlr when ctrlr is not available. */ 4172 pthread_mutex_lock(&nvme_ctrlr->mutex); 4173 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4174 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4175 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4176 return; 4177 } 4178 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4179 4180 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4181 nvme_abort_cpl, nvme_ctrlr); 4182 if (rc == 0) { 4183 return; 4184 } 4185 4186 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4187 } 4188 4189 /* FALLTHROUGH */ 4190 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4191 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4192 break; 4193 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4194 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4195 break; 4196 default: 4197 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4198 break; 4199 } 4200 } 4201 4202 static struct nvme_ns * 4203 nvme_ns_alloc(void) 4204 { 4205 struct nvme_ns *nvme_ns; 4206 4207 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4208 if (nvme_ns == NULL) { 4209 return NULL; 4210 } 4211 4212 if (g_opts.io_path_stat) { 4213 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4214 if (nvme_ns->stat == NULL) { 4215 free(nvme_ns); 4216 return NULL; 4217 } 4218 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4219 } 4220 4221 return nvme_ns; 4222 } 4223 4224 static void 4225 nvme_ns_free(struct nvme_ns *nvme_ns) 4226 { 4227 free(nvme_ns->stat); 4228 free(nvme_ns); 4229 } 4230 4231 static void 4232 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4233 { 4234 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4235 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4236 4237 if (rc == 0) { 4238 nvme_ns->probe_ctx = NULL; 4239 pthread_mutex_lock(&nvme_ctrlr->mutex); 4240 nvme_ctrlr->ref++; 4241 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4242 } else { 4243 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4244 nvme_ns_free(nvme_ns); 4245 } 4246 4247 if (ctx) { 4248 ctx->populates_in_progress--; 4249 if (ctx->populates_in_progress == 0) { 4250 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4251 } 4252 } 4253 } 4254 4255 static void 4256 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4257 { 4258 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4259 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4260 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4261 int rc; 4262 4263 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4264 if (rc != 0) { 4265 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4266 } 4267 4268 spdk_for_each_channel_continue(i, rc); 4269 } 4270 4271 static void 4272 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4273 { 4274 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4275 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4276 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4277 struct nvme_io_path *io_path; 4278 4279 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4280 if (io_path != NULL) { 4281 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4282 } 4283 4284 spdk_for_each_channel_continue(i, 0); 4285 } 4286 4287 static void 4288 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4289 { 4290 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4291 4292 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4293 } 4294 4295 static void 4296 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4297 { 4298 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4299 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4300 4301 if (status == 0) { 4302 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4303 } else { 4304 /* Delete the added io_paths and fail populating the namespace. */ 4305 spdk_for_each_channel(bdev, 4306 bdev_nvme_delete_io_path, 4307 nvme_ns, 4308 bdev_nvme_add_io_path_failed); 4309 } 4310 } 4311 4312 static int 4313 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4314 { 4315 struct nvme_ns *tmp_ns; 4316 const struct spdk_nvme_ns_data *nsdata; 4317 4318 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4319 if (!nsdata->nmic.can_share) { 4320 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4321 return -EINVAL; 4322 } 4323 4324 pthread_mutex_lock(&bdev->mutex); 4325 4326 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4327 assert(tmp_ns != NULL); 4328 4329 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4330 pthread_mutex_unlock(&bdev->mutex); 4331 SPDK_ERRLOG("Namespaces are not identical.\n"); 4332 return -EINVAL; 4333 } 4334 4335 bdev->ref++; 4336 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4337 nvme_ns->bdev = bdev; 4338 4339 pthread_mutex_unlock(&bdev->mutex); 4340 4341 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
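	 * This uses the usual spdk_for_each_channel() pattern: the per-channel callback
	 * (bdev_nvme_add_io_path) runs on the thread that owns each existing
	 * nvme_bdev_channel, and the completion callback (bdev_nvme_add_io_path_done)
	 * runs once after every channel has been visited. Roughly:
	 *
	 *     spdk_for_each_channel(io_device,       // here, the nvme_bdev
	 *                           per_channel_fn,  // runs on each channel's thread
	 *                           ctx,             // here, the new nvme_ns
	 *                           done_fn);        // runs once, after all channels
	 *
	 * On failure, the done callback deletes any partially added io_paths again.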
*/ 4342 spdk_for_each_channel(bdev, 4343 bdev_nvme_add_io_path, 4344 nvme_ns, 4345 bdev_nvme_add_io_path_done); 4346 4347 return 0; 4348 } 4349 4350 static void 4351 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4352 { 4353 struct spdk_nvme_ns *ns; 4354 struct nvme_bdev *bdev; 4355 int rc = 0; 4356 4357 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4358 if (!ns) { 4359 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4360 rc = -EINVAL; 4361 goto done; 4362 } 4363 4364 nvme_ns->ns = ns; 4365 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4366 4367 if (nvme_ctrlr->ana_log_page != NULL) { 4368 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4369 } 4370 4371 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4372 if (bdev == NULL) { 4373 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4374 } else { 4375 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4376 if (rc == 0) { 4377 return; 4378 } 4379 } 4380 done: 4381 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4382 } 4383 4384 static void 4385 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4386 { 4387 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4388 4389 assert(nvme_ctrlr != NULL); 4390 4391 pthread_mutex_lock(&nvme_ctrlr->mutex); 4392 4393 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4394 4395 if (nvme_ns->bdev != NULL) { 4396 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4397 return; 4398 } 4399 4400 nvme_ns_free(nvme_ns); 4401 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4402 4403 nvme_ctrlr_release(nvme_ctrlr); 4404 } 4405 4406 static void 4407 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4408 { 4409 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4410 4411 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4412 } 4413 4414 static void 4415 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4416 { 4417 struct nvme_bdev *bdev; 4418 4419 spdk_poller_unregister(&nvme_ns->anatt_timer); 4420 4421 bdev = nvme_ns->bdev; 4422 if (bdev != NULL) { 4423 pthread_mutex_lock(&bdev->mutex); 4424 4425 assert(bdev->ref > 0); 4426 bdev->ref--; 4427 if (bdev->ref == 0) { 4428 pthread_mutex_unlock(&bdev->mutex); 4429 4430 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4431 } else { 4432 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4433 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4434 * and clear nvme_ns->bdev here. 4435 */ 4436 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4437 nvme_ns->bdev = NULL; 4438 4439 pthread_mutex_unlock(&bdev->mutex); 4440 4441 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4442 * we call depopulate_namespace_done() to avoid use-after-free. 4443 */ 4444 spdk_for_each_channel(bdev, 4445 bdev_nvme_delete_io_path, 4446 nvme_ns, 4447 bdev_nvme_delete_io_path_done); 4448 return; 4449 } 4450 } 4451 4452 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4453 } 4454 4455 static void 4456 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4457 struct nvme_async_probe_ctx *ctx) 4458 { 4459 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4460 struct nvme_ns *nvme_ns, *next; 4461 struct spdk_nvme_ns *ns; 4462 struct nvme_bdev *bdev; 4463 uint32_t nsid; 4464 int rc; 4465 uint64_t num_sectors; 4466 4467 if (ctx) { 4468 /* Initialize this count to 1 to handle the populate functions 4469 * calling nvme_ctrlr_populate_namespace_done() immediately. 
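		 * This is a primed-counter pattern: starting at 1 guarantees that
		 * completions which happen synchronously inside the loop below can never
		 * drop the count to zero early. The matching decrement after the loop
		 * releases this initial "reference" and, if every namespace already
		 * finished, fires nvme_ctrlr_populate_namespaces_done() exactly once.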
4470 */ 4471 ctx->populates_in_progress = 1; 4472 } 4473 4474 /* First loop over our existing namespaces and see if they have been 4475 * removed. */ 4476 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4477 while (nvme_ns != NULL) { 4478 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4479 4480 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4481 /* NS is still there but attributes may have changed */ 4482 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4483 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4484 bdev = nvme_ns->bdev; 4485 assert(bdev != NULL); 4486 if (bdev->disk.blockcnt != num_sectors) { 4487 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4488 nvme_ns->id, 4489 bdev->disk.name, 4490 bdev->disk.blockcnt, 4491 num_sectors); 4492 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4493 if (rc != 0) { 4494 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4495 bdev->disk.name, rc); 4496 } 4497 } 4498 } else { 4499 /* Namespace was removed */ 4500 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4501 } 4502 4503 nvme_ns = next; 4504 } 4505 4506 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4507 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4508 while (nsid != 0) { 4509 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4510 4511 if (nvme_ns == NULL) { 4512 /* Found a new one */ 4513 nvme_ns = nvme_ns_alloc(); 4514 if (nvme_ns == NULL) { 4515 SPDK_ERRLOG("Failed to allocate namespace\n"); 4516 /* This just fails to attach the namespace. It may work on a future attempt. */ 4517 continue; 4518 } 4519 4520 nvme_ns->id = nsid; 4521 nvme_ns->ctrlr = nvme_ctrlr; 4522 4523 nvme_ns->bdev = NULL; 4524 4525 if (ctx) { 4526 ctx->populates_in_progress++; 4527 } 4528 nvme_ns->probe_ctx = ctx; 4529 4530 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4531 4532 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4533 } 4534 4535 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4536 } 4537 4538 if (ctx) { 4539 /* Decrement this count now that the loop is over to account 4540 * for the one we started with. If the count is then 0, we 4541 * know any populate_namespace functions completed immediately, 4542 * so we'll kick the callback here. 
4543 */ 4544 ctx->populates_in_progress--; 4545 if (ctx->populates_in_progress == 0) { 4546 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4547 } 4548 } 4549 4550 } 4551 4552 static void 4553 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4554 { 4555 struct nvme_ns *nvme_ns, *tmp; 4556 4557 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4558 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4559 } 4560 } 4561 4562 static uint32_t 4563 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4564 { 4565 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4566 const struct spdk_nvme_ctrlr_data *cdata; 4567 uint32_t nsid, ns_count = 0; 4568 4569 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4570 4571 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4572 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4573 ns_count++; 4574 } 4575 4576 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4577 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4578 sizeof(uint32_t); 4579 } 4580 4581 static int 4582 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4583 void *cb_arg) 4584 { 4585 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4586 struct nvme_ns *nvme_ns; 4587 uint32_t i, nsid; 4588 4589 for (i = 0; i < desc->num_of_nsid; i++) { 4590 nsid = desc->nsid[i]; 4591 if (nsid == 0) { 4592 continue; 4593 } 4594 4595 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4596 4597 assert(nvme_ns != NULL); 4598 if (nvme_ns == NULL) { 4599 /* Target told us that an inactive namespace had an ANA change */ 4600 continue; 4601 } 4602 4603 _nvme_ns_set_ana_state(nvme_ns, desc); 4604 } 4605 4606 return 0; 4607 } 4608 4609 static void 4610 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4611 { 4612 struct nvme_ns *nvme_ns; 4613 4614 spdk_free(nvme_ctrlr->ana_log_page); 4615 nvme_ctrlr->ana_log_page = NULL; 4616 4617 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4618 nvme_ns != NULL; 4619 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4620 nvme_ns->ana_state_updating = false; 4621 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4622 } 4623 } 4624 4625 static void 4626 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4627 { 4628 struct nvme_ctrlr *nvme_ctrlr = ctx; 4629 4630 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4631 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4632 nvme_ctrlr); 4633 } else { 4634 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4635 } 4636 4637 pthread_mutex_lock(&nvme_ctrlr->mutex); 4638 4639 assert(nvme_ctrlr->ana_log_page_updating == true); 4640 nvme_ctrlr->ana_log_page_updating = false; 4641 4642 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4643 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4644 4645 nvme_ctrlr_unregister(nvme_ctrlr); 4646 } else { 4647 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4648 4649 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4650 } 4651 } 4652 4653 static int 4654 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4655 { 4656 uint32_t ana_log_page_size; 4657 int rc; 4658 4659 if (nvme_ctrlr->ana_log_page == NULL) { 4660 return -EINVAL; 4661 } 4662 4663 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4664 4665 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4666 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4667 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4668 
return -EINVAL; 4669 } 4670 4671 pthread_mutex_lock(&nvme_ctrlr->mutex); 4672 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4673 nvme_ctrlr->ana_log_page_updating) { 4674 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4675 return -EBUSY; 4676 } 4677 4678 nvme_ctrlr->ana_log_page_updating = true; 4679 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4680 4681 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4682 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4683 SPDK_NVME_GLOBAL_NS_TAG, 4684 nvme_ctrlr->ana_log_page, 4685 ana_log_page_size, 0, 4686 nvme_ctrlr_read_ana_log_page_done, 4687 nvme_ctrlr); 4688 if (rc != 0) { 4689 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4690 } 4691 4692 return rc; 4693 } 4694 4695 static void 4696 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4697 { 4698 } 4699 4700 struct bdev_nvme_set_preferred_path_ctx { 4701 struct spdk_bdev_desc *desc; 4702 struct nvme_ns *nvme_ns; 4703 bdev_nvme_set_preferred_path_cb cb_fn; 4704 void *cb_arg; 4705 }; 4706 4707 static void 4708 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4709 { 4710 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4711 4712 assert(ctx != NULL); 4713 assert(ctx->desc != NULL); 4714 assert(ctx->cb_fn != NULL); 4715 4716 spdk_bdev_close(ctx->desc); 4717 4718 ctx->cb_fn(ctx->cb_arg, status); 4719 4720 free(ctx); 4721 } 4722 4723 static void 4724 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4725 { 4726 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4727 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4728 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4729 struct nvme_io_path *io_path, *prev; 4730 4731 prev = NULL; 4732 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4733 if (io_path->nvme_ns == ctx->nvme_ns) { 4734 break; 4735 } 4736 prev = io_path; 4737 } 4738 4739 if (io_path != NULL) { 4740 if (prev != NULL) { 4741 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4742 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4743 } 4744 4745 /* We can set io_path to nbdev_ch->current_io_path directly here. 4746 * However, it needs to be conditional. To simplify the code, 4747 * just clear nbdev_ch->current_io_path and let find_io_path() 4748 * fill it. 4749 * 4750 * Automatic failback may be disabled. Hence even if the io_path is 4751 * already at the head, clear nbdev_ch->current_io_path. 4752 */ 4753 bdev_nvme_clear_current_io_path(nbdev_ch); 4754 } 4755 4756 spdk_for_each_channel_continue(i, 0); 4757 } 4758 4759 static struct nvme_ns * 4760 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4761 { 4762 struct nvme_ns *nvme_ns, *prev; 4763 const struct spdk_nvme_ctrlr_data *cdata; 4764 4765 prev = NULL; 4766 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4767 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4768 4769 if (cdata->cntlid == cntlid) { 4770 break; 4771 } 4772 prev = nvme_ns; 4773 } 4774 4775 if (nvme_ns != NULL && prev != NULL) { 4776 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4777 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4778 } 4779 4780 return nvme_ns; 4781 } 4782 4783 /* This function supports only multipath mode. There is only a single I/O path 4784 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4785 * head of the I/O path list for each NVMe bdev channel. 
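 * (For reference, this is what the bdev_nvme_set_preferred_path RPC calls into; it
 * takes the bdev name and the cntlid of the controller whose path should become
 * preferred. The reordering matters mainly for the active_passive policy, where
 * find_io_path() returns the first usable path in list order.)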
4786 * 4787 * NVMe bdev channel may be acquired after completing this function. move the 4788 * matched namespace to the head of the namespace list for the NVMe bdev too. 4789 */ 4790 void 4791 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4792 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4793 { 4794 struct bdev_nvme_set_preferred_path_ctx *ctx; 4795 struct spdk_bdev *bdev; 4796 struct nvme_bdev *nbdev; 4797 int rc = 0; 4798 4799 assert(cb_fn != NULL); 4800 4801 ctx = calloc(1, sizeof(*ctx)); 4802 if (ctx == NULL) { 4803 SPDK_ERRLOG("Failed to alloc context.\n"); 4804 rc = -ENOMEM; 4805 goto err_alloc; 4806 } 4807 4808 ctx->cb_fn = cb_fn; 4809 ctx->cb_arg = cb_arg; 4810 4811 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4812 if (rc != 0) { 4813 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4814 goto err_open; 4815 } 4816 4817 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4818 4819 if (bdev->module != &nvme_if) { 4820 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4821 rc = -ENODEV; 4822 goto err_bdev; 4823 } 4824 4825 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4826 4827 pthread_mutex_lock(&nbdev->mutex); 4828 4829 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4830 if (ctx->nvme_ns == NULL) { 4831 pthread_mutex_unlock(&nbdev->mutex); 4832 4833 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4834 rc = -ENODEV; 4835 goto err_bdev; 4836 } 4837 4838 pthread_mutex_unlock(&nbdev->mutex); 4839 4840 spdk_for_each_channel(nbdev, 4841 _bdev_nvme_set_preferred_path, 4842 ctx, 4843 bdev_nvme_set_preferred_path_done); 4844 return; 4845 4846 err_bdev: 4847 spdk_bdev_close(ctx->desc); 4848 err_open: 4849 free(ctx); 4850 err_alloc: 4851 cb_fn(cb_arg, rc); 4852 } 4853 4854 struct bdev_nvme_set_multipath_policy_ctx { 4855 struct spdk_bdev_desc *desc; 4856 bdev_nvme_set_multipath_policy_cb cb_fn; 4857 void *cb_arg; 4858 }; 4859 4860 static void 4861 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4862 { 4863 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4864 4865 assert(ctx != NULL); 4866 assert(ctx->desc != NULL); 4867 assert(ctx->cb_fn != NULL); 4868 4869 spdk_bdev_close(ctx->desc); 4870 4871 ctx->cb_fn(ctx->cb_arg, status); 4872 4873 free(ctx); 4874 } 4875 4876 static void 4877 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4878 { 4879 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4880 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4881 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4882 4883 nbdev_ch->mp_policy = nbdev->mp_policy; 4884 nbdev_ch->mp_selector = nbdev->mp_selector; 4885 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4886 bdev_nvme_clear_current_io_path(nbdev_ch); 4887 4888 spdk_for_each_channel_continue(i, 0); 4889 } 4890 4891 void 4892 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 4893 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 4894 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 4895 { 4896 struct bdev_nvme_set_multipath_policy_ctx *ctx; 4897 struct spdk_bdev *bdev; 4898 struct nvme_bdev *nbdev; 4899 int rc; 4900 4901 assert(cb_fn != NULL); 4902 4903 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4904 if (rr_min_io == UINT32_MAX) { 4905 rr_min_io = 1; 4906 } else if (rr_min_io == 0) { 4907 rc = -EINVAL; 
4908 goto exit; 4909 } 4910 } else if (rr_min_io != UINT32_MAX) { 4911 rc = -EINVAL; 4912 goto exit; 4913 } 4914 4915 ctx = calloc(1, sizeof(*ctx)); 4916 if (ctx == NULL) { 4917 SPDK_ERRLOG("Failed to alloc context.\n"); 4918 rc = -ENOMEM; 4919 goto exit; 4920 } 4921 4922 ctx->cb_fn = cb_fn; 4923 ctx->cb_arg = cb_arg; 4924 4925 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4926 if (rc != 0) { 4927 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4928 rc = -ENODEV; 4929 goto err_open; 4930 } 4931 4932 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4933 if (bdev->module != &nvme_if) { 4934 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4935 rc = -ENODEV; 4936 goto err_module; 4937 } 4938 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4939 4940 pthread_mutex_lock(&nbdev->mutex); 4941 nbdev->mp_policy = policy; 4942 nbdev->mp_selector = selector; 4943 nbdev->rr_min_io = rr_min_io; 4944 pthread_mutex_unlock(&nbdev->mutex); 4945 4946 spdk_for_each_channel(nbdev, 4947 _bdev_nvme_set_multipath_policy, 4948 ctx, 4949 bdev_nvme_set_multipath_policy_done); 4950 return; 4951 4952 err_module: 4953 spdk_bdev_close(ctx->desc); 4954 err_open: 4955 free(ctx); 4956 exit: 4957 cb_fn(cb_arg, rc); 4958 } 4959 4960 static void 4961 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 4962 { 4963 struct nvme_ctrlr *nvme_ctrlr = arg; 4964 union spdk_nvme_async_event_completion event; 4965 4966 if (spdk_nvme_cpl_is_error(cpl)) { 4967 SPDK_WARNLOG("AER request execute failed\n"); 4968 return; 4969 } 4970 4971 event.raw = cpl->cdw0; 4972 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4973 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 4974 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 4975 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 4976 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 4977 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 4978 } 4979 } 4980 4981 static void 4982 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 4983 { 4984 if (ctx->cb_fn) { 4985 ctx->cb_fn(ctx->cb_ctx, count, rc); 4986 } 4987 4988 ctx->namespaces_populated = true; 4989 if (ctx->probe_done) { 4990 /* The probe was already completed, so we need to free the context 4991 * here. This can happen for cases like OCSSD, where we need to 4992 * send additional commands to the SSD after attach. 
4993 */ 4994 free(ctx); 4995 } 4996 } 4997 4998 static void 4999 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5000 struct nvme_async_probe_ctx *ctx) 5001 { 5002 spdk_io_device_register(nvme_ctrlr, 5003 bdev_nvme_create_ctrlr_channel_cb, 5004 bdev_nvme_destroy_ctrlr_channel_cb, 5005 sizeof(struct nvme_ctrlr_channel), 5006 nvme_ctrlr->nbdev_ctrlr->name); 5007 5008 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5009 } 5010 5011 static void 5012 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5013 { 5014 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5015 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5016 5017 nvme_ctrlr->probe_ctx = NULL; 5018 5019 if (spdk_nvme_cpl_is_error(cpl)) { 5020 nvme_ctrlr_delete(nvme_ctrlr); 5021 5022 if (ctx != NULL) { 5023 populate_namespaces_cb(ctx, 0, -1); 5024 } 5025 return; 5026 } 5027 5028 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5029 } 5030 5031 static int 5032 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5033 struct nvme_async_probe_ctx *ctx) 5034 { 5035 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5036 const struct spdk_nvme_ctrlr_data *cdata; 5037 uint32_t ana_log_page_size; 5038 5039 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5040 5041 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5042 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5043 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5044 sizeof(uint32_t); 5045 5046 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5047 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5048 if (nvme_ctrlr->ana_log_page == NULL) { 5049 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5050 return -ENXIO; 5051 } 5052 5053 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5054 * Hence copy each descriptor to a temporary area when parsing it. 5055 * 5056 * Allocate a buffer whose size is as large as ANA log page buffer because 5057 * we do not know the size of a descriptor until actually reading it. 5058 */ 5059 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5060 if (nvme_ctrlr->copied_ana_desc == NULL) { 5061 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5062 return -ENOMEM; 5063 } 5064 5065 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5066 5067 nvme_ctrlr->probe_ctx = ctx; 5068 5069 /* Then, set the read size only to include the current active namespaces. */ 5070 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5071 5072 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5073 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5074 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5075 return -EINVAL; 5076 } 5077 5078 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5079 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5080 SPDK_NVME_GLOBAL_NS_TAG, 5081 nvme_ctrlr->ana_log_page, 5082 ana_log_page_size, 0, 5083 nvme_ctrlr_init_ana_log_page_done, 5084 nvme_ctrlr); 5085 } 5086 5087 /* hostnqn and subnqn were already verified before attaching a controller. 5088 * Hence check only the multipath capability and cntlid here. 
5089 */ 5090 static bool 5091 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5092 { 5093 struct nvme_ctrlr *tmp; 5094 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5095 5096 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5097 5098 if (!cdata->cmic.multi_ctrlr) { 5099 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5100 return false; 5101 } 5102 5103 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5104 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5105 5106 if (!tmp_cdata->cmic.multi_ctrlr) { 5107 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5108 return false; 5109 } 5110 if (cdata->cntlid == tmp_cdata->cntlid) { 5111 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5112 return false; 5113 } 5114 } 5115 5116 return true; 5117 } 5118 5119 static int 5120 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5121 { 5122 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5123 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5124 int rc = 0; 5125 5126 pthread_mutex_lock(&g_bdev_nvme_mutex); 5127 5128 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5129 if (nbdev_ctrlr != NULL) { 5130 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5131 rc = -EINVAL; 5132 goto exit; 5133 } 5134 } else { 5135 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5136 if (nbdev_ctrlr == NULL) { 5137 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5138 rc = -ENOMEM; 5139 goto exit; 5140 } 5141 nbdev_ctrlr->name = strdup(name); 5142 if (nbdev_ctrlr->name == NULL) { 5143 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5144 free(nbdev_ctrlr); 5145 goto exit; 5146 } 5147 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5148 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5149 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5150 } 5151 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5152 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5153 exit: 5154 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5155 return rc; 5156 } 5157 5158 static int 5159 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5160 const char *name, 5161 const struct spdk_nvme_transport_id *trid, 5162 struct nvme_async_probe_ctx *ctx) 5163 { 5164 struct nvme_ctrlr *nvme_ctrlr; 5165 struct nvme_path_id *path_id; 5166 const struct spdk_nvme_ctrlr_data *cdata; 5167 int rc; 5168 5169 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5170 if (nvme_ctrlr == NULL) { 5171 SPDK_ERRLOG("Failed to allocate device struct\n"); 5172 return -ENOMEM; 5173 } 5174 5175 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5176 if (rc != 0) { 5177 free(nvme_ctrlr); 5178 return rc; 5179 } 5180 5181 TAILQ_INIT(&nvme_ctrlr->trids); 5182 5183 RB_INIT(&nvme_ctrlr->namespaces); 5184 5185 path_id = calloc(1, sizeof(*path_id)); 5186 if (path_id == NULL) { 5187 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5188 rc = -ENOMEM; 5189 goto err; 5190 } 5191 5192 path_id->trid = *trid; 5193 if (ctx != NULL) { 5194 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5195 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5196 } 5197 nvme_ctrlr->active_path_id = path_id; 5198 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5199 5200 nvme_ctrlr->thread = spdk_get_thread(); 5201 nvme_ctrlr->ctrlr = ctrlr; 5202 nvme_ctrlr->ref = 1; 5203 5204 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5205 SPDK_ERRLOG("OCSSDs are not supported"); 5206 rc = -ENOTSUP; 5207 goto err; 5208 } 5209 5210 
if (ctx != NULL) { 5211 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5212 } else { 5213 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5214 } 5215 5216 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5217 g_opts.nvme_adminq_poll_period_us); 5218 5219 if (g_opts.timeout_us > 0) { 5220 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5221 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5222 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5223 g_opts.timeout_us : g_opts.timeout_admin_us; 5224 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5225 adm_timeout_us, timeout_cb, nvme_ctrlr); 5226 } 5227 5228 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5229 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5230 5231 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5232 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5233 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5234 } 5235 5236 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5237 if (rc != 0) { 5238 goto err; 5239 } 5240 5241 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5242 5243 if (cdata->cmic.ana_reporting) { 5244 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5245 if (rc == 0) { 5246 return 0; 5247 } 5248 } else { 5249 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5250 return 0; 5251 } 5252 5253 err: 5254 nvme_ctrlr_delete(nvme_ctrlr); 5255 return rc; 5256 } 5257 5258 void 5259 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5260 { 5261 opts->prchk_flags = 0; 5262 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5263 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5264 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5265 } 5266 5267 static void 5268 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5269 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5270 { 5271 char *name; 5272 5273 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5274 if (!name) { 5275 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5276 return; 5277 } 5278 5279 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5280 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5281 } else { 5282 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5283 } 5284 5285 free(name); 5286 } 5287 5288 static void 5289 _nvme_ctrlr_destruct(void *ctx) 5290 { 5291 struct nvme_ctrlr *nvme_ctrlr = ctx; 5292 5293 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5294 nvme_ctrlr_release(nvme_ctrlr); 5295 } 5296 5297 static int 5298 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5299 { 5300 struct nvme_probe_skip_entry *entry; 5301 5302 /* The controller's destruction was already started */ 5303 if (nvme_ctrlr->destruct) { 5304 return -EALREADY; 5305 } 5306 5307 if (!hotplug && 5308 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5309 entry = calloc(1, sizeof(*entry)); 5310 if (!entry) { 5311 return -ENOMEM; 5312 } 5313 entry->trid = nvme_ctrlr->active_path_id->trid; 5314 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5315 } 5316 5317 nvme_ctrlr->destruct = true; 5318 return 0; 5319 } 5320 5321 static int 5322 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5323 { 5324 int rc; 5325 5326 pthread_mutex_lock(&nvme_ctrlr->mutex); 5327 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5328 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5329 5330 if (rc == 0) { 5331 _nvme_ctrlr_destruct(nvme_ctrlr); 5332 } else if (rc == -EALREADY) { 5333 rc = 0; 5334 } 5335 5336 return rc; 5337 } 5338 5339 static void 5340 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5341 { 5342 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5343 5344 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5345 } 5346 5347 static int 5348 bdev_nvme_hotplug_probe(void *arg) 5349 { 5350 if (g_hotplug_probe_ctx == NULL) { 5351 spdk_poller_unregister(&g_hotplug_probe_poller); 5352 return SPDK_POLLER_IDLE; 5353 } 5354 5355 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5356 g_hotplug_probe_ctx = NULL; 5357 spdk_poller_unregister(&g_hotplug_probe_poller); 5358 } 5359 5360 return SPDK_POLLER_BUSY; 5361 } 5362 5363 static int 5364 bdev_nvme_hotplug(void *arg) 5365 { 5366 struct spdk_nvme_transport_id trid_pcie; 5367 5368 if (g_hotplug_probe_ctx) { 5369 return SPDK_POLLER_BUSY; 5370 } 5371 5372 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5373 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5374 5375 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5376 hotplug_probe_cb, attach_cb, NULL); 5377 5378 if (g_hotplug_probe_ctx) { 5379 assert(g_hotplug_probe_poller == NULL); 5380 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5381 } 5382 5383 return SPDK_POLLER_BUSY; 5384 } 5385 5386 void 5387 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5388 { 5389 *opts = g_opts; 5390 } 5391 5392 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5393 uint32_t reconnect_delay_sec, 5394 uint32_t fast_io_fail_timeout_sec); 5395 5396 static int 5397 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5398 { 5399 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5400 /* Can't set timeout_admin_us without also setting timeout_us */ 5401 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5402 return -EINVAL; 5403 } 5404 5405 if (opts->bdev_retry_count < -1) { 5406 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5407 return -EINVAL; 5408 } 5409 5410 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5411 opts->reconnect_delay_sec, 5412 opts->fast_io_fail_timeout_sec)) { 5413 return -EINVAL; 5414 } 5415 5416 return 0; 5417 } 5418 5419 int 5420 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5421 { 5422 int ret; 5423 5424 ret = bdev_nvme_validate_opts(opts); 5425 if (ret) { 5426 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5427 return ret; 5428 } 5429 5430 if (g_bdev_nvme_init_thread != NULL) { 5431 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5432 return -EPERM; 5433 } 5434 } 5435 5436 if (opts->rdma_srq_size != 0) { 5437 struct spdk_nvme_transport_opts drv_opts; 5438 5439 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5440 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5441 5442 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5443 if (ret) { 5444 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5445 return ret; 5446 } 5447 } 5448 5449 g_opts = *opts; 5450 5451 return 0; 5452 } 5453 5454 struct set_nvme_hotplug_ctx { 5455 uint64_t period_us; 5456 bool enabled; 5457 spdk_msg_fn fn; 5458 void *fn_ctx; 5459 }; 5460 5461 static void 5462 set_nvme_hotplug_period_cb(void *_ctx) 5463 { 5464 struct set_nvme_hotplug_ctx *ctx 
= _ctx; 5465 5466 spdk_poller_unregister(&g_hotplug_poller); 5467 if (ctx->enabled) { 5468 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5469 } 5470 5471 g_nvme_hotplug_poll_period_us = ctx->period_us; 5472 g_nvme_hotplug_enabled = ctx->enabled; 5473 if (ctx->fn) { 5474 ctx->fn(ctx->fn_ctx); 5475 } 5476 5477 free(ctx); 5478 } 5479 5480 int 5481 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5482 { 5483 struct set_nvme_hotplug_ctx *ctx; 5484 5485 if (enabled == true && !spdk_process_is_primary()) { 5486 return -EPERM; 5487 } 5488 5489 ctx = calloc(1, sizeof(*ctx)); 5490 if (ctx == NULL) { 5491 return -ENOMEM; 5492 } 5493 5494 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5495 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5496 ctx->enabled = enabled; 5497 ctx->fn = cb; 5498 ctx->fn_ctx = cb_ctx; 5499 5500 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5501 return 0; 5502 } 5503 5504 static void 5505 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5506 struct nvme_async_probe_ctx *ctx) 5507 { 5508 struct nvme_ns *nvme_ns; 5509 struct nvme_bdev *nvme_bdev; 5510 size_t j; 5511 5512 assert(nvme_ctrlr != NULL); 5513 5514 if (ctx->names == NULL) { 5515 populate_namespaces_cb(ctx, 0, 0); 5516 return; 5517 } 5518 5519 /* 5520 * Report the new bdevs that were created in this call. 5521 * There can be more than one bdev per NVMe controller. 5522 */ 5523 j = 0; 5524 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5525 while (nvme_ns != NULL) { 5526 nvme_bdev = nvme_ns->bdev; 5527 if (j < ctx->count) { 5528 ctx->names[j] = nvme_bdev->disk.name; 5529 j++; 5530 } else { 5531 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5532 ctx->count); 5533 populate_namespaces_cb(ctx, 0, -ERANGE); 5534 return; 5535 } 5536 5537 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5538 } 5539 5540 populate_namespaces_cb(ctx, j, 0); 5541 } 5542 5543 static int 5544 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5545 struct spdk_nvme_ctrlr *new_ctrlr, 5546 struct spdk_nvme_transport_id *trid) 5547 { 5548 struct nvme_path_id *tmp_trid; 5549 5550 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5551 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5552 return -ENOTSUP; 5553 } 5554 5555 /* Currently we only support failover to the same transport type. */ 5556 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5557 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5558 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5559 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5560 return -EINVAL; 5561 } 5562 5563 5564 /* Currently we only support failover to the same NQN. */ 5565 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5566 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5567 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5568 return -EINVAL; 5569 } 5570 5571 /* Skip all the other checks if we've already registered this path. 
 */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr,
				     trid->subnqn);
			return -EEXIST;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid so that it is not replaced until it actually fails. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is considered failed if its last failed time (last_failed_tsc) is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This handles the case where a secondary path is added to an existing
 * nvme_ctrlr for failover. After verifying that it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
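 *
 * (For example, this is the path taken when bdev_nvme_attach_controller is invoked
 * again with the same controller name but a different traddr in failover mode: the
 * new trid is queued on nvme_ctrlr->trids and is only connected once the currently
 * active path fails.)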
5645 */ 5646 static int 5647 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5648 struct spdk_nvme_ctrlr *new_ctrlr, 5649 struct spdk_nvme_transport_id *trid) 5650 { 5651 int rc; 5652 5653 assert(nvme_ctrlr != NULL); 5654 5655 pthread_mutex_lock(&nvme_ctrlr->mutex); 5656 5657 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5658 if (rc != 0) { 5659 goto exit; 5660 } 5661 5662 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5663 if (rc != 0) { 5664 goto exit; 5665 } 5666 5667 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5668 5669 exit: 5670 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5671 5672 spdk_nvme_detach(new_ctrlr); 5673 5674 return rc; 5675 } 5676 5677 static void 5678 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5679 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5680 { 5681 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5682 struct nvme_async_probe_ctx *ctx; 5683 int rc; 5684 5685 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5686 ctx->ctrlr_attached = true; 5687 5688 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5689 if (rc != 0) { 5690 populate_namespaces_cb(ctx, 0, rc); 5691 } 5692 } 5693 5694 static void 5695 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5696 struct spdk_nvme_ctrlr *ctrlr, 5697 const struct spdk_nvme_ctrlr_opts *opts) 5698 { 5699 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5700 struct nvme_ctrlr *nvme_ctrlr; 5701 struct nvme_async_probe_ctx *ctx; 5702 int rc; 5703 5704 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5705 ctx->ctrlr_attached = true; 5706 5707 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5708 if (nvme_ctrlr) { 5709 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5710 } else { 5711 rc = -ENODEV; 5712 } 5713 5714 populate_namespaces_cb(ctx, 0, rc); 5715 } 5716 5717 static int 5718 bdev_nvme_async_poll(void *arg) 5719 { 5720 struct nvme_async_probe_ctx *ctx = arg; 5721 int rc; 5722 5723 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5724 if (spdk_unlikely(rc != -EAGAIN)) { 5725 ctx->probe_done = true; 5726 spdk_poller_unregister(&ctx->poller); 5727 if (!ctx->ctrlr_attached) { 5728 /* The probe is done, but no controller was attached. 5729 * That means we had a failure, so report -EIO back to 5730 * the caller (usually the RPC). populate_namespaces_cb() 5731 * will take care of freeing the nvme_async_probe_ctx. 5732 */ 5733 populate_namespaces_cb(ctx, 0, -EIO); 5734 } else if (ctx->namespaces_populated) { 5735 /* The namespaces for the attached controller were all 5736 * populated and the response was already sent to the 5737 * caller (usually the RPC). So free the context here. 
5738 */ 5739 free(ctx); 5740 } 5741 } 5742 5743 return SPDK_POLLER_BUSY; 5744 } 5745 5746 static bool 5747 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5748 uint32_t reconnect_delay_sec, 5749 uint32_t fast_io_fail_timeout_sec) 5750 { 5751 if (ctrlr_loss_timeout_sec < -1) { 5752 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5753 return false; 5754 } else if (ctrlr_loss_timeout_sec == -1) { 5755 if (reconnect_delay_sec == 0) { 5756 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5757 return false; 5758 } else if (fast_io_fail_timeout_sec != 0 && 5759 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5760 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 5761 return false; 5762 } 5763 } else if (ctrlr_loss_timeout_sec != 0) { 5764 if (reconnect_delay_sec == 0) { 5765 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5766 return false; 5767 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5768 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5769 return false; 5770 } else if (fast_io_fail_timeout_sec != 0) { 5771 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5772 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5773 return false; 5774 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5775 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5776 return false; 5777 } 5778 } 5779 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5780 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5781 return false; 5782 } 5783 5784 return true; 5785 } 5786 5787 int 5788 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5789 const char *base_name, 5790 const char **names, 5791 uint32_t count, 5792 spdk_bdev_create_nvme_fn cb_fn, 5793 void *cb_ctx, 5794 struct spdk_nvme_ctrlr_opts *drv_opts, 5795 struct nvme_ctrlr_opts *bdev_opts, 5796 bool multipath) 5797 { 5798 struct nvme_probe_skip_entry *entry, *tmp; 5799 struct nvme_async_probe_ctx *ctx; 5800 spdk_nvme_attach_cb attach_cb; 5801 5802 /* TODO expand this check to include both the host and target TRIDs. 5803 * Only if both are the same should we fail. 
5804 */ 5805 if (nvme_ctrlr_get(trid) != NULL) { 5806 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5807 return -EEXIST; 5808 } 5809 5810 if (bdev_opts != NULL && 5811 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5812 bdev_opts->reconnect_delay_sec, 5813 bdev_opts->fast_io_fail_timeout_sec)) { 5814 return -EINVAL; 5815 } 5816 5817 ctx = calloc(1, sizeof(*ctx)); 5818 if (!ctx) { 5819 return -ENOMEM; 5820 } 5821 ctx->base_name = base_name; 5822 ctx->names = names; 5823 ctx->count = count; 5824 ctx->cb_fn = cb_fn; 5825 ctx->cb_ctx = cb_ctx; 5826 ctx->trid = *trid; 5827 5828 if (bdev_opts) { 5829 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5830 } else { 5831 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5832 } 5833 5834 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5835 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5836 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5837 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5838 free(entry); 5839 break; 5840 } 5841 } 5842 } 5843 5844 if (drv_opts) { 5845 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5846 } else { 5847 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5848 } 5849 5850 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5851 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5852 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5853 ctx->drv_opts.disable_read_ana_log_page = true; 5854 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5855 5856 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5857 attach_cb = connect_attach_cb; 5858 } else { 5859 attach_cb = connect_set_failover_cb; 5860 } 5861 5862 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5863 if (ctx->probe_ctx == NULL) { 5864 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5865 free(ctx); 5866 return -ENODEV; 5867 } 5868 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5869 5870 return 0; 5871 } 5872 5873 static bool 5874 nvme_path_should_delete(struct nvme_path_id *p, const struct nvme_path_id *path_id) 5875 { 5876 if (path_id->trid.trtype != 0) { 5877 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 5878 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 5879 return false; 5880 } 5881 } else { 5882 if (path_id->trid.trtype != p->trid.trtype) { 5883 return false; 5884 } 5885 } 5886 } 5887 5888 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 5889 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 5890 return false; 5891 } 5892 } 5893 5894 if (path_id->trid.adrfam != 0) { 5895 if (path_id->trid.adrfam != p->trid.adrfam) { 5896 return false; 5897 } 5898 } 5899 5900 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 5901 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 5902 return false; 5903 } 5904 } 5905 5906 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 5907 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 5908 return false; 5909 } 5910 } 5911 5912 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 5913 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 5914 return false; 5915 } 5916 } 5917 5918 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, 
sizeof(path_id->hostid.hostsvcid))) { 5919 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 5920 return false; 5921 } 5922 } 5923 5924 return true; 5925 } 5926 5927 static int 5928 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 5929 { 5930 struct nvme_path_id *p, *t; 5931 spdk_msg_fn msg_fn; 5932 int rc = -ENXIO; 5933 5934 pthread_mutex_lock(&nvme_ctrlr->mutex); 5935 5936 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 5937 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 5938 break; 5939 } 5940 5941 if (!nvme_path_should_delete(p, path_id)) { 5942 continue; 5943 } 5944 5945 /* We are not using the specified path. */ 5946 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 5947 free(p); 5948 rc = 0; 5949 } 5950 5951 if (p == NULL || !nvme_path_should_delete(p, path_id)) { 5952 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5953 return rc; 5954 } 5955 5956 /* If we made it here, then this path is a match! Now we need to remove it. */ 5957 5958 /* This is the active path in use right now. The active path is always the first in the list. */ 5959 assert(p == nvme_ctrlr->active_path_id); 5960 5961 if (!TAILQ_NEXT(p, link)) { 5962 /* The current path is the only path. */ 5963 msg_fn = _nvme_ctrlr_destruct; 5964 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 5965 } else { 5966 /* There is an alternative path. */ 5967 msg_fn = _bdev_nvme_reset_ctrlr; 5968 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 5969 } 5970 5971 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5972 5973 if (rc == 0) { 5974 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 5975 } else if (rc == -EALREADY) { 5976 rc = 0; 5977 } 5978 5979 return rc; 5980 } 5981 5982 int 5983 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id) 5984 { 5985 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5986 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 5987 int rc = -ENXIO, _rc; 5988 5989 if (name == NULL || path_id == NULL) { 5990 return -EINVAL; 5991 } 5992 5993 pthread_mutex_lock(&g_bdev_nvme_mutex); 5994 5995 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5996 if (nbdev_ctrlr == NULL) { 5997 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5998 5999 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6000 return -ENODEV; 6001 } 6002 6003 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6004 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6005 if (_rc < 0 && _rc != -ENXIO) { 6006 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6007 6008 return _rc; 6009 } else if (_rc == 0) { 6010 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6011 * was deleted successfully. To remember the successful deletion, 6012 * overwrite rc only if _rc is zero. 6013 */ 6014 rc = 0; 6015 } 6016 } 6017 6018 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6019 6020 /* All nvme_ctrlrs were deleted or no nvme_ctrlr which had the trid was found. */ 6021 return rc; 6022 } 6023 6024 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6025 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6026 6027 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6028 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6029 6030 struct discovery_entry_ctx { 6031 char name[128]; 6032 struct spdk_nvme_transport_id trid; 6033 struct spdk_nvme_ctrlr_opts drv_opts; 6034 struct spdk_nvmf_discovery_log_page_entry entry; 6035 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6036 struct discovery_ctx *ctx; 6037 }; 6038 6039 struct discovery_ctx { 6040 char *name; 6041 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6042 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6043 void *cb_ctx; 6044 struct spdk_nvme_probe_ctx *probe_ctx; 6045 struct spdk_nvme_detach_ctx *detach_ctx; 6046 struct spdk_nvme_ctrlr *ctrlr; 6047 struct spdk_nvme_transport_id trid; 6048 struct discovery_entry_ctx *entry_ctx_in_use; 6049 struct spdk_poller *poller; 6050 struct spdk_nvme_ctrlr_opts drv_opts; 6051 struct nvme_ctrlr_opts bdev_opts; 6052 struct spdk_nvmf_discovery_log_page *log_page; 6053 TAILQ_ENTRY(discovery_ctx) tailq; 6054 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6055 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6056 int rc; 6057 bool wait_for_attach; 6058 uint64_t timeout_ticks; 6059 /* Denotes that the discovery service is being started. We're waiting 6060 * for the initial connection to the discovery controller to be 6061 * established and attach discovered NVM ctrlrs. 6062 */ 6063 bool initializing; 6064 /* Denotes if a discovery is currently in progress for this context. 6065 * That includes connecting to newly discovered subsystems. Used to 6066 * ensure we do not start a new discovery until an existing one is 6067 * complete. 6068 */ 6069 bool in_progress; 6070 6071 /* Denotes if another discovery is needed after the one in progress 6072 * completes. Set when we receive an AER completion while a discovery 6073 * is already in progress. 6074 */ 6075 bool pending; 6076 6077 /* Signal to the discovery context poller that it should stop the 6078 * discovery service, including detaching from the current discovery 6079 * controller. 6080 */ 6081 bool stop; 6082 6083 struct spdk_thread *calling_thread; 6084 uint32_t index; 6085 uint32_t attach_in_progress; 6086 char *hostnqn; 6087 6088 /* Denotes if the discovery service was started by the mdns discovery. 
6089 */ 6090 bool from_mdns_discovery_service; 6091 }; 6092 6093 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6094 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6095 6096 static void get_discovery_log_page(struct discovery_ctx *ctx); 6097 6098 static void 6099 free_discovery_ctx(struct discovery_ctx *ctx) 6100 { 6101 free(ctx->log_page); 6102 free(ctx->hostnqn); 6103 free(ctx->name); 6104 free(ctx); 6105 } 6106 6107 static void 6108 discovery_complete(struct discovery_ctx *ctx) 6109 { 6110 ctx->initializing = false; 6111 ctx->in_progress = false; 6112 if (ctx->pending) { 6113 ctx->pending = false; 6114 get_discovery_log_page(ctx); 6115 } 6116 } 6117 6118 static void 6119 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6120 struct spdk_nvmf_discovery_log_page_entry *entry) 6121 { 6122 char *space; 6123 6124 trid->trtype = entry->trtype; 6125 trid->adrfam = entry->adrfam; 6126 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6127 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6128 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6129 * before call to this function trid->subnqn is zeroed out, we need 6130 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6131 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6132 */ 6133 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6134 6135 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6136 * But the log page entries typically pad them with spaces, not zeroes. 6137 * So add a NULL terminator to each of these fields at the appropriate 6138 * location. 6139 */ 6140 space = strchr(trid->traddr, ' '); 6141 if (space) { 6142 *space = 0; 6143 } 6144 space = strchr(trid->trsvcid, ' '); 6145 if (space) { 6146 *space = 0; 6147 } 6148 space = strchr(trid->subnqn, ' '); 6149 if (space) { 6150 *space = 0; 6151 } 6152 } 6153 6154 static void 6155 _stop_discovery(void *_ctx) 6156 { 6157 struct discovery_ctx *ctx = _ctx; 6158 6159 if (ctx->attach_in_progress > 0) { 6160 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6161 return; 6162 } 6163 6164 ctx->stop = true; 6165 6166 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6167 struct discovery_entry_ctx *entry_ctx; 6168 struct nvme_path_id path = {}; 6169 6170 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6171 path.trid = entry_ctx->trid; 6172 bdev_nvme_delete(entry_ctx->name, &path); 6173 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6174 free(entry_ctx); 6175 } 6176 6177 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6178 struct discovery_entry_ctx *entry_ctx; 6179 6180 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6181 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6182 free(entry_ctx); 6183 } 6184 6185 free(ctx->entry_ctx_in_use); 6186 ctx->entry_ctx_in_use = NULL; 6187 } 6188 6189 static void 6190 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6191 { 6192 ctx->stop_cb_fn = cb_fn; 6193 ctx->cb_ctx = cb_ctx; 6194 6195 if (ctx->attach_in_progress > 0) { 6196 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6197 ctx->attach_in_progress); 6198 } 6199 6200 _stop_discovery(ctx); 6201 } 6202 6203 static void 6204 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6205 { 6206 struct discovery_ctx *d_ctx; 6207 struct nvme_path_id *path_id; 6208 struct spdk_nvme_transport_id trid = {}; 
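/* A controller that was attached from a discovery log page entry is going away: match its first (active) path against every discovery context's NVM entries, drop the matching entry, and fail the discovery controller so the subsequent reconnect re-reads the log page. */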
6209 struct discovery_entry_ctx *entry_ctx, *tmp; 6210 6211 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6212 6213 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6214 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6215 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6216 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6217 continue; 6218 } 6219 6220 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6221 free(entry_ctx); 6222 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6223 trid.subnqn, trid.traddr, trid.trsvcid); 6224 6225 /* Fail discovery ctrlr to force reattach attempt */ 6226 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6227 } 6228 } 6229 } 6230 6231 static void 6232 discovery_remove_controllers(struct discovery_ctx *ctx) 6233 { 6234 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6235 struct discovery_entry_ctx *entry_ctx, *tmp; 6236 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6237 struct spdk_nvme_transport_id old_trid = {}; 6238 uint64_t numrec, i; 6239 bool found; 6240 6241 numrec = from_le64(&log_page->numrec); 6242 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6243 found = false; 6244 old_entry = &entry_ctx->entry; 6245 build_trid_from_log_page_entry(&old_trid, old_entry); 6246 for (i = 0; i < numrec; i++) { 6247 new_entry = &log_page->entries[i]; 6248 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6249 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6250 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6251 found = true; 6252 break; 6253 } 6254 } 6255 if (!found) { 6256 struct nvme_path_id path = {}; 6257 6258 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6259 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6260 6261 path.trid = entry_ctx->trid; 6262 bdev_nvme_delete(entry_ctx->name, &path); 6263 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6264 free(entry_ctx); 6265 } 6266 } 6267 free(log_page); 6268 ctx->log_page = NULL; 6269 discovery_complete(ctx); 6270 } 6271 6272 static void 6273 complete_discovery_start(struct discovery_ctx *ctx, int status) 6274 { 6275 ctx->timeout_ticks = 0; 6276 ctx->rc = status; 6277 if (ctx->start_cb_fn) { 6278 ctx->start_cb_fn(ctx->cb_ctx, status); 6279 ctx->start_cb_fn = NULL; 6280 ctx->cb_ctx = NULL; 6281 } 6282 } 6283 6284 static void 6285 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6286 { 6287 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6288 struct discovery_ctx *ctx = entry_ctx->ctx; 6289 6290 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6291 ctx->attach_in_progress--; 6292 if (ctx->attach_in_progress == 0) { 6293 complete_discovery_start(ctx, ctx->rc); 6294 if (ctx->initializing && ctx->rc != 0) { 6295 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6296 stop_discovery(ctx, NULL, ctx->cb_ctx); 6297 } else { 6298 discovery_remove_controllers(ctx); 6299 } 6300 } 6301 } 6302 6303 static struct discovery_entry_ctx * 6304 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6305 { 6306 struct discovery_entry_ctx *new_ctx; 6307 6308 new_ctx = calloc(1, sizeof(*new_ctx)); 6309 if (new_ctx == NULL) { 6310 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6311 return NULL; 6312 } 6313 6314 new_ctx->ctx = ctx; 6315 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6316 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6317 
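/* Each entry keeps its own copy of the controller options; inherit the discovery service's hostnqn so that controllers created from this entry connect with the same host identity. */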
snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6318 return new_ctx; 6319 } 6320 6321 static void 6322 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6323 struct spdk_nvmf_discovery_log_page *log_page) 6324 { 6325 struct discovery_ctx *ctx = cb_arg; 6326 struct discovery_entry_ctx *entry_ctx, *tmp; 6327 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6328 uint64_t numrec, i; 6329 bool found; 6330 6331 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6332 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6333 return; 6334 } 6335 6336 ctx->log_page = log_page; 6337 assert(ctx->attach_in_progress == 0); 6338 numrec = from_le64(&log_page->numrec); 6339 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6340 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6341 free(entry_ctx); 6342 } 6343 for (i = 0; i < numrec; i++) { 6344 found = false; 6345 new_entry = &log_page->entries[i]; 6346 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6347 struct discovery_entry_ctx *new_ctx; 6348 struct spdk_nvme_transport_id trid = {}; 6349 6350 build_trid_from_log_page_entry(&trid, new_entry); 6351 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6352 if (new_ctx == NULL) { 6353 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6354 break; 6355 } 6356 6357 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6358 continue; 6359 } 6360 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6361 old_entry = &entry_ctx->entry; 6362 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6363 found = true; 6364 break; 6365 } 6366 } 6367 if (!found) { 6368 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6369 struct discovery_ctx *d_ctx; 6370 6371 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6372 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6373 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6374 sizeof(new_entry->subnqn))) { 6375 break; 6376 } 6377 } 6378 if (subnqn_ctx) { 6379 break; 6380 } 6381 } 6382 6383 new_ctx = calloc(1, sizeof(*new_ctx)); 6384 if (new_ctx == NULL) { 6385 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6386 break; 6387 } 6388 6389 new_ctx->ctx = ctx; 6390 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6391 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6392 if (subnqn_ctx) { 6393 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6394 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6395 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6396 new_ctx->name); 6397 } else { 6398 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6399 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6400 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6401 new_ctx->name); 6402 } 6403 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6404 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6405 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6406 discovery_attach_controller_done, new_ctx, 6407 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6408 if (rc == 0) { 6409 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6410 ctx->attach_in_progress++; 6411 } else { 6412 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 6413 } 6414 } 6415 } 6416 6417 if 
(ctx->attach_in_progress == 0) { 6418 discovery_remove_controllers(ctx); 6419 } 6420 } 6421 6422 static void 6423 get_discovery_log_page(struct discovery_ctx *ctx) 6424 { 6425 int rc; 6426 6427 assert(ctx->in_progress == false); 6428 ctx->in_progress = true; 6429 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6430 if (rc != 0) { 6431 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6432 } 6433 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6434 } 6435 6436 static void 6437 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6438 { 6439 struct discovery_ctx *ctx = arg; 6440 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6441 6442 if (spdk_nvme_cpl_is_error(cpl)) { 6443 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6444 return; 6445 } 6446 6447 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6448 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6449 return; 6450 } 6451 6452 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6453 if (ctx->in_progress) { 6454 ctx->pending = true; 6455 return; 6456 } 6457 6458 get_discovery_log_page(ctx); 6459 } 6460 6461 static void 6462 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6463 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6464 { 6465 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6466 struct discovery_ctx *ctx; 6467 6468 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6469 6470 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6471 ctx->probe_ctx = NULL; 6472 ctx->ctrlr = ctrlr; 6473 6474 if (ctx->rc != 0) { 6475 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6476 ctx->rc); 6477 return; 6478 } 6479 6480 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6481 } 6482 6483 static int 6484 discovery_poller(void *arg) 6485 { 6486 struct discovery_ctx *ctx = arg; 6487 struct spdk_nvme_transport_id *trid; 6488 int rc; 6489 6490 if (ctx->detach_ctx) { 6491 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6492 if (rc != -EAGAIN) { 6493 ctx->detach_ctx = NULL; 6494 ctx->ctrlr = NULL; 6495 } 6496 } else if (ctx->stop) { 6497 if (ctx->ctrlr != NULL) { 6498 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6499 if (rc == 0) { 6500 return SPDK_POLLER_BUSY; 6501 } 6502 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6503 } 6504 spdk_poller_unregister(&ctx->poller); 6505 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6506 assert(ctx->start_cb_fn == NULL); 6507 if (ctx->stop_cb_fn != NULL) { 6508 ctx->stop_cb_fn(ctx->cb_ctx); 6509 } 6510 free_discovery_ctx(ctx); 6511 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6512 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6513 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6514 assert(ctx->initializing); 6515 spdk_poller_unregister(&ctx->poller); 6516 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6517 complete_discovery_start(ctx, -ETIMEDOUT); 6518 stop_discovery(ctx, NULL, NULL); 6519 free_discovery_ctx(ctx); 6520 return SPDK_POLLER_BUSY; 6521 } 6522 6523 assert(ctx->entry_ctx_in_use == NULL); 6524 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6525 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6526 trid = &ctx->entry_ctx_in_use->trid; 6527 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6528 if (ctx->probe_ctx) { 6529 spdk_poller_unregister(&ctx->poller); 
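/* The connect to the discovery controller is now in flight; poll the probe context every 1000 us (1 ms) instead of the 1000 * 1000 us (1 s) steady-state period so the attach completes promptly. */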
6530 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6531 } else { 6532 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6533 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6534 ctx->entry_ctx_in_use = NULL; 6535 } 6536 } else if (ctx->probe_ctx) { 6537 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6538 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6539 complete_discovery_start(ctx, -ETIMEDOUT); 6540 return SPDK_POLLER_BUSY; 6541 } 6542 6543 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6544 if (rc != -EAGAIN) { 6545 if (ctx->rc != 0) { 6546 assert(ctx->initializing); 6547 stop_discovery(ctx, NULL, ctx->cb_ctx); 6548 } else { 6549 assert(rc == 0); 6550 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6551 ctx->rc = rc; 6552 get_discovery_log_page(ctx); 6553 } 6554 } 6555 } else { 6556 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6557 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6558 complete_discovery_start(ctx, -ETIMEDOUT); 6559 /* We need to wait until all NVM ctrlrs are attached before we stop the 6560 * discovery service to make sure we don't detach a ctrlr that is still 6561 * being attached. 6562 */ 6563 if (ctx->attach_in_progress == 0) { 6564 stop_discovery(ctx, NULL, ctx->cb_ctx); 6565 return SPDK_POLLER_BUSY; 6566 } 6567 } 6568 6569 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6570 if (rc < 0) { 6571 spdk_poller_unregister(&ctx->poller); 6572 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6573 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6574 ctx->entry_ctx_in_use = NULL; 6575 6576 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6577 if (rc != 0) { 6578 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6579 ctx->ctrlr = NULL; 6580 } 6581 } 6582 } 6583 6584 return SPDK_POLLER_BUSY; 6585 } 6586 6587 static void 6588 start_discovery_poller(void *arg) 6589 { 6590 struct discovery_ctx *ctx = arg; 6591 6592 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6593 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6594 } 6595 6596 int 6597 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6598 const char *base_name, 6599 struct spdk_nvme_ctrlr_opts *drv_opts, 6600 struct nvme_ctrlr_opts *bdev_opts, 6601 uint64_t attach_timeout, 6602 bool from_mdns, 6603 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6604 { 6605 struct discovery_ctx *ctx; 6606 struct discovery_entry_ctx *discovery_entry_ctx; 6607 6608 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6609 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6610 if (strcmp(ctx->name, base_name) == 0) { 6611 return -EEXIST; 6612 } 6613 6614 if (ctx->entry_ctx_in_use != NULL) { 6615 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6616 return -EEXIST; 6617 } 6618 } 6619 6620 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6621 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6622 return -EEXIST; 6623 } 6624 } 6625 } 6626 6627 ctx = calloc(1, sizeof(*ctx)); 6628 if (ctx == NULL) { 6629 return -ENOMEM; 6630 } 6631 6632 ctx->name = strdup(base_name); 6633 if (ctx->name == NULL) { 6634 free_discovery_ctx(ctx); 6635 return -ENOMEM; 6636 } 6637 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6638 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 
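/* Keep private copies of both option structures so the discovery context does not depend on the caller's storage after this call returns. */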
6639 ctx->from_mdns_discovery_service = from_mdns; 6640 ctx->bdev_opts.from_discovery_service = true; 6641 ctx->calling_thread = spdk_get_thread(); 6642 ctx->start_cb_fn = cb_fn; 6643 ctx->cb_ctx = cb_ctx; 6644 ctx->initializing = true; 6645 if (ctx->start_cb_fn) { 6646 /* We can use this when dumping json to denote if this RPC parameter 6647 * was specified or not. 6648 */ 6649 ctx->wait_for_attach = true; 6650 } 6651 if (attach_timeout != 0) { 6652 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6653 spdk_get_ticks_hz() / 1000ull; 6654 } 6655 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6656 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6657 memcpy(&ctx->trid, trid, sizeof(*trid)); 6658 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6659 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6660 if (ctx->hostnqn == NULL) { 6661 free_discovery_ctx(ctx); 6662 return -ENOMEM; 6663 } 6664 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6665 if (discovery_entry_ctx == NULL) { 6666 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6667 free_discovery_ctx(ctx); 6668 return -ENOMEM; 6669 } 6670 6671 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6672 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6673 return 0; 6674 } 6675 6676 int 6677 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6678 { 6679 struct discovery_ctx *ctx; 6680 6681 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6682 if (strcmp(name, ctx->name) == 0) { 6683 if (ctx->stop) { 6684 return -EALREADY; 6685 } 6686 /* If we're still starting the discovery service and ->rc is non-zero, we're 6687 * going to stop it as soon as we can 6688 */ 6689 if (ctx->initializing && ctx->rc != 0) { 6690 return -EALREADY; 6691 } 6692 stop_discovery(ctx, cb_fn, cb_ctx); 6693 return 0; 6694 } 6695 } 6696 6697 return -ENOENT; 6698 } 6699 6700 static int 6701 bdev_nvme_library_init(void) 6702 { 6703 g_bdev_nvme_init_thread = spdk_get_thread(); 6704 6705 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6706 bdev_nvme_destroy_poll_group_cb, 6707 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6708 6709 return 0; 6710 } 6711 6712 static void 6713 bdev_nvme_fini_destruct_ctrlrs(void) 6714 { 6715 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6716 struct nvme_ctrlr *nvme_ctrlr; 6717 6718 pthread_mutex_lock(&g_bdev_nvme_mutex); 6719 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6720 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6721 pthread_mutex_lock(&nvme_ctrlr->mutex); 6722 if (nvme_ctrlr->destruct) { 6723 /* This controller's destruction was already started 6724 * before the application started shutting down 6725 */ 6726 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6727 continue; 6728 } 6729 nvme_ctrlr->destruct = true; 6730 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6731 6732 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6733 nvme_ctrlr); 6734 } 6735 } 6736 6737 g_bdev_nvme_module_finish = true; 6738 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6739 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6740 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6741 spdk_bdev_module_fini_done(); 6742 return; 6743 } 6744 6745 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6746 } 6747 6748 static void 6749 check_discovery_fini(void *arg) 6750 { 6751 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6752 bdev_nvme_fini_destruct_ctrlrs(); 6753 } 6754 } 6755 6756 static void 6757 
bdev_nvme_library_fini(void) 6758 { 6759 struct nvme_probe_skip_entry *entry, *entry_tmp; 6760 struct discovery_ctx *ctx; 6761 6762 spdk_poller_unregister(&g_hotplug_poller); 6763 free(g_hotplug_probe_ctx); 6764 g_hotplug_probe_ctx = NULL; 6765 6766 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6767 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6768 free(entry); 6769 } 6770 6771 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 6772 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6773 bdev_nvme_fini_destruct_ctrlrs(); 6774 } else { 6775 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6776 stop_discovery(ctx, check_discovery_fini, NULL); 6777 } 6778 } 6779 } 6780 6781 static void 6782 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 6783 { 6784 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6785 struct spdk_bdev *bdev = bdev_io->bdev; 6786 struct spdk_dif_ctx dif_ctx; 6787 struct spdk_dif_error err_blk = {}; 6788 int rc; 6789 struct spdk_dif_ctx_init_ext_opts dif_opts; 6790 6791 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 6792 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 6793 rc = spdk_dif_ctx_init(&dif_ctx, 6794 bdev->blocklen, bdev->md_len, bdev->md_interleave, 6795 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 6796 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 6797 if (rc != 0) { 6798 SPDK_ERRLOG("Initialization of DIF context failed\n"); 6799 return; 6800 } 6801 6802 if (bdev->md_interleave) { 6803 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6804 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6805 } else { 6806 struct iovec md_iov = { 6807 .iov_base = bdev_io->u.bdev.md_buf, 6808 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 6809 }; 6810 6811 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 6812 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 6813 } 6814 6815 if (rc != 0) { 6816 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 6817 err_blk.err_type, err_blk.err_offset); 6818 } else { 6819 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 6820 } 6821 } 6822 6823 static void 6824 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6825 { 6826 struct nvme_bdev_io *bio = ref; 6827 6828 if (spdk_nvme_cpl_is_success(cpl)) { 6829 /* Run PI verification for read data buffer. */ 6830 bdev_nvme_verify_pi_error(bio); 6831 } 6832 6833 /* Return original completion status */ 6834 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6835 } 6836 6837 static void 6838 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6839 { 6840 struct nvme_bdev_io *bio = ref; 6841 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6842 int ret; 6843 6844 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 6845 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 6846 cpl->status.sct, cpl->status.sc); 6847 6848 /* Save completion status to use after verifying PI error. */ 6849 bio->cpl = *cpl; 6850 6851 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 6852 /* Read without PI checking to verify PI error. 
*/ 6853 ret = bdev_nvme_no_pi_readv(bio, 6854 bdev_io->u.bdev.iovs, 6855 bdev_io->u.bdev.iovcnt, 6856 bdev_io->u.bdev.md_buf, 6857 bdev_io->u.bdev.num_blocks, 6858 bdev_io->u.bdev.offset_blocks); 6859 if (ret == 0) { 6860 return; 6861 } 6862 } 6863 } 6864 6865 bdev_nvme_io_complete_nvme_status(bio, cpl); 6866 } 6867 6868 static void 6869 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6870 { 6871 struct nvme_bdev_io *bio = ref; 6872 6873 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6874 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 6875 cpl->status.sct, cpl->status.sc); 6876 /* Run PI verification for write data buffer if PI error is detected. */ 6877 bdev_nvme_verify_pi_error(bio); 6878 } 6879 6880 bdev_nvme_io_complete_nvme_status(bio, cpl); 6881 } 6882 6883 static void 6884 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 6885 { 6886 struct nvme_bdev_io *bio = ref; 6887 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 6888 6889 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 6890 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 6891 */ 6892 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 6893 6894 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6895 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 6896 cpl->status.sct, cpl->status.sc); 6897 /* Run PI verification for zone append data buffer if PI error is detected. */ 6898 bdev_nvme_verify_pi_error(bio); 6899 } 6900 6901 bdev_nvme_io_complete_nvme_status(bio, cpl); 6902 } 6903 6904 static void 6905 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6906 { 6907 struct nvme_bdev_io *bio = ref; 6908 6909 if (spdk_nvme_cpl_is_pi_error(cpl)) { 6910 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 6911 cpl->status.sct, cpl->status.sc); 6912 /* Run PI verification for compare data buffer if PI error is detected. */ 6913 bdev_nvme_verify_pi_error(bio); 6914 } 6915 6916 bdev_nvme_io_complete_nvme_status(bio, cpl); 6917 } 6918 6919 static void 6920 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 6921 { 6922 struct nvme_bdev_io *bio = ref; 6923 6924 /* Compare operation completion */ 6925 if (!bio->first_fused_completed) { 6926 /* Save compare result for write callback */ 6927 bio->cpl = *cpl; 6928 bio->first_fused_completed = true; 6929 return; 6930 } 6931 6932 /* Write operation completion */ 6933 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 6934 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 6935 * complete the IO with the compare operation's status. 
6936 */ 6937 if (!spdk_nvme_cpl_is_error(cpl)) { 6938 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 6939 } 6940 6941 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 6942 } else { 6943 bdev_nvme_io_complete_nvme_status(bio, cpl); 6944 } 6945 } 6946 6947 static void 6948 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 6949 { 6950 struct nvme_bdev_io *bio = ref; 6951 6952 bdev_nvme_io_complete_nvme_status(bio, cpl); 6953 } 6954 6955 static int 6956 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 6957 { 6958 switch (desc->zt) { 6959 case SPDK_NVME_ZONE_TYPE_SEQWR: 6960 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 6961 break; 6962 default: 6963 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 6964 return -EIO; 6965 } 6966 6967 switch (desc->zs) { 6968 case SPDK_NVME_ZONE_STATE_EMPTY: 6969 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 6970 break; 6971 case SPDK_NVME_ZONE_STATE_IOPEN: 6972 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 6973 break; 6974 case SPDK_NVME_ZONE_STATE_EOPEN: 6975 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 6976 break; 6977 case SPDK_NVME_ZONE_STATE_CLOSED: 6978 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 6979 break; 6980 case SPDK_NVME_ZONE_STATE_RONLY: 6981 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 6982 break; 6983 case SPDK_NVME_ZONE_STATE_FULL: 6984 info->state = SPDK_BDEV_ZONE_STATE_FULL; 6985 break; 6986 case SPDK_NVME_ZONE_STATE_OFFLINE: 6987 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 6988 break; 6989 default: 6990 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 6991 return -EIO; 6992 } 6993 6994 info->zone_id = desc->zslba; 6995 info->write_pointer = desc->wp; 6996 info->capacity = desc->zcap; 6997 6998 return 0; 6999 } 7000 7001 static void 7002 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7003 { 7004 struct nvme_bdev_io *bio = ref; 7005 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7006 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7007 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7008 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7009 uint64_t max_zones_per_buf, i; 7010 uint32_t zone_report_bufsize; 7011 struct spdk_nvme_ns *ns; 7012 struct spdk_nvme_qpair *qpair; 7013 int ret; 7014 7015 if (spdk_nvme_cpl_is_error(cpl)) { 7016 goto out_complete_io_nvme_cpl; 7017 } 7018 7019 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7020 ret = -ENXIO; 7021 goto out_complete_io_ret; 7022 } 7023 7024 ns = bio->io_path->nvme_ns->ns; 7025 qpair = bio->io_path->qpair->qpair; 7026 7027 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7028 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7029 sizeof(bio->zone_report_buf->descs[0]); 7030 7031 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7032 ret = -EINVAL; 7033 goto out_complete_io_ret; 7034 } 7035 7036 if (!bio->zone_report_buf->nr_zones) { 7037 ret = -EINVAL; 7038 goto out_complete_io_ret; 7039 } 7040 7041 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7042 ret = fill_zone_from_report(&info[bio->handled_zones], 7043 &bio->zone_report_buf->descs[i]); 7044 if (ret) { 7045 goto out_complete_io_ret; 7046 } 7047 bio->handled_zones++; 7048 } 7049 7050 if (bio->handled_zones < zones_to_copy) { 7051 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7052 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7053 
7054 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7055 ret = spdk_nvme_zns_report_zones(ns, qpair, 7056 bio->zone_report_buf, zone_report_bufsize, 7057 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7058 bdev_nvme_get_zone_info_done, bio); 7059 if (!ret) { 7060 return; 7061 } else { 7062 goto out_complete_io_ret; 7063 } 7064 } 7065 7066 out_complete_io_nvme_cpl: 7067 free(bio->zone_report_buf); 7068 bio->zone_report_buf = NULL; 7069 bdev_nvme_io_complete_nvme_status(bio, cpl); 7070 return; 7071 7072 out_complete_io_ret: 7073 free(bio->zone_report_buf); 7074 bio->zone_report_buf = NULL; 7075 bdev_nvme_io_complete(bio, ret); 7076 } 7077 7078 static void 7079 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7080 { 7081 struct nvme_bdev_io *bio = ref; 7082 7083 bdev_nvme_io_complete_nvme_status(bio, cpl); 7084 } 7085 7086 static void 7087 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7088 { 7089 struct nvme_bdev_io *bio = ctx; 7090 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7091 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7092 7093 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7094 7095 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7096 } 7097 7098 static void 7099 bdev_nvme_abort_complete(void *ctx) 7100 { 7101 struct nvme_bdev_io *bio = ctx; 7102 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7103 7104 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7105 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7106 } else { 7107 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7108 } 7109 } 7110 7111 static void 7112 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7113 { 7114 struct nvme_bdev_io *bio = ref; 7115 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7116 7117 bio->cpl = *cpl; 7118 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7119 } 7120 7121 static void 7122 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7123 { 7124 struct nvme_bdev_io *bio = ref; 7125 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7126 7127 bio->cpl = *cpl; 7128 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7129 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7130 } 7131 7132 static void 7133 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7134 { 7135 struct nvme_bdev_io *bio = ref; 7136 struct iovec *iov; 7137 7138 bio->iov_offset = sgl_offset; 7139 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7140 iov = &bio->iovs[bio->iovpos]; 7141 if (bio->iov_offset < iov->iov_len) { 7142 break; 7143 } 7144 7145 bio->iov_offset -= iov->iov_len; 7146 } 7147 } 7148 7149 static int 7150 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7151 { 7152 struct nvme_bdev_io *bio = ref; 7153 struct iovec *iov; 7154 7155 assert(bio->iovpos < bio->iovcnt); 7156 7157 iov = &bio->iovs[bio->iovpos]; 7158 7159 *address = iov->iov_base; 7160 *length = iov->iov_len; 7161 7162 if (bio->iov_offset) { 7163 assert(bio->iov_offset <= iov->iov_len); 7164 *address += bio->iov_offset; 7165 *length -= bio->iov_offset; 7166 } 7167 7168 bio->iov_offset += *length; 7169 if (bio->iov_offset == iov->iov_len) { 7170 bio->iovpos++; 7171 bio->iov_offset = 0; 7172 } 7173 7174 return 0; 7175 } 7176 7177 static void 7178 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7179 { 7180 struct nvme_bdev_io *bio = ref; 7181 struct iovec *iov; 7182 7183 bio->fused_iov_offset = sgl_offset; 
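/* Walk the fused-command iovec array until the requested SGL offset falls inside the current iovec, mirroring bdev_nvme_queued_reset_sgl() for the non-fused buffer. */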
7184 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7185 iov = &bio->fused_iovs[bio->fused_iovpos]; 7186 if (bio->fused_iov_offset < iov->iov_len) { 7187 break; 7188 } 7189 7190 bio->fused_iov_offset -= iov->iov_len; 7191 } 7192 } 7193 7194 static int 7195 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7196 { 7197 struct nvme_bdev_io *bio = ref; 7198 struct iovec *iov; 7199 7200 assert(bio->fused_iovpos < bio->fused_iovcnt); 7201 7202 iov = &bio->fused_iovs[bio->fused_iovpos]; 7203 7204 *address = iov->iov_base; 7205 *length = iov->iov_len; 7206 7207 if (bio->fused_iov_offset) { 7208 assert(bio->fused_iov_offset <= iov->iov_len); 7209 *address += bio->fused_iov_offset; 7210 *length -= bio->fused_iov_offset; 7211 } 7212 7213 bio->fused_iov_offset += *length; 7214 if (bio->fused_iov_offset == iov->iov_len) { 7215 bio->fused_iovpos++; 7216 bio->fused_iov_offset = 0; 7217 } 7218 7219 return 0; 7220 } 7221 7222 static int 7223 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7224 void *md, uint64_t lba_count, uint64_t lba) 7225 { 7226 int rc; 7227 7228 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7229 lba_count, lba); 7230 7231 bio->iovs = iov; 7232 bio->iovcnt = iovcnt; 7233 bio->iovpos = 0; 7234 bio->iov_offset = 0; 7235 7236 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7237 bio->io_path->qpair->qpair, 7238 lba, lba_count, 7239 bdev_nvme_no_pi_readv_done, bio, 0, 7240 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7241 md, 0, 0); 7242 7243 if (rc != 0 && rc != -ENOMEM) { 7244 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7245 } 7246 return rc; 7247 } 7248 7249 static int 7250 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7251 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7252 struct spdk_memory_domain *domain, void *domain_ctx) 7253 { 7254 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7255 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7256 int rc; 7257 7258 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7259 lba_count, lba); 7260 7261 bio->iovs = iov; 7262 bio->iovcnt = iovcnt; 7263 bio->iovpos = 0; 7264 bio->iov_offset = 0; 7265 7266 if (domain != NULL) { 7267 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, cdw13); 7268 bio->ext_opts.memory_domain = domain; 7269 bio->ext_opts.memory_domain_ctx = domain_ctx; 7270 bio->ext_opts.io_flags = flags; 7271 bio->ext_opts.metadata = md; 7272 7273 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7274 bdev_nvme_readv_done, bio, 7275 bdev_nvme_queued_reset_sgl, 7276 bdev_nvme_queued_next_sge, 7277 &bio->ext_opts); 7278 } else if (iovcnt == 1) { 7279 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7280 md, lba, lba_count, bdev_nvme_readv_done, 7281 bio, flags, 0, 0); 7282 } else { 7283 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7284 bdev_nvme_readv_done, bio, flags, 7285 bdev_nvme_queued_reset_sgl, 7286 bdev_nvme_queued_next_sge, md, 0, 0); 7287 } 7288 7289 if (rc != 0 && rc != -ENOMEM) { 7290 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7291 } 7292 return rc; 7293 } 7294 7295 static int 7296 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7297 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7298 struct spdk_memory_domain *domain, void *domain_ctx) 7299 { 7300 struct spdk_nvme_ns *ns = 
bio->io_path->nvme_ns->ns; 7301 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7302 int rc; 7303 7304 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7305 lba_count, lba); 7306 7307 bio->iovs = iov; 7308 bio->iovcnt = iovcnt; 7309 bio->iovpos = 0; 7310 bio->iov_offset = 0; 7311 7312 if (domain != NULL) { 7313 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, cdw13); 7314 bio->ext_opts.memory_domain = domain; 7315 bio->ext_opts.memory_domain_ctx = domain_ctx; 7316 bio->ext_opts.io_flags = flags; 7317 bio->ext_opts.metadata = md; 7318 7319 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7320 bdev_nvme_writev_done, bio, 7321 bdev_nvme_queued_reset_sgl, 7322 bdev_nvme_queued_next_sge, 7323 &bio->ext_opts); 7324 } else if (iovcnt == 1) { 7325 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7326 md, lba, lba_count, bdev_nvme_writev_done, 7327 bio, flags, 0, 0); 7328 } else { 7329 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7330 bdev_nvme_writev_done, bio, flags, 7331 bdev_nvme_queued_reset_sgl, 7332 bdev_nvme_queued_next_sge, md, 0, 0); 7333 } 7334 7335 if (rc != 0 && rc != -ENOMEM) { 7336 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7337 } 7338 return rc; 7339 } 7340 7341 static int 7342 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7343 void *md, uint64_t lba_count, uint64_t zslba, 7344 uint32_t flags) 7345 { 7346 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7347 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7348 int rc; 7349 7350 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7351 lba_count, zslba); 7352 7353 bio->iovs = iov; 7354 bio->iovcnt = iovcnt; 7355 bio->iovpos = 0; 7356 bio->iov_offset = 0; 7357 7358 if (iovcnt == 1) { 7359 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7360 lba_count, 7361 bdev_nvme_zone_appendv_done, bio, 7362 flags, 7363 0, 0); 7364 } else { 7365 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7366 bdev_nvme_zone_appendv_done, bio, flags, 7367 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7368 md, 0, 0); 7369 } 7370 7371 if (rc != 0 && rc != -ENOMEM) { 7372 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7373 } 7374 return rc; 7375 } 7376 7377 static int 7378 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7379 void *md, uint64_t lba_count, uint64_t lba, 7380 uint32_t flags) 7381 { 7382 int rc; 7383 7384 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7385 lba_count, lba); 7386 7387 bio->iovs = iov; 7388 bio->iovcnt = iovcnt; 7389 bio->iovpos = 0; 7390 bio->iov_offset = 0; 7391 7392 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7393 bio->io_path->qpair->qpair, 7394 lba, lba_count, 7395 bdev_nvme_comparev_done, bio, flags, 7396 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7397 md, 0, 0); 7398 7399 if (rc != 0 && rc != -ENOMEM) { 7400 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7401 } 7402 return rc; 7403 } 7404 7405 static int 7406 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7407 struct iovec *write_iov, int write_iovcnt, 7408 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7409 { 7410 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7411 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7412 struct spdk_bdev_io *bdev_io = 
spdk_bdev_io_from_ctx(bio); 7413 int rc; 7414 7415 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7416 lba_count, lba); 7417 7418 bio->iovs = cmp_iov; 7419 bio->iovcnt = cmp_iovcnt; 7420 bio->iovpos = 0; 7421 bio->iov_offset = 0; 7422 bio->fused_iovs = write_iov; 7423 bio->fused_iovcnt = write_iovcnt; 7424 bio->fused_iovpos = 0; 7425 bio->fused_iov_offset = 0; 7426 7427 if (bdev_io->num_retries == 0) { 7428 bio->first_fused_submitted = false; 7429 bio->first_fused_completed = false; 7430 } 7431 7432 if (!bio->first_fused_submitted) { 7433 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7434 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7435 7436 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7437 bdev_nvme_comparev_and_writev_done, bio, flags, 7438 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7439 if (rc == 0) { 7440 bio->first_fused_submitted = true; 7441 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7442 } else { 7443 if (rc != -ENOMEM) { 7444 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7445 } 7446 return rc; 7447 } 7448 } 7449 7450 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7451 7452 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7453 bdev_nvme_comparev_and_writev_done, bio, flags, 7454 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7455 if (rc != 0 && rc != -ENOMEM) { 7456 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7457 rc = 0; 7458 } 7459 7460 return rc; 7461 } 7462 7463 static int 7464 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7465 { 7466 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7467 struct spdk_nvme_dsm_range *range; 7468 uint64_t offset, remaining; 7469 uint64_t num_ranges_u64; 7470 uint16_t num_ranges; 7471 int rc; 7472 7473 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7474 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7475 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7476 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7477 return -EINVAL; 7478 } 7479 num_ranges = (uint16_t)num_ranges_u64; 7480 7481 offset = offset_blocks; 7482 remaining = num_blocks; 7483 range = &dsm_ranges[0]; 7484 7485 /* Fill max-size ranges until the remaining blocks fit into one range */ 7486 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7487 range->attributes.raw = 0; 7488 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7489 range->starting_lba = offset; 7490 7491 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7492 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7493 range++; 7494 } 7495 7496 /* Final range describes the remaining blocks */ 7497 range->attributes.raw = 0; 7498 range->length = remaining; 7499 range->starting_lba = offset; 7500 7501 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7502 bio->io_path->qpair->qpair, 7503 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7504 dsm_ranges, num_ranges, 7505 bdev_nvme_queued_done, bio); 7506 7507 return rc; 7508 } 7509 7510 static int 7511 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7512 { 7513 if (num_blocks > UINT16_MAX + 1) { 7514 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7515 return -EINVAL; 7516 } 7517 7518 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7519 bio->io_path->qpair->qpair, 7520 offset_blocks, num_blocks, 7521 
bdev_nvme_queued_done, bio, 7522 0); 7523 } 7524 7525 static int 7526 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7527 struct spdk_bdev_zone_info *info) 7528 { 7529 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7530 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7531 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7532 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7533 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7534 7535 if (zone_id % zone_size != 0) { 7536 return -EINVAL; 7537 } 7538 7539 if (num_zones > total_zones || !num_zones) { 7540 return -EINVAL; 7541 } 7542 7543 assert(!bio->zone_report_buf); 7544 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7545 if (!bio->zone_report_buf) { 7546 return -ENOMEM; 7547 } 7548 7549 bio->handled_zones = 0; 7550 7551 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7552 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7553 bdev_nvme_get_zone_info_done, bio); 7554 } 7555 7556 static int 7557 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 7558 enum spdk_bdev_zone_action action) 7559 { 7560 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7561 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7562 7563 switch (action) { 7564 case SPDK_BDEV_ZONE_CLOSE: 7565 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 7566 bdev_nvme_zone_management_done, bio); 7567 case SPDK_BDEV_ZONE_FINISH: 7568 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 7569 bdev_nvme_zone_management_done, bio); 7570 case SPDK_BDEV_ZONE_OPEN: 7571 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 7572 bdev_nvme_zone_management_done, bio); 7573 case SPDK_BDEV_ZONE_RESET: 7574 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 7575 bdev_nvme_zone_management_done, bio); 7576 case SPDK_BDEV_ZONE_OFFLINE: 7577 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 7578 bdev_nvme_zone_management_done, bio); 7579 default: 7580 return -EINVAL; 7581 } 7582 } 7583 7584 static void 7585 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7586 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 7587 { 7588 struct nvme_io_path *io_path; 7589 struct nvme_ctrlr *nvme_ctrlr; 7590 uint32_t max_xfer_size; 7591 int rc = -ENXIO; 7592 7593 /* Choose the first ctrlr which is not failed. */ 7594 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7595 nvme_ctrlr = io_path->qpair->ctrlr; 7596 7597 /* We should skip any unavailable nvme_ctrlr rather than checking 7598 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
7599 */ 7600 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 7601 continue; 7602 } 7603 7604 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 7605 7606 if (nbytes > max_xfer_size) { 7607 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7608 rc = -EINVAL; 7609 goto err; 7610 } 7611 7612 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 7613 bdev_nvme_admin_passthru_done, bio); 7614 if (rc == 0) { 7615 return; 7616 } 7617 } 7618 7619 err: 7620 bdev_nvme_admin_complete(bio, rc); 7621 } 7622 7623 static int 7624 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 7625 void *buf, size_t nbytes) 7626 { 7627 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7628 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7629 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7630 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7631 7632 if (nbytes > max_xfer_size) { 7633 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7634 return -EINVAL; 7635 } 7636 7637 /* 7638 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7639 * so fill it out automatically. 7640 */ 7641 cmd->nsid = spdk_nvme_ns_get_id(ns); 7642 7643 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 7644 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 7645 } 7646 7647 static int 7648 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 7649 void *buf, size_t nbytes, void *md_buf, size_t md_len) 7650 { 7651 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7652 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7653 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 7654 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7655 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7656 7657 if (nbytes > max_xfer_size) { 7658 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7659 return -EINVAL; 7660 } 7661 7662 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 7663 SPDK_ERRLOG("invalid meta data buffer size\n"); 7664 return -EINVAL; 7665 } 7666 7667 /* 7668 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7669 * so fill it out automatically. 7670 */ 7671 cmd->nsid = spdk_nvme_ns_get_id(ns); 7672 7673 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 7674 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 7675 } 7676 7677 static void 7678 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7679 struct nvme_bdev_io *bio_to_abort) 7680 { 7681 struct nvme_io_path *io_path; 7682 int rc = 0; 7683 7684 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 7685 if (rc == 0) { 7686 bdev_nvme_admin_complete(bio, 0); 7687 return; 7688 } 7689 7690 io_path = bio_to_abort->io_path; 7691 if (io_path != NULL) { 7692 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 7693 io_path->qpair->qpair, 7694 bio_to_abort, 7695 bdev_nvme_abort_done, bio); 7696 } else { 7697 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7698 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 7699 NULL, 7700 bio_to_abort, 7701 bdev_nvme_abort_done, bio); 7702 7703 if (rc != -ENOENT) { 7704 break; 7705 } 7706 } 7707 } 7708 7709 if (rc != 0) { 7710 /* If no command was found or there was any error, complete the abort 7711 * request with failure. 
static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump this as the last entry so that all NVMe bdevs get a chance to be
	 * constructed before the hotplug poller is enabled.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

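/*
 * For reference, an object emitted by nvme_ctrlr_config_json() above looks roughly
 * like the following (illustrative values; the transport ID keys are produced by
 * nvme_bdev_dump_trid_json() and are assumed here):
 *
 *	{
 *	  "method": "bdev_nvme_attach_controller",
 *	  "params": {
 *	    "name": "Nvme0",
 *	    "trtype": "TCP",
 *	    "adrfam": "IPv4",
 *	    "traddr": "192.168.0.10",
 *	    "trsvcid": "4420",
 *	    "subnqn": "nqn.2016-06.io.spdk:cnode1",
 *	    "prchk_reftag": false,
 *	    "prchk_guard": false,
 *	    "ctrlr_loss_timeout_sec": 0,
 *	    "reconnect_delay_sec": 0,
 *	    "fast_io_fail_timeout_sec": 0,
 *	    "hdgst": false,
 *	    "ddgst": false
 *	  }
 *	}
 *
 * bdev_nvme_config_json() emits one such object per attached controller, preceded
 * by a "bdev_nvme_set_options" object and followed by the "bdev_nvme_set_hotplug"
 * object.
 */
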
struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

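	/*
	 * Register the 'N' trace object and the two tracepoints described above, then
	 * relate the NVMe transport submit/complete tracepoints to the same object so
	 * that a single I/O can be followed from the bdev layer down to the transport.
	 */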
	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}
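
/*
 * Usage note (an assumption about the surrounding tooling, not verified here): the
 * "bdev_nvme" tracepoint group registered above can typically be enabled at runtime
 * through the trace RPC, for example:
 *
 *	scripts/rpc.py trace_enable_tpoint_group bdev_nvme
 *
 * provided the application was started with a trace buffer allocated.
 */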