/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;

	/* Used to put nvme_bdev_io into the list */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

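/* Look up the nvme_ctrlr in this nvme_bdev_ctrlr whose controller ID (CNTLID),
 * as reported in its identify controller data, matches the given cntlid.
 * Returns NULL if no controller matches.
 */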
struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

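/* Unlink nvme_ctrlr from its parent nvme_bdev_ctrlr. If this was the last
 * controller under that nvme_bdev_ctrlr, remove the parent from the global
 * list and free it as well.
 */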
static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);
	spdk_keyring_put_key(nvme_ctrlr->psk);
	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

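/* Queue-depth based path selector: walk all io_paths and choose the ANA optimized
 * path with the fewest outstanding requests, falling back to the least busy
 * non-optimized path. The result is intentionally not cached in the channel.
 */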
static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct nvme_bdev_io *bio, *tmp_bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);

		bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bio != NULL) {
		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

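/* Queue an I/O to be retried after delay_ms. The retry_io_list is kept sorted by
 * ascending retry_ticks; if the new I/O becomes the earliest entry, re-arm the
 * retry poller so it fires at the new expiration time.
 */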
static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
					   retry_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_bdev_io *bio, *tmp_bio;

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
		__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
			__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
			nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

			bdev_nvme_clear_current_io_path(nbdev_ch);
			bio->io_path = NULL;

			if (any_io_path_may_become_available(nbdev_ch)) {
				bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
				return;
			}
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

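/* spdk_for_each_channel() callback: clear the cached I/O path of every
 * nvme_bdev_channel that uses this ctrlr channel's qpair.
 */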
static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair has finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
delete nvme_qpair.\n", qpair); 1578 nvme_qpair_delete(nvme_qpair); 1579 } 1580 } 1581 1582 static void 1583 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1584 { 1585 struct nvme_qpair *nvme_qpair; 1586 1587 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1588 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1589 continue; 1590 } 1591 1592 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1593 SPDK_NVME_QPAIR_FAILURE_NONE) { 1594 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1595 } 1596 } 1597 } 1598 1599 static int 1600 bdev_nvme_poll(void *arg) 1601 { 1602 struct nvme_poll_group *group = arg; 1603 int64_t num_completions; 1604 1605 if (group->collect_spin_stat && group->start_ticks == 0) { 1606 group->start_ticks = spdk_get_ticks(); 1607 } 1608 1609 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1610 bdev_nvme_disconnected_qpair_cb); 1611 if (group->collect_spin_stat) { 1612 if (num_completions > 0) { 1613 if (group->end_ticks != 0) { 1614 group->spin_ticks += (group->end_ticks - group->start_ticks); 1615 group->end_ticks = 0; 1616 } 1617 group->start_ticks = 0; 1618 } else { 1619 group->end_ticks = spdk_get_ticks(); 1620 } 1621 } 1622 1623 if (spdk_unlikely(num_completions < 0)) { 1624 bdev_nvme_check_io_qpairs(group); 1625 } 1626 1627 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1628 } 1629 1630 static int bdev_nvme_poll_adminq(void *arg); 1631 1632 static void 1633 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1634 { 1635 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1636 1637 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1638 nvme_ctrlr, new_period_us); 1639 } 1640 1641 static int 1642 bdev_nvme_poll_adminq(void *arg) 1643 { 1644 int32_t rc; 1645 struct nvme_ctrlr *nvme_ctrlr = arg; 1646 nvme_ctrlr_disconnected_cb disconnected_cb; 1647 1648 assert(nvme_ctrlr != NULL); 1649 1650 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1651 if (rc < 0) { 1652 disconnected_cb = nvme_ctrlr->disconnected_cb; 1653 nvme_ctrlr->disconnected_cb = NULL; 1654 1655 if (disconnected_cb != NULL) { 1656 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1657 g_opts.nvme_adminq_poll_period_us); 1658 disconnected_cb(nvme_ctrlr); 1659 } else { 1660 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1661 } 1662 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1663 SPDK_NVME_QPAIR_FAILURE_NONE) { 1664 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1665 } 1666 1667 return rc == 0 ? 

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct nvme_bdev_io *bio;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bio = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link);
		__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), status, NULL);
	}

	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid to the active trid within a controller if one exists.
 *
 * The purpose of the boolean return value is to request the caller to disconnect
 * the current trid now to try connecting the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. It means the trid is failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
		 * or used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds. */
	return false;
}

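/* Return true if the time elapsed since reset_start_tsc has reached
 * ctrlr_loss_timeout_sec. A value of 0 or -1 means the timeout never expires,
 * so false is returned.
 */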
static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
	OP_FAILOVER,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;

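/* Decide what to do after a reset attempt completes: finish a pending destruct,
 * perform a queued failover, do nothing if the reset succeeded or no reconnect
 * delay is configured, destruct the controller once ctrlr_loss_timeout expires,
 * or schedule a delayed reconnect (setting fast_io_fail_timedout if that
 * timeout has already expired).
 */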
*/ 1935 return OP_COMPLETE_PENDING_DESTRUCT; 1936 } else if (nvme_ctrlr->pending_failover) { 1937 nvme_ctrlr->pending_failover = false; 1938 nvme_ctrlr->reset_start_tsc = 0; 1939 return OP_FAILOVER; 1940 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1941 nvme_ctrlr->reset_start_tsc = 0; 1942 return OP_NONE; 1943 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1944 return OP_DESTRUCT; 1945 } else { 1946 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1947 nvme_ctrlr->fast_io_fail_timedout = true; 1948 } 1949 return OP_DELAYED_RECONNECT; 1950 } 1951 } 1952 1953 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1954 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1955 1956 static int 1957 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1958 { 1959 struct nvme_ctrlr *nvme_ctrlr = ctx; 1960 1961 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1962 pthread_mutex_lock(&nvme_ctrlr->mutex); 1963 1964 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1965 1966 if (!nvme_ctrlr->reconnect_is_delayed) { 1967 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1968 return SPDK_POLLER_BUSY; 1969 } 1970 1971 nvme_ctrlr->reconnect_is_delayed = false; 1972 1973 if (nvme_ctrlr->destruct) { 1974 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1975 return SPDK_POLLER_BUSY; 1976 } 1977 1978 assert(nvme_ctrlr->resetting == false); 1979 nvme_ctrlr->resetting = true; 1980 1981 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1982 1983 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1984 1985 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1986 return SPDK_POLLER_BUSY; 1987 } 1988 1989 static void 1990 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1991 { 1992 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1993 1994 assert(nvme_ctrlr->reconnect_is_delayed == false); 1995 nvme_ctrlr->reconnect_is_delayed = true; 1996 1997 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 1998 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 1999 nvme_ctrlr, 2000 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2001 } 2002 2003 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2004 2005 static void 2006 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2007 { 2008 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2009 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2010 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2011 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2012 enum bdev_nvme_op_after_reset op_after_reset; 2013 2014 assert(nvme_ctrlr->thread == spdk_get_thread()); 2015 2016 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2017 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2018 2019 if (!success) { 2020 SPDK_ERRLOG("Resetting controller failed.\n"); 2021 } else { 2022 SPDK_NOTICELOG("Resetting controller successful.\n"); 2023 } 2024 2025 pthread_mutex_lock(&nvme_ctrlr->mutex); 2026 nvme_ctrlr->resetting = false; 2027 nvme_ctrlr->dont_retry = false; 2028 nvme_ctrlr->in_failover = false; 2029 2030 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2031 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2032 2033 /* Delay callbacks when the next operation is a failover. */ 2034 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2035 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2036 } 2037 2038 switch (op_after_reset) { 2039 case OP_COMPLETE_PENDING_DESTRUCT: 2040 nvme_ctrlr_unregister(nvme_ctrlr); 2041 break; 2042 case OP_DESTRUCT: 2043 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2044 remove_discovery_entry(nvme_ctrlr); 2045 break; 2046 case OP_DELAYED_RECONNECT: 2047 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2048 break; 2049 case OP_FAILOVER: 2050 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2051 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2052 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2053 break; 2054 default: 2055 break; 2056 } 2057 } 2058 2059 static void 2060 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2061 { 2062 pthread_mutex_lock(&nvme_ctrlr->mutex); 2063 if (!success) { 2064 /* Connecting the active trid failed. Set the next alternate trid to the 2065 * active trid if it exists. 2066 */ 2067 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2068 /* The next alternate trid exists and is ready to try. Try it now. */ 2069 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2070 2071 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2072 return; 2073 } 2074 2075 /* We came here if there is no alternate trid or if the next trid exists but 2076 * is not ready to try. We will try the active trid after reconnect_delay_sec 2077 * seconds if it is non-zero or at the next reset call otherwise. 2078 */ 2079 } else { 2080 /* Connecting the active trid succeeded. Clear the last failed time because it 2081 * means the trid is failed if its last failed time is non-zero. 2082 */ 2083 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2084 } 2085 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2086 2087 /* Make sure we clear any pending resets before returning. */ 2088 spdk_for_each_channel(nvme_ctrlr, 2089 bdev_nvme_complete_pending_resets, 2090 success ? NULL : (void *)0x1, 2091 _bdev_nvme_reset_ctrlr_complete); 2092 } 2093 2094 static void 2095 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2096 { 2097 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2098 2099 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2100 } 2101 2102 static void 2103 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2104 { 2105 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2106 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2107 struct nvme_qpair *nvme_qpair; 2108 2109 nvme_qpair = ctrlr_ch->qpair; 2110 assert(nvme_qpair != NULL); 2111 2112 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2113 2114 if (nvme_qpair->qpair != NULL) { 2115 if (nvme_qpair->ctrlr->dont_retry) { 2116 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2117 } 2118 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2119 2120 /* The current full reset sequence will move to the next 2121 * ctrlr_channel after the qpair is actually disconnected. 2122 */ 2123 assert(ctrlr_ch->reset_iter == NULL); 2124 ctrlr_ch->reset_iter = i; 2125 } else { 2126 spdk_for_each_channel_continue(i, 0); 2127 } 2128 } 2129 2130 static void 2131 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2132 { 2133 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2134 2135 if (status == 0) { 2136 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2137 } else { 2138 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
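		 * At least one channel failed to create its I/O qpair, so tear down the
		 * qpairs that were created and then fail the reset sequence.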
*/ 2139 spdk_for_each_channel(nvme_ctrlr, 2140 bdev_nvme_reset_destroy_qpair, 2141 NULL, 2142 bdev_nvme_reset_create_qpairs_failed); 2143 } 2144 } 2145 2146 static int 2147 bdev_nvme_reset_check_qpair_connected(void *ctx) 2148 { 2149 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2150 2151 if (ctrlr_ch->reset_iter == NULL) { 2152 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2153 assert(ctrlr_ch->connect_poller == NULL); 2154 assert(ctrlr_ch->qpair->qpair == NULL); 2155 return SPDK_POLLER_BUSY; 2156 } 2157 2158 assert(ctrlr_ch->qpair->qpair != NULL); 2159 2160 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2161 return SPDK_POLLER_BUSY; 2162 } 2163 2164 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2165 2166 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2167 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2168 ctrlr_ch->reset_iter = NULL; 2169 2170 if (!g_opts.disable_auto_failback) { 2171 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2172 } 2173 2174 return SPDK_POLLER_BUSY; 2175 } 2176 2177 static void 2178 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2179 { 2180 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2181 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2182 int rc; 2183 2184 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2185 if (rc == 0) { 2186 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2187 ctrlr_ch, 0); 2188 2189 /* The current full reset sequence will move to the next 2190 * ctrlr_channel after the qpair is actually connected. 2191 */ 2192 assert(ctrlr_ch->reset_iter == NULL); 2193 ctrlr_ch->reset_iter = i; 2194 } else { 2195 spdk_for_each_channel_continue(i, rc); 2196 } 2197 } 2198 2199 static int 2200 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2201 { 2202 struct nvme_ctrlr *nvme_ctrlr = arg; 2203 int rc = -ETIMEDOUT; 2204 2205 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2206 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2207 if (rc == -EAGAIN) { 2208 return SPDK_POLLER_BUSY; 2209 } 2210 } 2211 2212 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2213 if (rc == 0) { 2214 /* Recreate all of the I/O queue pairs */ 2215 spdk_for_each_channel(nvme_ctrlr, 2216 bdev_nvme_reset_create_qpair, 2217 NULL, 2218 bdev_nvme_reset_create_qpairs_done); 2219 } else { 2220 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2221 } 2222 return SPDK_POLLER_BUSY; 2223 } 2224 2225 static void 2226 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2227 { 2228 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2229 2230 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2231 assert(nvme_ctrlr->reset_detach_poller == NULL); 2232 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2233 nvme_ctrlr, 0); 2234 } 2235 2236 static void 2237 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2238 { 2239 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2240 2241 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2242 assert(status == 0); 2243 2244 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2245 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2246 } else { 2247 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2248 } 2249 } 2250 2251 static void 2252 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2253 { 2254 
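	/* Disconnect the I/O qpair of every ctrlr_channel. The completion callback
	 * continues the reset sequence once all qpairs have been torn down.
	 */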
spdk_for_each_channel(nvme_ctrlr, 2255 bdev_nvme_reset_destroy_qpair, 2256 NULL, 2257 bdev_nvme_reset_destroy_qpair_done); 2258 } 2259 2260 static void 2261 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2262 { 2263 struct nvme_ctrlr *nvme_ctrlr = ctx; 2264 2265 assert(nvme_ctrlr->resetting == true); 2266 assert(nvme_ctrlr->thread == spdk_get_thread()); 2267 2268 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2269 2270 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2271 2272 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2273 } 2274 2275 static void 2276 _bdev_nvme_reset_ctrlr(void *ctx) 2277 { 2278 struct nvme_ctrlr *nvme_ctrlr = ctx; 2279 2280 assert(nvme_ctrlr->resetting == true); 2281 assert(nvme_ctrlr->thread == spdk_get_thread()); 2282 2283 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2284 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2285 } else { 2286 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2287 } 2288 } 2289 2290 static int 2291 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2292 { 2293 spdk_msg_fn msg_fn; 2294 2295 pthread_mutex_lock(&nvme_ctrlr->mutex); 2296 if (nvme_ctrlr->destruct) { 2297 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2298 return -ENXIO; 2299 } 2300 2301 if (nvme_ctrlr->resetting) { 2302 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2303 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2304 return -EBUSY; 2305 } 2306 2307 if (nvme_ctrlr->disabled) { 2308 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2309 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2310 return -EALREADY; 2311 } 2312 2313 nvme_ctrlr->resetting = true; 2314 nvme_ctrlr->dont_retry = true; 2315 2316 if (nvme_ctrlr->reconnect_is_delayed) { 2317 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2318 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2319 nvme_ctrlr->reconnect_is_delayed = false; 2320 } else { 2321 msg_fn = _bdev_nvme_reset_ctrlr; 2322 assert(nvme_ctrlr->reset_start_tsc == 0); 2323 } 2324 2325 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2326 2327 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2328 2329 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2330 return 0; 2331 } 2332 2333 static int 2334 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2335 { 2336 pthread_mutex_lock(&nvme_ctrlr->mutex); 2337 if (nvme_ctrlr->destruct) { 2338 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2339 return -ENXIO; 2340 } 2341 2342 if (nvme_ctrlr->resetting) { 2343 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2344 return -EBUSY; 2345 } 2346 2347 if (!nvme_ctrlr->disabled) { 2348 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2349 return -EALREADY; 2350 } 2351 2352 nvme_ctrlr->disabled = false; 2353 nvme_ctrlr->resetting = true; 2354 2355 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2356 2357 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2358 2359 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2360 return 0; 2361 } 2362 2363 static void 2364 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2365 { 2366 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2367 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2368 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2369 enum bdev_nvme_op_after_reset op_after_disable; 2370 2371 assert(nvme_ctrlr->thread == spdk_get_thread()); 2372 2373 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2374 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2375 2376 pthread_mutex_lock(&nvme_ctrlr->mutex); 2377 2378 
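	/* Clear the reset state and mark the controller disabled while holding the lock. */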
nvme_ctrlr->resetting = false; 2379 nvme_ctrlr->dont_retry = false; 2380 2381 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2382 2383 nvme_ctrlr->disabled = true; 2384 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2385 2386 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2387 2388 if (ctrlr_op_cb_fn) { 2389 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2390 } 2391 2392 switch (op_after_disable) { 2393 case OP_COMPLETE_PENDING_DESTRUCT: 2394 nvme_ctrlr_unregister(nvme_ctrlr); 2395 break; 2396 default: 2397 break; 2398 } 2399 2400 } 2401 2402 static void 2403 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2404 { 2405 /* Make sure we clear any pending resets before returning. */ 2406 spdk_for_each_channel(nvme_ctrlr, 2407 bdev_nvme_complete_pending_resets, 2408 NULL, 2409 _bdev_nvme_disable_ctrlr_complete); 2410 } 2411 2412 static void 2413 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2414 { 2415 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2416 2417 assert(status == 0); 2418 2419 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2420 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2421 } else { 2422 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2423 } 2424 } 2425 2426 static void 2427 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2428 { 2429 spdk_for_each_channel(nvme_ctrlr, 2430 bdev_nvme_reset_destroy_qpair, 2431 NULL, 2432 bdev_nvme_disable_destroy_qpairs_done); 2433 } 2434 2435 static void 2436 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2437 { 2438 struct nvme_ctrlr *nvme_ctrlr = ctx; 2439 2440 assert(nvme_ctrlr->resetting == true); 2441 assert(nvme_ctrlr->thread == spdk_get_thread()); 2442 2443 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2444 2445 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2446 } 2447 2448 static void 2449 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2450 { 2451 struct nvme_ctrlr *nvme_ctrlr = ctx; 2452 2453 assert(nvme_ctrlr->resetting == true); 2454 assert(nvme_ctrlr->thread == spdk_get_thread()); 2455 2456 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2457 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2458 } else { 2459 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2460 } 2461 } 2462 2463 static int 2464 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2465 { 2466 spdk_msg_fn msg_fn; 2467 2468 pthread_mutex_lock(&nvme_ctrlr->mutex); 2469 if (nvme_ctrlr->destruct) { 2470 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2471 return -ENXIO; 2472 } 2473 2474 if (nvme_ctrlr->resetting) { 2475 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2476 return -EBUSY; 2477 } 2478 2479 if (nvme_ctrlr->disabled) { 2480 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2481 return -EALREADY; 2482 } 2483 2484 nvme_ctrlr->resetting = true; 2485 nvme_ctrlr->dont_retry = true; 2486 2487 if (nvme_ctrlr->reconnect_is_delayed) { 2488 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2489 nvme_ctrlr->reconnect_is_delayed = false; 2490 } else { 2491 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2492 } 2493 2494 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2495 2496 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2497 2498 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2499 return 0; 2500 } 2501 2502 static int 2503 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2504 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2505 { 2506 int rc; 2507 2508 switch (op) { 2509 case 
NVME_CTRLR_OP_RESET: 2510 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2511 break; 2512 case NVME_CTRLR_OP_ENABLE: 2513 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2514 break; 2515 case NVME_CTRLR_OP_DISABLE: 2516 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2517 break; 2518 default: 2519 rc = -EINVAL; 2520 break; 2521 } 2522 2523 if (rc == 0) { 2524 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2525 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2526 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2527 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2528 } 2529 return rc; 2530 } 2531 2532 struct nvme_ctrlr_op_rpc_ctx { 2533 struct nvme_ctrlr *nvme_ctrlr; 2534 struct spdk_thread *orig_thread; 2535 enum nvme_ctrlr_op op; 2536 int rc; 2537 bdev_nvme_ctrlr_op_cb cb_fn; 2538 void *cb_arg; 2539 }; 2540 2541 static void 2542 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2543 { 2544 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2545 2546 assert(ctx != NULL); 2547 assert(ctx->cb_fn != NULL); 2548 2549 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2550 2551 free(ctx); 2552 } 2553 2554 static void 2555 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2556 { 2557 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2558 2559 ctx->rc = rc; 2560 2561 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2562 } 2563 2564 void 2565 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2566 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2567 { 2568 struct nvme_ctrlr_op_rpc_ctx *ctx; 2569 int rc; 2570 2571 assert(cb_fn != NULL); 2572 2573 ctx = calloc(1, sizeof(*ctx)); 2574 if (ctx == NULL) { 2575 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2576 cb_fn(cb_arg, -ENOMEM); 2577 return; 2578 } 2579 2580 ctx->orig_thread = spdk_get_thread(); 2581 ctx->cb_fn = cb_fn; 2582 ctx->cb_arg = cb_arg; 2583 2584 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2585 if (rc == 0) { 2586 return; 2587 } else if (rc == -EALREADY) { 2588 rc = 0; 2589 } 2590 2591 nvme_ctrlr_op_rpc_complete(ctx, rc); 2592 } 2593 2594 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2595 2596 static void 2597 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2598 { 2599 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2600 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2601 int rc; 2602 2603 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2604 ctx->nvme_ctrlr = NULL; 2605 2606 if (ctx->rc != 0) { 2607 goto complete; 2608 } 2609 2610 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2611 if (next_nvme_ctrlr == NULL) { 2612 goto complete; 2613 } 2614 2615 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2616 if (rc == 0) { 2617 ctx->nvme_ctrlr = next_nvme_ctrlr; 2618 return; 2619 } else if (rc == -EALREADY) { 2620 ctx->nvme_ctrlr = next_nvme_ctrlr; 2621 rc = 0; 2622 } 2623 2624 ctx->rc = rc; 2625 2626 complete: 2627 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2628 free(ctx); 2629 } 2630 2631 static void 2632 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2633 { 2634 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2635 2636 ctx->rc = rc; 2637 2638 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2639 } 2640 2641 void 2642 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2643 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2644 { 2645 struct nvme_ctrlr_op_rpc_ctx *ctx; 2646 struct nvme_ctrlr *nvme_ctrlr; 2647 int rc; 2648 2649 assert(cb_fn != NULL); 2650 2651 ctx = calloc(1, sizeof(*ctx)); 2652 if (ctx == NULL) { 2653 SPDK_ERRLOG("Failed to allocate 
nvme_ctrlr_op_rpc_ctx.\n"); 2654 cb_fn(cb_arg, -ENOMEM); 2655 return; 2656 } 2657 2658 ctx->orig_thread = spdk_get_thread(); 2659 ctx->op = op; 2660 ctx->cb_fn = cb_fn; 2661 ctx->cb_arg = cb_arg; 2662 2663 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2664 assert(nvme_ctrlr != NULL); 2665 2666 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2667 if (rc == 0) { 2668 ctx->nvme_ctrlr = nvme_ctrlr; 2669 return; 2670 } else if (rc == -EALREADY) { 2671 ctx->nvme_ctrlr = nvme_ctrlr; 2672 rc = 0; 2673 } 2674 2675 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2676 } 2677 2678 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2679 2680 static void 2681 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2682 { 2683 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2684 enum spdk_bdev_io_status io_status; 2685 2686 if (bio->cpl.cdw0 == 0) { 2687 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2688 } else { 2689 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2690 } 2691 2692 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2693 } 2694 2695 static void 2696 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2697 { 2698 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2699 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2700 2701 bdev_nvme_abort_retry_ios(nbdev_ch); 2702 2703 spdk_for_each_channel_continue(i, 0); 2704 } 2705 2706 static void 2707 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2708 { 2709 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2710 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2711 2712 /* Abort all queued I/Os for retry. */ 2713 spdk_for_each_channel(nbdev, 2714 bdev_nvme_abort_bdev_channel, 2715 bio, 2716 _bdev_nvme_reset_io_complete); 2717 } 2718 2719 static void 2720 _bdev_nvme_reset_io_continue(void *ctx) 2721 { 2722 struct nvme_bdev_io *bio = ctx; 2723 struct nvme_io_path *prev_io_path, *next_io_path; 2724 int rc; 2725 2726 prev_io_path = bio->io_path; 2727 bio->io_path = NULL; 2728 2729 if (bio->cpl.cdw0 != 0) { 2730 goto complete; 2731 } 2732 2733 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2734 if (next_io_path == NULL) { 2735 goto complete; 2736 } 2737 2738 rc = _bdev_nvme_reset_io(next_io_path, bio); 2739 if (rc == 0) { 2740 return; 2741 } 2742 2743 bio->cpl.cdw0 = 1; 2744 2745 complete: 2746 bdev_nvme_reset_io_complete(bio); 2747 } 2748 2749 static void 2750 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2751 { 2752 struct nvme_bdev_io *bio = cb_arg; 2753 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2754 2755 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2756 2757 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2758 } 2759 2760 static int 2761 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2762 { 2763 struct nvme_ctrlr_channel *ctrlr_ch; 2764 int rc; 2765 2766 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2767 bdev_nvme_reset_io_continue, bio); 2768 if (rc == 0) { 2769 assert(bio->io_path == NULL); 2770 bio->io_path = io_path; 2771 } else if (rc == -EBUSY) { 2772 ctrlr_ch = io_path->qpair->ctrlr_ch; 2773 assert(ctrlr_ch != NULL); 2774 /* 2775 * Reset call is queued only if it is from the app framework. This is on purpose so that 2776 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2777 * upper level. 
If they are in the middle of a reset, we won't try to schedule another one. 2778 */ 2779 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2780 rc = 0; 2781 } 2782 2783 return rc; 2784 } 2785 2786 static void 2787 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2788 { 2789 struct nvme_io_path *io_path; 2790 int rc; 2791 2792 bio->cpl.cdw0 = 0; 2793 2794 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2795 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2796 assert(io_path != NULL); 2797 2798 rc = _bdev_nvme_reset_io(io_path, bio); 2799 if (rc != 0) { 2800 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2801 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2802 } 2803 } 2804 2805 static int 2806 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2807 { 2808 if (nvme_ctrlr->destruct) { 2809 /* Don't bother resetting if the controller is in the process of being destructed. */ 2810 return -ENXIO; 2811 } 2812 2813 if (nvme_ctrlr->resetting) { 2814 if (!nvme_ctrlr->in_failover) { 2815 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2816 2817 /* Defer failover until reset completes. */ 2818 nvme_ctrlr->pending_failover = true; 2819 return -EINPROGRESS; 2820 } else { 2821 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2822 return -EBUSY; 2823 } 2824 } 2825 2826 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2827 2828 if (nvme_ctrlr->reconnect_is_delayed) { 2829 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2830 2831 /* We rely on the next reconnect for the failover. */ 2832 return -EALREADY; 2833 } 2834 2835 if (nvme_ctrlr->disabled) { 2836 SPDK_NOTICELOG("Controller is disabled.\n"); 2837 2838 /* We rely on the enablement for the failover. 
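		 * Any alternate trid was already recorded as the active path by
		 * bdev_nvme_failover_trid() above, so it will be used once the controller
		 * is enabled.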
*/ 2839 return -EALREADY; 2840 } 2841 2842 nvme_ctrlr->resetting = true; 2843 nvme_ctrlr->in_failover = true; 2844 2845 assert(nvme_ctrlr->reset_start_tsc == 0); 2846 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2847 2848 return 0; 2849 } 2850 2851 static int 2852 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2853 { 2854 int rc; 2855 2856 pthread_mutex_lock(&nvme_ctrlr->mutex); 2857 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2858 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2859 2860 if (rc == 0) { 2861 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2862 } else if (rc == -EALREADY) { 2863 rc = 0; 2864 } 2865 2866 return rc; 2867 } 2868 2869 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2870 uint64_t num_blocks); 2871 2872 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2873 uint64_t num_blocks); 2874 2875 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2876 uint64_t src_offset_blocks, 2877 uint64_t num_blocks); 2878 2879 static void 2880 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2881 bool success) 2882 { 2883 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2884 int ret; 2885 2886 if (!success) { 2887 ret = -EINVAL; 2888 goto exit; 2889 } 2890 2891 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2892 ret = -ENXIO; 2893 goto exit; 2894 } 2895 2896 ret = bdev_nvme_readv(bio, 2897 bdev_io->u.bdev.iovs, 2898 bdev_io->u.bdev.iovcnt, 2899 bdev_io->u.bdev.md_buf, 2900 bdev_io->u.bdev.num_blocks, 2901 bdev_io->u.bdev.offset_blocks, 2902 bdev_io->u.bdev.dif_check_flags, 2903 bdev_io->u.bdev.memory_domain, 2904 bdev_io->u.bdev.memory_domain_ctx, 2905 bdev_io->u.bdev.accel_sequence); 2906 2907 exit: 2908 if (spdk_unlikely(ret != 0)) { 2909 bdev_nvme_io_complete(bio, ret); 2910 } 2911 } 2912 2913 static inline void 2914 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2915 { 2916 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2917 struct spdk_bdev *bdev = bdev_io->bdev; 2918 struct nvme_bdev_io *nbdev_io_to_abort; 2919 int rc = 0; 2920 2921 switch (bdev_io->type) { 2922 case SPDK_BDEV_IO_TYPE_READ: 2923 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2924 2925 rc = bdev_nvme_readv(nbdev_io, 2926 bdev_io->u.bdev.iovs, 2927 bdev_io->u.bdev.iovcnt, 2928 bdev_io->u.bdev.md_buf, 2929 bdev_io->u.bdev.num_blocks, 2930 bdev_io->u.bdev.offset_blocks, 2931 bdev_io->u.bdev.dif_check_flags, 2932 bdev_io->u.bdev.memory_domain, 2933 bdev_io->u.bdev.memory_domain_ctx, 2934 bdev_io->u.bdev.accel_sequence); 2935 } else { 2936 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2937 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2938 rc = 0; 2939 } 2940 break; 2941 case SPDK_BDEV_IO_TYPE_WRITE: 2942 rc = bdev_nvme_writev(nbdev_io, 2943 bdev_io->u.bdev.iovs, 2944 bdev_io->u.bdev.iovcnt, 2945 bdev_io->u.bdev.md_buf, 2946 bdev_io->u.bdev.num_blocks, 2947 bdev_io->u.bdev.offset_blocks, 2948 bdev_io->u.bdev.dif_check_flags, 2949 bdev_io->u.bdev.memory_domain, 2950 bdev_io->u.bdev.memory_domain_ctx, 2951 bdev_io->u.bdev.accel_sequence); 2952 break; 2953 case SPDK_BDEV_IO_TYPE_COMPARE: 2954 rc = bdev_nvme_comparev(nbdev_io, 2955 bdev_io->u.bdev.iovs, 2956 bdev_io->u.bdev.iovcnt, 2957 bdev_io->u.bdev.md_buf, 2958 bdev_io->u.bdev.num_blocks, 2959 bdev_io->u.bdev.offset_blocks, 2960 bdev_io->u.bdev.dif_check_flags); 
2961 break; 2962 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2963 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2964 bdev_io->u.bdev.iovs, 2965 bdev_io->u.bdev.iovcnt, 2966 bdev_io->u.bdev.fused_iovs, 2967 bdev_io->u.bdev.fused_iovcnt, 2968 bdev_io->u.bdev.md_buf, 2969 bdev_io->u.bdev.num_blocks, 2970 bdev_io->u.bdev.offset_blocks, 2971 bdev_io->u.bdev.dif_check_flags); 2972 break; 2973 case SPDK_BDEV_IO_TYPE_UNMAP: 2974 rc = bdev_nvme_unmap(nbdev_io, 2975 bdev_io->u.bdev.offset_blocks, 2976 bdev_io->u.bdev.num_blocks); 2977 break; 2978 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2979 rc = bdev_nvme_write_zeroes(nbdev_io, 2980 bdev_io->u.bdev.offset_blocks, 2981 bdev_io->u.bdev.num_blocks); 2982 break; 2983 case SPDK_BDEV_IO_TYPE_RESET: 2984 nbdev_io->io_path = NULL; 2985 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2986 return; 2987 2988 case SPDK_BDEV_IO_TYPE_FLUSH: 2989 bdev_nvme_io_complete(nbdev_io, 0); 2990 return; 2991 2992 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2993 rc = bdev_nvme_zone_appendv(nbdev_io, 2994 bdev_io->u.bdev.iovs, 2995 bdev_io->u.bdev.iovcnt, 2996 bdev_io->u.bdev.md_buf, 2997 bdev_io->u.bdev.num_blocks, 2998 bdev_io->u.bdev.offset_blocks, 2999 bdev_io->u.bdev.dif_check_flags); 3000 break; 3001 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3002 rc = bdev_nvme_get_zone_info(nbdev_io, 3003 bdev_io->u.zone_mgmt.zone_id, 3004 bdev_io->u.zone_mgmt.num_zones, 3005 bdev_io->u.zone_mgmt.buf); 3006 break; 3007 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3008 rc = bdev_nvme_zone_management(nbdev_io, 3009 bdev_io->u.zone_mgmt.zone_id, 3010 bdev_io->u.zone_mgmt.zone_action); 3011 break; 3012 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3013 nbdev_io->io_path = NULL; 3014 bdev_nvme_admin_passthru(nbdev_ch, 3015 nbdev_io, 3016 &bdev_io->u.nvme_passthru.cmd, 3017 bdev_io->u.nvme_passthru.buf, 3018 bdev_io->u.nvme_passthru.nbytes); 3019 return; 3020 3021 case SPDK_BDEV_IO_TYPE_NVME_IO: 3022 rc = bdev_nvme_io_passthru(nbdev_io, 3023 &bdev_io->u.nvme_passthru.cmd, 3024 bdev_io->u.nvme_passthru.buf, 3025 bdev_io->u.nvme_passthru.nbytes); 3026 break; 3027 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3028 rc = bdev_nvme_io_passthru_md(nbdev_io, 3029 &bdev_io->u.nvme_passthru.cmd, 3030 bdev_io->u.nvme_passthru.buf, 3031 bdev_io->u.nvme_passthru.nbytes, 3032 bdev_io->u.nvme_passthru.md_buf, 3033 bdev_io->u.nvme_passthru.md_len); 3034 break; 3035 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3036 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3037 &bdev_io->u.nvme_passthru.cmd, 3038 bdev_io->u.nvme_passthru.iovs, 3039 bdev_io->u.nvme_passthru.iovcnt, 3040 bdev_io->u.nvme_passthru.nbytes, 3041 bdev_io->u.nvme_passthru.md_buf, 3042 bdev_io->u.nvme_passthru.md_len); 3043 break; 3044 case SPDK_BDEV_IO_TYPE_ABORT: 3045 nbdev_io->io_path = NULL; 3046 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3047 bdev_nvme_abort(nbdev_ch, 3048 nbdev_io, 3049 nbdev_io_to_abort); 3050 return; 3051 3052 case SPDK_BDEV_IO_TYPE_COPY: 3053 rc = bdev_nvme_copy(nbdev_io, 3054 bdev_io->u.bdev.offset_blocks, 3055 bdev_io->u.bdev.copy.src_offset_blocks, 3056 bdev_io->u.bdev.num_blocks); 3057 break; 3058 default: 3059 rc = -EINVAL; 3060 break; 3061 } 3062 3063 if (spdk_unlikely(rc != 0)) { 3064 bdev_nvme_io_complete(nbdev_io, rc); 3065 } 3066 } 3067 3068 static void 3069 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3070 { 3071 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3072 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3073 3074 if 
(spdk_likely(nbdev_io->submit_tsc == 0)) { 3075 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3076 } else { 3077 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3078 * We need to update submit_tsc here. 3079 */ 3080 nbdev_io->submit_tsc = spdk_get_ticks(); 3081 } 3082 3083 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3084 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3085 if (spdk_unlikely(!nbdev_io->io_path)) { 3086 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3087 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3088 return; 3089 } 3090 3091 /* Admin commands do not use the optimal I/O path. 3092 * Simply fall through even if it is not found. 3093 */ 3094 } 3095 3096 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3097 } 3098 3099 static bool 3100 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3101 { 3102 struct nvme_bdev *nbdev = ctx; 3103 struct nvme_ns *nvme_ns; 3104 struct spdk_nvme_ns *ns; 3105 struct spdk_nvme_ctrlr *ctrlr; 3106 const struct spdk_nvme_ctrlr_data *cdata; 3107 3108 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3109 assert(nvme_ns != NULL); 3110 ns = nvme_ns->ns; 3111 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3112 3113 switch (io_type) { 3114 case SPDK_BDEV_IO_TYPE_READ: 3115 case SPDK_BDEV_IO_TYPE_WRITE: 3116 case SPDK_BDEV_IO_TYPE_RESET: 3117 case SPDK_BDEV_IO_TYPE_FLUSH: 3118 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3119 case SPDK_BDEV_IO_TYPE_NVME_IO: 3120 case SPDK_BDEV_IO_TYPE_ABORT: 3121 return true; 3122 3123 case SPDK_BDEV_IO_TYPE_COMPARE: 3124 return spdk_nvme_ns_supports_compare(ns); 3125 3126 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3127 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3128 3129 case SPDK_BDEV_IO_TYPE_UNMAP: 3130 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3131 return cdata->oncs.dsm; 3132 3133 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3134 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3135 return cdata->oncs.write_zeroes; 3136 3137 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3138 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3139 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3140 return true; 3141 } 3142 return false; 3143 3144 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3145 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3146 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3147 3148 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3149 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3150 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3151 3152 case SPDK_BDEV_IO_TYPE_COPY: 3153 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3154 return cdata->oncs.copy; 3155 3156 default: 3157 return false; 3158 } 3159 } 3160 3161 static int 3162 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3163 { 3164 struct nvme_qpair *nvme_qpair; 3165 struct spdk_io_channel *pg_ch; 3166 int rc; 3167 3168 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3169 if (!nvme_qpair) { 3170 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3171 return -1; 3172 } 3173 3174 TAILQ_INIT(&nvme_qpair->io_path_list); 3175 3176 nvme_qpair->ctrlr = nvme_ctrlr; 3177 nvme_qpair->ctrlr_ch = ctrlr_ch; 3178 3179 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3180 if (!pg_ch) { 3181 free(nvme_qpair); 3182 return -1; 3183 } 3184 3185 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3186 3187 #ifdef SPDK_CONFIG_VTUNE 3188 nvme_qpair->group->collect_spin_stat = true; 3189 #else 3190 nvme_qpair->group->collect_spin_stat = false; 3191 #endif 3192 3193 if (!nvme_ctrlr->disabled) { 3194 /* If 
a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3195 * be created when it's enabled. 3196 */ 3197 rc = bdev_nvme_create_qpair(nvme_qpair); 3198 if (rc != 0) { 3199 /* nvme_ctrlr can't create IO qpair if connection is down. 3200 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3201 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3202 * submitted IO will be queued until IO qpair is successfully created. 3203 * 3204 * Hence, if both are satisfied, ignore the failure. 3205 */ 3206 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3207 spdk_put_io_channel(pg_ch); 3208 free(nvme_qpair); 3209 return rc; 3210 } 3211 } 3212 } 3213 3214 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3215 3216 ctrlr_ch->qpair = nvme_qpair; 3217 3218 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3219 nvme_qpair->ctrlr->ref++; 3220 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3221 3222 return 0; 3223 } 3224 3225 static int 3226 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3227 { 3228 struct nvme_ctrlr *nvme_ctrlr = io_device; 3229 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3230 3231 TAILQ_INIT(&ctrlr_ch->pending_resets); 3232 3233 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3234 } 3235 3236 static void 3237 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3238 { 3239 struct nvme_io_path *io_path, *next; 3240 3241 assert(nvme_qpair->group != NULL); 3242 3243 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3244 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3245 nvme_io_path_free(io_path); 3246 } 3247 3248 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3249 3250 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3251 3252 nvme_ctrlr_release(nvme_qpair->ctrlr); 3253 3254 free(nvme_qpair); 3255 } 3256 3257 static void 3258 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3259 { 3260 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3261 struct nvme_qpair *nvme_qpair; 3262 3263 nvme_qpair = ctrlr_ch->qpair; 3264 assert(nvme_qpair != NULL); 3265 3266 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3267 3268 if (nvme_qpair->qpair != NULL) { 3269 if (ctrlr_ch->reset_iter == NULL) { 3270 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3271 } else { 3272 /* Skip current ctrlr_channel in a full reset sequence because 3273 * it is being deleted now. The qpair is already being disconnected. 3274 * We do not have to restart disconnecting it. 3275 */ 3276 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3277 } 3278 3279 /* We cannot release a reference to the poll group now. 3280 * The qpair may be disconnected asynchronously later. 3281 * We need to poll it until it is actually disconnected. 3282 * Just detach the qpair from the deleting ctrlr_channel. 
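		 * The poll group keeps polling the detached qpair so that it can be deleted
		 * once the disconnect has actually completed.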
3283 */ 3284 nvme_qpair->ctrlr_ch = NULL; 3285 } else { 3286 assert(ctrlr_ch->reset_iter == NULL); 3287 3288 nvme_qpair_delete(nvme_qpair); 3289 } 3290 } 3291 3292 static inline struct spdk_io_channel * 3293 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3294 { 3295 if (spdk_unlikely(!group->accel_channel)) { 3296 group->accel_channel = spdk_accel_get_io_channel(); 3297 if (!group->accel_channel) { 3298 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3299 group); 3300 return NULL; 3301 } 3302 } 3303 3304 return group->accel_channel; 3305 } 3306 3307 static void 3308 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3309 uint32_t iov_cnt, uint32_t seed, 3310 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3311 { 3312 struct spdk_io_channel *accel_ch; 3313 struct nvme_poll_group *group = ctx; 3314 int rc; 3315 3316 assert(cb_fn != NULL); 3317 3318 accel_ch = bdev_nvme_get_accel_channel(group); 3319 if (spdk_unlikely(accel_ch == NULL)) { 3320 cb_fn(cb_arg, -ENOMEM); 3321 return; 3322 } 3323 3324 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3325 if (rc) { 3326 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3327 if (rc == -ENOMEM || rc == -EINVAL) { 3328 cb_fn(cb_arg, rc); 3329 } 3330 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3331 } 3332 } 3333 3334 static void 3335 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3336 { 3337 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3338 } 3339 3340 static void 3341 bdev_nvme_abort_sequence(void *seq) 3342 { 3343 spdk_accel_sequence_abort(seq); 3344 } 3345 3346 static void 3347 bdev_nvme_reverse_sequence(void *seq) 3348 { 3349 spdk_accel_sequence_reverse(seq); 3350 } 3351 3352 static int 3353 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3354 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3355 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3356 { 3357 struct spdk_io_channel *ch; 3358 struct nvme_poll_group *group = ctx; 3359 3360 ch = bdev_nvme_get_accel_channel(group); 3361 if (spdk_unlikely(ch == NULL)) { 3362 return -ENOMEM; 3363 } 3364 3365 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3366 domain, domain_ctx, seed, cb_fn, cb_arg); 3367 } 3368 3369 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3370 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3371 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3372 .append_crc32c = bdev_nvme_append_crc32c, 3373 .finish_sequence = bdev_nvme_finish_sequence, 3374 .reverse_sequence = bdev_nvme_reverse_sequence, 3375 .abort_sequence = bdev_nvme_abort_sequence, 3376 }; 3377 3378 static int 3379 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3380 { 3381 struct nvme_poll_group *group = ctx_buf; 3382 3383 TAILQ_INIT(&group->qpair_list); 3384 3385 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3386 if (group->group == NULL) { 3387 return -1; 3388 } 3389 3390 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3391 3392 if (group->poller == NULL) { 3393 spdk_nvme_poll_group_destroy(group->group); 3394 return -1; 3395 } 3396 3397 return 0; 3398 } 3399 3400 static void 3401 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3402 { 3403 struct 
nvme_poll_group *group = ctx_buf; 3404 3405 assert(TAILQ_EMPTY(&group->qpair_list)); 3406 3407 if (group->accel_channel) { 3408 spdk_put_io_channel(group->accel_channel); 3409 } 3410 3411 spdk_poller_unregister(&group->poller); 3412 if (spdk_nvme_poll_group_destroy(group->group)) { 3413 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3414 assert(false); 3415 } 3416 } 3417 3418 static struct spdk_io_channel * 3419 bdev_nvme_get_io_channel(void *ctx) 3420 { 3421 struct nvme_bdev *nvme_bdev = ctx; 3422 3423 return spdk_get_io_channel(nvme_bdev); 3424 } 3425 3426 static void * 3427 bdev_nvme_get_module_ctx(void *ctx) 3428 { 3429 struct nvme_bdev *nvme_bdev = ctx; 3430 struct nvme_ns *nvme_ns; 3431 3432 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3433 return NULL; 3434 } 3435 3436 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3437 if (!nvme_ns) { 3438 return NULL; 3439 } 3440 3441 return nvme_ns->ns; 3442 } 3443 3444 static const char * 3445 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3446 { 3447 switch (ana_state) { 3448 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3449 return "optimized"; 3450 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3451 return "non_optimized"; 3452 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3453 return "inaccessible"; 3454 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3455 return "persistent_loss"; 3456 case SPDK_NVME_ANA_CHANGE_STATE: 3457 return "change"; 3458 default: 3459 return NULL; 3460 } 3461 } 3462 3463 static int 3464 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3465 { 3466 struct spdk_memory_domain **_domains = NULL; 3467 struct nvme_bdev *nbdev = ctx; 3468 struct nvme_ns *nvme_ns; 3469 int i = 0, _array_size = array_size; 3470 int rc = 0; 3471 3472 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3473 if (domains && array_size >= i) { 3474 _domains = &domains[i]; 3475 } else { 3476 _domains = NULL; 3477 } 3478 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3479 if (rc > 0) { 3480 i += rc; 3481 if (_array_size >= rc) { 3482 _array_size -= rc; 3483 } else { 3484 _array_size = 0; 3485 } 3486 } else if (rc < 0) { 3487 return rc; 3488 } 3489 } 3490 3491 return i; 3492 } 3493 3494 static const char * 3495 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3496 { 3497 if (nvme_ctrlr->destruct) { 3498 return "deleting"; 3499 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3500 return "failed"; 3501 } else if (nvme_ctrlr->resetting) { 3502 return "resetting"; 3503 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3504 return "reconnect_is_delayed"; 3505 } else if (nvme_ctrlr->disabled) { 3506 return "disabled"; 3507 } else { 3508 return "enabled"; 3509 } 3510 } 3511 3512 void 3513 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3514 { 3515 struct spdk_nvme_transport_id *trid; 3516 const struct spdk_nvme_ctrlr_opts *opts; 3517 const struct spdk_nvme_ctrlr_data *cdata; 3518 struct nvme_path_id *path_id; 3519 3520 spdk_json_write_object_begin(w); 3521 3522 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3523 3524 #ifdef SPDK_CONFIG_NVME_CUSE 3525 size_t cuse_name_size = 128; 3526 char cuse_name[cuse_name_size]; 3527 3528 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3529 if (rc == 0) { 3530 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3531 } 3532 #endif 3533 trid = &nvme_ctrlr->active_path_id->trid; 3534 
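	/* Dump the active trid first, then list any alternate trids available for failover. */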
spdk_json_write_named_object_begin(w, "trid"); 3535 nvme_bdev_dump_trid_json(trid, w); 3536 spdk_json_write_object_end(w); 3537 3538 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3539 if (path_id != NULL) { 3540 spdk_json_write_named_array_begin(w, "alternate_trids"); 3541 do { 3542 trid = &path_id->trid; 3543 spdk_json_write_object_begin(w); 3544 nvme_bdev_dump_trid_json(trid, w); 3545 spdk_json_write_object_end(w); 3546 3547 path_id = TAILQ_NEXT(path_id, link); 3548 } while (path_id != NULL); 3549 spdk_json_write_array_end(w); 3550 } 3551 3552 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3553 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3554 3555 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3556 spdk_json_write_named_object_begin(w, "host"); 3557 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3558 spdk_json_write_named_string(w, "addr", opts->src_addr); 3559 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3560 spdk_json_write_object_end(w); 3561 3562 spdk_json_write_object_end(w); 3563 } 3564 3565 static void 3566 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3567 struct nvme_ns *nvme_ns) 3568 { 3569 struct spdk_nvme_ns *ns; 3570 struct spdk_nvme_ctrlr *ctrlr; 3571 const struct spdk_nvme_ctrlr_data *cdata; 3572 const struct spdk_nvme_transport_id *trid; 3573 union spdk_nvme_vs_register vs; 3574 const struct spdk_nvme_ns_data *nsdata; 3575 char buf[128]; 3576 3577 ns = nvme_ns->ns; 3578 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3579 3580 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3581 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3582 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3583 3584 spdk_json_write_object_begin(w); 3585 3586 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3587 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3588 } 3589 3590 spdk_json_write_named_object_begin(w, "trid"); 3591 3592 nvme_bdev_dump_trid_json(trid, w); 3593 3594 spdk_json_write_object_end(w); 3595 3596 #ifdef SPDK_CONFIG_NVME_CUSE 3597 size_t cuse_name_size = 128; 3598 char cuse_name[cuse_name_size]; 3599 3600 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3601 cuse_name, &cuse_name_size); 3602 if (rc == 0) { 3603 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3604 } 3605 #endif 3606 3607 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3608 3609 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3610 3611 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3612 3613 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3614 spdk_str_trim(buf); 3615 spdk_json_write_named_string(w, "model_number", buf); 3616 3617 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3618 spdk_str_trim(buf); 3619 spdk_json_write_named_string(w, "serial_number", buf); 3620 3621 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3622 spdk_str_trim(buf); 3623 spdk_json_write_named_string(w, "firmware_revision", buf); 3624 3625 if (cdata->subnqn[0] != '\0') { 3626 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3627 } 3628 3629 spdk_json_write_named_object_begin(w, "oacs"); 3630 3631 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3632 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3633 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3634 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3635 3636 spdk_json_write_object_end(w); 3637 3638 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
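	/* cmic.ana_reporting indicates whether the controller supports Asymmetric
	 * Namespace Access (ANA) reporting.
	 */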
3639 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3640 3641 spdk_json_write_object_end(w); 3642 3643 spdk_json_write_named_object_begin(w, "vs"); 3644 3645 spdk_json_write_name(w, "nvme_version"); 3646 if (vs.bits.ter) { 3647 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3648 } else { 3649 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3650 } 3651 3652 spdk_json_write_object_end(w); 3653 3654 nsdata = spdk_nvme_ns_get_data(ns); 3655 3656 spdk_json_write_named_object_begin(w, "ns_data"); 3657 3658 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3659 3660 if (cdata->cmic.ana_reporting) { 3661 spdk_json_write_named_string(w, "ana_state", 3662 _nvme_ana_state_str(nvme_ns->ana_state)); 3663 } 3664 3665 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3666 3667 spdk_json_write_object_end(w); 3668 3669 if (cdata->oacs.security) { 3670 spdk_json_write_named_object_begin(w, "security"); 3671 3672 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3673 3674 spdk_json_write_object_end(w); 3675 } 3676 3677 spdk_json_write_object_end(w); 3678 } 3679 3680 static const char * 3681 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3682 { 3683 switch (nbdev->mp_policy) { 3684 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3685 return "active_passive"; 3686 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3687 return "active_active"; 3688 default: 3689 assert(false); 3690 return "invalid"; 3691 } 3692 } 3693 3694 static int 3695 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3696 { 3697 struct nvme_bdev *nvme_bdev = ctx; 3698 struct nvme_ns *nvme_ns; 3699 3700 pthread_mutex_lock(&nvme_bdev->mutex); 3701 spdk_json_write_named_array_begin(w, "nvme"); 3702 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3703 nvme_namespace_info_json(w, nvme_ns); 3704 } 3705 spdk_json_write_array_end(w); 3706 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3707 pthread_mutex_unlock(&nvme_bdev->mutex); 3708 3709 return 0; 3710 } 3711 3712 static void 3713 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3714 { 3715 /* No config per bdev needed */ 3716 } 3717 3718 static uint64_t 3719 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3720 { 3721 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3722 struct nvme_io_path *io_path; 3723 struct nvme_poll_group *group; 3724 uint64_t spin_time = 0; 3725 3726 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3727 group = io_path->qpair->group; 3728 3729 if (!group || !group->collect_spin_stat) { 3730 continue; 3731 } 3732 3733 if (group->end_ticks != 0) { 3734 group->spin_ticks += (group->end_ticks - group->start_ticks); 3735 group->end_ticks = 0; 3736 } 3737 3738 spin_time += group->spin_ticks; 3739 group->start_ticks = 0; 3740 group->spin_ticks = 0; 3741 } 3742 3743 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3744 } 3745 3746 static void 3747 bdev_nvme_reset_device_stat(void *ctx) 3748 { 3749 struct nvme_bdev *nbdev = ctx; 3750 3751 if (nbdev->err_stat != NULL) { 3752 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3753 } 3754 } 3755 3756 /* JSON string should be lowercases and underscore delimited string. 
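 * For example, a status string such as "INVALID FIELD" would be emitted as
 * "invalid_field".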
*/ 3757 static void 3758 bdev_nvme_format_nvme_status(char *dst, const char *src) 3759 { 3760 char tmp[256]; 3761 3762 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3763 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3764 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3765 spdk_strlwr(dst); 3766 } 3767 3768 static void 3769 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3770 { 3771 struct nvme_bdev *nbdev = ctx; 3772 struct spdk_nvme_status status = {}; 3773 uint16_t sct, sc; 3774 char status_json[256]; 3775 const char *status_str; 3776 3777 if (nbdev->err_stat == NULL) { 3778 return; 3779 } 3780 3781 spdk_json_write_named_object_begin(w, "nvme_error"); 3782 3783 spdk_json_write_named_object_begin(w, "status_type"); 3784 for (sct = 0; sct < 8; sct++) { 3785 if (nbdev->err_stat->status_type[sct] == 0) { 3786 continue; 3787 } 3788 status.sct = sct; 3789 3790 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3791 assert(status_str != NULL); 3792 bdev_nvme_format_nvme_status(status_json, status_str); 3793 3794 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3795 } 3796 spdk_json_write_object_end(w); 3797 3798 spdk_json_write_named_object_begin(w, "status_code"); 3799 for (sct = 0; sct < 4; sct++) { 3800 status.sct = sct; 3801 for (sc = 0; sc < 256; sc++) { 3802 if (nbdev->err_stat->status[sct][sc] == 0) { 3803 continue; 3804 } 3805 status.sc = sc; 3806 3807 status_str = spdk_nvme_cpl_get_status_string(&status); 3808 assert(status_str != NULL); 3809 bdev_nvme_format_nvme_status(status_json, status_str); 3810 3811 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3812 } 3813 } 3814 spdk_json_write_object_end(w); 3815 3816 spdk_json_write_object_end(w); 3817 } 3818 3819 static bool 3820 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3821 { 3822 struct nvme_bdev *nbdev = ctx; 3823 struct spdk_nvme_ctrlr *ctrlr; 3824 3825 if (!g_opts.allow_accel_sequence) { 3826 return false; 3827 } 3828 3829 switch (type) { 3830 case SPDK_BDEV_IO_TYPE_WRITE: 3831 case SPDK_BDEV_IO_TYPE_READ: 3832 break; 3833 default: 3834 return false; 3835 } 3836 3837 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3838 assert(ctrlr != NULL); 3839 3840 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3841 } 3842 3843 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3844 .destruct = bdev_nvme_destruct, 3845 .submit_request = bdev_nvme_submit_request, 3846 .io_type_supported = bdev_nvme_io_type_supported, 3847 .get_io_channel = bdev_nvme_get_io_channel, 3848 .dump_info_json = bdev_nvme_dump_info_json, 3849 .write_config_json = bdev_nvme_write_config_json, 3850 .get_spin_time = bdev_nvme_get_spin_time, 3851 .get_module_ctx = bdev_nvme_get_module_ctx, 3852 .get_memory_domains = bdev_nvme_get_memory_domains, 3853 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3854 .reset_device_stat = bdev_nvme_reset_device_stat, 3855 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3856 }; 3857 3858 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3859 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3860 3861 static int 3862 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3863 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3864 { 3865 struct spdk_nvme_ana_group_descriptor *copied_desc; 3866 uint8_t *orig_desc; 3867 uint32_t i, desc_size, copy_len; 3868 int rc = 0; 3869 3870 if (nvme_ctrlr->ana_log_page == NULL) { 3871 return 
-EINVAL; 3872 } 3873 3874 copied_desc = nvme_ctrlr->copied_ana_desc; 3875 3876 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3877 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3878 3879 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3880 memcpy(copied_desc, orig_desc, copy_len); 3881 3882 rc = cb_fn(copied_desc, cb_arg); 3883 if (rc != 0) { 3884 break; 3885 } 3886 3887 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3888 copied_desc->num_of_nsid * sizeof(uint32_t); 3889 orig_desc += desc_size; 3890 copy_len -= desc_size; 3891 } 3892 3893 return rc; 3894 } 3895 3896 static int 3897 nvme_ns_ana_transition_timedout(void *ctx) 3898 { 3899 struct nvme_ns *nvme_ns = ctx; 3900 3901 spdk_poller_unregister(&nvme_ns->anatt_timer); 3902 nvme_ns->ana_transition_timedout = true; 3903 3904 return SPDK_POLLER_BUSY; 3905 } 3906 3907 static void 3908 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3909 const struct spdk_nvme_ana_group_descriptor *desc) 3910 { 3911 const struct spdk_nvme_ctrlr_data *cdata; 3912 3913 nvme_ns->ana_group_id = desc->ana_group_id; 3914 nvme_ns->ana_state = desc->ana_state; 3915 nvme_ns->ana_state_updating = false; 3916 3917 switch (nvme_ns->ana_state) { 3918 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3919 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3920 nvme_ns->ana_transition_timedout = false; 3921 spdk_poller_unregister(&nvme_ns->anatt_timer); 3922 break; 3923 3924 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3925 case SPDK_NVME_ANA_CHANGE_STATE: 3926 if (nvme_ns->anatt_timer != NULL) { 3927 break; 3928 } 3929 3930 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3931 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3932 nvme_ns, 3933 cdata->anatt * SPDK_SEC_TO_USEC); 3934 break; 3935 default: 3936 break; 3937 } 3938 } 3939 3940 static int 3941 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3942 { 3943 struct nvme_ns *nvme_ns = cb_arg; 3944 uint32_t i; 3945 3946 for (i = 0; i < desc->num_of_nsid; i++) { 3947 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3948 continue; 3949 } 3950 3951 _nvme_ns_set_ana_state(nvme_ns, desc); 3952 return 1; 3953 } 3954 3955 return 0; 3956 } 3957 3958 static struct spdk_uuid 3959 nvme_generate_uuid(const char *sn, uint32_t nsid) 3960 { 3961 struct spdk_uuid new_uuid, namespace_uuid; 3962 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3963 /* This namespace UUID was generated using uuid_generate() method. 
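 * The bdev UUID below is then derived from it with spdk_uuid_generate_sha1() over the merged "<serial><nsid>" string (e.g. a hypothetical serial "ABC" and NSID 1 hash the string "ABC1"), so the same serial number and NSID always yield the same UUID across restarts.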
*/ 3964 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3965 int size; 3966 3967 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3968 3969 spdk_uuid_set_null(&new_uuid); 3970 spdk_uuid_set_null(&namespace_uuid); 3971 3972 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3973 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3974 3975 spdk_uuid_parse(&namespace_uuid, namespace_str); 3976 3977 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3978 3979 return new_uuid; 3980 } 3981 3982 static int 3983 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3984 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3985 uint32_t prchk_flags, void *ctx) 3986 { 3987 const struct spdk_uuid *uuid; 3988 const uint8_t *nguid; 3989 const struct spdk_nvme_ctrlr_data *cdata; 3990 const struct spdk_nvme_ns_data *nsdata; 3991 const struct spdk_nvme_ctrlr_opts *opts; 3992 enum spdk_nvme_csi csi; 3993 uint32_t atomic_bs, phys_bs, bs; 3994 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3995 3996 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3997 csi = spdk_nvme_ns_get_csi(ns); 3998 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 3999 4000 switch (csi) { 4001 case SPDK_NVME_CSI_NVM: 4002 disk->product_name = "NVMe disk"; 4003 break; 4004 case SPDK_NVME_CSI_ZNS: 4005 disk->product_name = "NVMe ZNS disk"; 4006 disk->zoned = true; 4007 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4008 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4009 spdk_nvme_ns_get_extended_sector_size(ns); 4010 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4011 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4012 break; 4013 default: 4014 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4015 return -ENOTSUP; 4016 } 4017 4018 nguid = spdk_nvme_ns_get_nguid(ns); 4019 if (!nguid) { 4020 uuid = spdk_nvme_ns_get_uuid(ns); 4021 if (uuid) { 4022 disk->uuid = *uuid; 4023 } else if (g_opts.generate_uuids) { 4024 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4025 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4026 } 4027 } else { 4028 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4029 } 4030 4031 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4032 if (!disk->name) { 4033 return -ENOMEM; 4034 } 4035 4036 disk->write_cache = 0; 4037 if (cdata->vwc.present) { 4038 /* Enable if the Volatile Write Cache exists */ 4039 disk->write_cache = 1; 4040 } 4041 if (cdata->oncs.write_zeroes) { 4042 disk->max_write_zeroes = UINT16_MAX + 1; 4043 } 4044 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4045 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4046 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4047 /* NVMe driver will split one request into multiple requests 4048 * based on MDTS and stripe boundary, the bdev layer will use 4049 * max_segment_size and max_num_segments to split one big IO 4050 * into multiple requests, then small request can't run out 4051 * of NVMe internal requests data structure. 4052 */ 4053 if (opts && opts->io_queue_requests) { 4054 disk->max_num_segments = opts->io_queue_requests / 2; 4055 } 4056 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4057 /* The nvme driver will try to split I/O that have too many 4058 * SGEs, but it doesn't work if that last SGE doesn't end on 4059 * an aggregate total that is block aligned. 
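(For example, with 512-byte SGEs and a 4 KiB block size, a split made purely on SGE count can land in the middle of a block.)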
The bdev layer has 4060 * a more robust splitting framework, so use that instead for 4061 * this case. (See issue #3269.) 4062 */ 4063 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4064 4065 if (disk->max_num_segments == 0) { 4066 disk->max_num_segments = max_sges; 4067 } else { 4068 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4069 } 4070 } 4071 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4072 4073 nsdata = spdk_nvme_ns_get_data(ns); 4074 bs = spdk_nvme_ns_get_sector_size(ns); 4075 atomic_bs = bs; 4076 phys_bs = bs; 4077 if (nsdata->nabo == 0) { 4078 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4079 atomic_bs = bs * (1 + nsdata->nawupf); 4080 } else { 4081 atomic_bs = bs * (1 + cdata->awupf); 4082 } 4083 } 4084 if (nsdata->nsfeat.optperf) { 4085 phys_bs = bs * (1 + nsdata->npwg); 4086 } 4087 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4088 4089 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4090 if (disk->md_len != 0) { 4091 disk->md_interleave = nsdata->flbas.extended; 4092 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4093 if (disk->dif_type != SPDK_DIF_DISABLE) { 4094 disk->dif_is_head_of_md = nsdata->dps.md_start; 4095 disk->dif_check_flags = prchk_flags; 4096 } 4097 } 4098 4099 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4100 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4101 disk->acwu = 0; 4102 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4103 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4104 } else { 4105 disk->acwu = cdata->acwu + 1; /* 0-based */ 4106 } 4107 4108 if (cdata->oncs.copy) { 4109 /* For now bdev interface allows only single segment copy */ 4110 disk->max_copy = nsdata->mssrl; 4111 } 4112 4113 disk->ctxt = ctx; 4114 disk->fn_table = &nvmelib_fn_table; 4115 disk->module = &nvme_if; 4116 4117 return 0; 4118 } 4119 4120 static struct nvme_bdev * 4121 nvme_bdev_alloc(void) 4122 { 4123 struct nvme_bdev *bdev; 4124 int rc; 4125 4126 bdev = calloc(1, sizeof(*bdev)); 4127 if (!bdev) { 4128 SPDK_ERRLOG("bdev calloc() failed\n"); 4129 return NULL; 4130 } 4131 4132 if (g_opts.nvme_error_stat) { 4133 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4134 if (!bdev->err_stat) { 4135 SPDK_ERRLOG("err_stat calloc() failed\n"); 4136 free(bdev); 4137 return NULL; 4138 } 4139 } 4140 4141 rc = pthread_mutex_init(&bdev->mutex, NULL); 4142 if (rc != 0) { 4143 free(bdev->err_stat); 4144 free(bdev); 4145 return NULL; 4146 } 4147 4148 bdev->ref = 1; 4149 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4150 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4151 bdev->rr_min_io = UINT32_MAX; 4152 TAILQ_INIT(&bdev->nvme_ns_list); 4153 4154 return bdev; 4155 } 4156 4157 static int 4158 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4159 { 4160 struct nvme_bdev *bdev; 4161 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4162 int rc; 4163 4164 bdev = nvme_bdev_alloc(); 4165 if (bdev == NULL) { 4166 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4167 return -ENOMEM; 4168 } 4169 4170 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4171 4172 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4173 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4174 if (rc != 0) { 4175 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4176 nvme_bdev_free(bdev); 4177 return rc; 4178 } 4179 4180 spdk_io_device_register(bdev, 4181 bdev_nvme_create_bdev_channel_cb, 4182 bdev_nvme_destroy_bdev_channel_cb, 4183 sizeof(struct 
nvme_bdev_channel), 4184 bdev->disk.name); 4185 4186 nvme_ns->bdev = bdev; 4187 bdev->nsid = nvme_ns->id; 4188 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4189 4190 bdev->nbdev_ctrlr = nbdev_ctrlr; 4191 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4192 4193 rc = spdk_bdev_register(&bdev->disk); 4194 if (rc != 0) { 4195 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4196 spdk_io_device_unregister(bdev, NULL); 4197 nvme_ns->bdev = NULL; 4198 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4199 nvme_bdev_free(bdev); 4200 return rc; 4201 } 4202 4203 return 0; 4204 } 4205 4206 static bool 4207 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4208 { 4209 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4210 const struct spdk_uuid *uuid1, *uuid2; 4211 4212 nsdata1 = spdk_nvme_ns_get_data(ns1); 4213 nsdata2 = spdk_nvme_ns_get_data(ns2); 4214 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4215 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4216 4217 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4218 nsdata1->eui64 == nsdata2->eui64 && 4219 ((uuid1 == NULL && uuid2 == NULL) || 4220 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4221 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4222 } 4223 4224 static bool 4225 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4226 struct spdk_nvme_ctrlr_opts *opts) 4227 { 4228 struct nvme_probe_skip_entry *entry; 4229 4230 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4231 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4232 return false; 4233 } 4234 } 4235 4236 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4237 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4238 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4239 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4240 opts->disable_read_ana_log_page = true; 4241 4242 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4243 4244 return true; 4245 } 4246 4247 static void 4248 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4249 { 4250 struct nvme_ctrlr *nvme_ctrlr = ctx; 4251 4252 if (spdk_nvme_cpl_is_error(cpl)) { 4253 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4254 cpl->status.sct); 4255 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4256 } else if (cpl->cdw0 & 0x1) { 4257 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4258 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4259 } 4260 } 4261 4262 static void 4263 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4264 struct spdk_nvme_qpair *qpair, uint16_t cid) 4265 { 4266 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4267 union spdk_nvme_csts_register csts; 4268 int rc; 4269 4270 assert(nvme_ctrlr->ctrlr == ctrlr); 4271 4272 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4273 4274 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4275 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4276 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4277 * completion recursively. 
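 * For fabrics admin command timeouts (non-PCIe, qpair == NULL) we therefore skip the CSTS read and fall through to the configured timeout action below.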
4278 */ 4279 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4280 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4281 if (csts.bits.cfs) { 4282 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4283 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4284 return; 4285 } 4286 } 4287 4288 switch (g_opts.action_on_timeout) { 4289 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4290 if (qpair) { 4291 /* Don't send abort to ctrlr when ctrlr is not available. */ 4292 pthread_mutex_lock(&nvme_ctrlr->mutex); 4293 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4294 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4295 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4296 return; 4297 } 4298 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4299 4300 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4301 nvme_abort_cpl, nvme_ctrlr); 4302 if (rc == 0) { 4303 return; 4304 } 4305 4306 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4307 } 4308 4309 /* FALLTHROUGH */ 4310 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4311 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4312 break; 4313 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4314 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4315 break; 4316 default: 4317 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4318 break; 4319 } 4320 } 4321 4322 static struct nvme_ns * 4323 nvme_ns_alloc(void) 4324 { 4325 struct nvme_ns *nvme_ns; 4326 4327 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4328 if (nvme_ns == NULL) { 4329 return NULL; 4330 } 4331 4332 if (g_opts.io_path_stat) { 4333 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4334 if (nvme_ns->stat == NULL) { 4335 free(nvme_ns); 4336 return NULL; 4337 } 4338 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4339 } 4340 4341 return nvme_ns; 4342 } 4343 4344 static void 4345 nvme_ns_free(struct nvme_ns *nvme_ns) 4346 { 4347 free(nvme_ns->stat); 4348 free(nvme_ns); 4349 } 4350 4351 static void 4352 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4353 { 4354 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4355 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4356 4357 if (rc == 0) { 4358 nvme_ns->probe_ctx = NULL; 4359 pthread_mutex_lock(&nvme_ctrlr->mutex); 4360 nvme_ctrlr->ref++; 4361 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4362 } else { 4363 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4364 nvme_ns_free(nvme_ns); 4365 } 4366 4367 if (ctx) { 4368 ctx->populates_in_progress--; 4369 if (ctx->populates_in_progress == 0) { 4370 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4371 } 4372 } 4373 } 4374 4375 static void 4376 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4377 { 4378 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4379 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4380 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4381 int rc; 4382 4383 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4384 if (rc != 0) { 4385 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4386 } 4387 4388 spdk_for_each_channel_continue(i, rc); 4389 } 4390 4391 static void 4392 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4393 { 4394 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4395 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4396 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4397 struct nvme_io_path *io_path; 4398 4399 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4400 if (io_path != NULL) { 4401 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4402 } 4403 4404 spdk_for_each_channel_continue(i, 0); 4405 } 4406 4407 static void 4408 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4409 { 4410 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4411 4412 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4413 } 4414 4415 static void 4416 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4417 { 4418 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4419 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4420 4421 if (status == 0) { 4422 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4423 } else { 4424 /* Delete the added io_paths and fail populating the namespace. */ 4425 spdk_for_each_channel(bdev, 4426 bdev_nvme_delete_io_path, 4427 nvme_ns, 4428 bdev_nvme_add_io_path_failed); 4429 } 4430 } 4431 4432 static int 4433 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4434 { 4435 struct nvme_ns *tmp_ns; 4436 const struct spdk_nvme_ns_data *nsdata; 4437 4438 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4439 if (!nsdata->nmic.can_share) { 4440 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4441 return -EINVAL; 4442 } 4443 4444 pthread_mutex_lock(&bdev->mutex); 4445 4446 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4447 assert(tmp_ns != NULL); 4448 4449 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4450 pthread_mutex_unlock(&bdev->mutex); 4451 SPDK_ERRLOG("Namespaces are not identical.\n"); 4452 return -EINVAL; 4453 } 4454 4455 bdev->ref++; 4456 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4457 nvme_ns->bdev = bdev; 4458 4459 pthread_mutex_unlock(&bdev->mutex); 4460 4461 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
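 * bdev_nvme_add_io_path() runs on each channel's thread; bdev_nvme_add_io_path_done() then reports the result through nvme_ctrlr_populate_namespace_done(), rolling back any paths that were added if one of the channels failed.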
*/ 4462 spdk_for_each_channel(bdev, 4463 bdev_nvme_add_io_path, 4464 nvme_ns, 4465 bdev_nvme_add_io_path_done); 4466 4467 return 0; 4468 } 4469 4470 static void 4471 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4472 { 4473 struct spdk_nvme_ns *ns; 4474 struct nvme_bdev *bdev; 4475 int rc = 0; 4476 4477 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4478 if (!ns) { 4479 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4480 rc = -EINVAL; 4481 goto done; 4482 } 4483 4484 nvme_ns->ns = ns; 4485 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4486 4487 if (nvme_ctrlr->ana_log_page != NULL) { 4488 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4489 } 4490 4491 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4492 if (bdev == NULL) { 4493 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4494 } else { 4495 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4496 if (rc == 0) { 4497 return; 4498 } 4499 } 4500 done: 4501 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4502 } 4503 4504 static void 4505 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4506 { 4507 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4508 4509 assert(nvme_ctrlr != NULL); 4510 4511 pthread_mutex_lock(&nvme_ctrlr->mutex); 4512 4513 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4514 4515 if (nvme_ns->bdev != NULL) { 4516 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4517 return; 4518 } 4519 4520 nvme_ns_free(nvme_ns); 4521 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4522 4523 nvme_ctrlr_release(nvme_ctrlr); 4524 } 4525 4526 static void 4527 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4528 { 4529 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4530 4531 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4532 } 4533 4534 static void 4535 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4536 { 4537 struct nvme_bdev *bdev; 4538 4539 spdk_poller_unregister(&nvme_ns->anatt_timer); 4540 4541 bdev = nvme_ns->bdev; 4542 if (bdev != NULL) { 4543 pthread_mutex_lock(&bdev->mutex); 4544 4545 assert(bdev->ref > 0); 4546 bdev->ref--; 4547 if (bdev->ref == 0) { 4548 pthread_mutex_unlock(&bdev->mutex); 4549 4550 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4551 } else { 4552 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4553 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4554 * and clear nvme_ns->bdev here. 4555 */ 4556 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4557 nvme_ns->bdev = NULL; 4558 4559 pthread_mutex_unlock(&bdev->mutex); 4560 4561 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4562 * we call depopulate_namespace_done() to avoid use-after-free. 4563 */ 4564 spdk_for_each_channel(bdev, 4565 bdev_nvme_delete_io_path, 4566 nvme_ns, 4567 bdev_nvme_delete_io_path_done); 4568 return; 4569 } 4570 } 4571 4572 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4573 } 4574 4575 static void 4576 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4577 struct nvme_async_probe_ctx *ctx) 4578 { 4579 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4580 struct nvme_ns *nvme_ns, *next; 4581 struct spdk_nvme_ns *ns; 4582 struct nvme_bdev *bdev; 4583 uint32_t nsid; 4584 int rc; 4585 uint64_t num_sectors; 4586 4587 if (ctx) { 4588 /* Initialize this count to 1 to handle the populate functions 4589 * calling nvme_ctrlr_populate_namespace_done() immediately. 
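 * The matching decrement is done after the scan loops below, so the done callback fires exactly once, and only after every namespace has at least been submitted for population.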
4590 */ 4591 ctx->populates_in_progress = 1; 4592 } 4593 4594 /* First loop over our existing namespaces and see if they have been 4595 * removed. */ 4596 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4597 while (nvme_ns != NULL) { 4598 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4599 4600 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4601 /* NS is still there but attributes may have changed */ 4602 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4603 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4604 bdev = nvme_ns->bdev; 4605 assert(bdev != NULL); 4606 if (bdev->disk.blockcnt != num_sectors) { 4607 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4608 nvme_ns->id, 4609 bdev->disk.name, 4610 bdev->disk.blockcnt, 4611 num_sectors); 4612 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4613 if (rc != 0) { 4614 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4615 bdev->disk.name, rc); 4616 } 4617 } 4618 } else { 4619 /* Namespace was removed */ 4620 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4621 } 4622 4623 nvme_ns = next; 4624 } 4625 4626 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4627 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4628 while (nsid != 0) { 4629 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4630 4631 if (nvme_ns == NULL) { 4632 /* Found a new one */ 4633 nvme_ns = nvme_ns_alloc(); 4634 if (nvme_ns == NULL) { 4635 SPDK_ERRLOG("Failed to allocate namespace\n"); 4636 /* This just fails to attach the namespace. It may work on a future attempt. */ 4637 continue; 4638 } 4639 4640 nvme_ns->id = nsid; 4641 nvme_ns->ctrlr = nvme_ctrlr; 4642 4643 nvme_ns->bdev = NULL; 4644 4645 if (ctx) { 4646 ctx->populates_in_progress++; 4647 } 4648 nvme_ns->probe_ctx = ctx; 4649 4650 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4651 4652 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4653 } 4654 4655 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4656 } 4657 4658 if (ctx) { 4659 /* Decrement this count now that the loop is over to account 4660 * for the one we started with. If the count is then 0, we 4661 * know any populate_namespace functions completed immediately, 4662 * so we'll kick the callback here. 
4663 */ 4664 ctx->populates_in_progress--; 4665 if (ctx->populates_in_progress == 0) { 4666 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4667 } 4668 } 4669 4670 } 4671 4672 static void 4673 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4674 { 4675 struct nvme_ns *nvme_ns, *tmp; 4676 4677 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4678 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4679 } 4680 } 4681 4682 static uint32_t 4683 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4684 { 4685 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4686 const struct spdk_nvme_ctrlr_data *cdata; 4687 uint32_t nsid, ns_count = 0; 4688 4689 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4690 4691 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4692 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4693 ns_count++; 4694 } 4695 4696 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4697 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4698 sizeof(uint32_t); 4699 } 4700 4701 static int 4702 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4703 void *cb_arg) 4704 { 4705 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4706 struct nvme_ns *nvme_ns; 4707 uint32_t i, nsid; 4708 4709 for (i = 0; i < desc->num_of_nsid; i++) { 4710 nsid = desc->nsid[i]; 4711 if (nsid == 0) { 4712 continue; 4713 } 4714 4715 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4716 4717 assert(nvme_ns != NULL); 4718 if (nvme_ns == NULL) { 4719 /* Target told us that an inactive namespace had an ANA change */ 4720 continue; 4721 } 4722 4723 _nvme_ns_set_ana_state(nvme_ns, desc); 4724 } 4725 4726 return 0; 4727 } 4728 4729 static void 4730 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4731 { 4732 struct nvme_ns *nvme_ns; 4733 4734 spdk_free(nvme_ctrlr->ana_log_page); 4735 nvme_ctrlr->ana_log_page = NULL; 4736 4737 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4738 nvme_ns != NULL; 4739 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4740 nvme_ns->ana_state_updating = false; 4741 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4742 } 4743 } 4744 4745 static void 4746 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4747 { 4748 struct nvme_ctrlr *nvme_ctrlr = ctx; 4749 4750 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4751 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4752 nvme_ctrlr); 4753 } else { 4754 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4755 } 4756 4757 pthread_mutex_lock(&nvme_ctrlr->mutex); 4758 4759 assert(nvme_ctrlr->ana_log_page_updating == true); 4760 nvme_ctrlr->ana_log_page_updating = false; 4761 4762 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4763 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4764 4765 nvme_ctrlr_unregister(nvme_ctrlr); 4766 } else { 4767 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4768 4769 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4770 } 4771 } 4772 4773 static int 4774 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4775 { 4776 uint32_t ana_log_page_size; 4777 int rc; 4778 4779 if (nvme_ctrlr->ana_log_page == NULL) { 4780 return -EINVAL; 4781 } 4782 4783 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4784 4785 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4786 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4787 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4788 
return -EINVAL; 4789 } 4790 4791 pthread_mutex_lock(&nvme_ctrlr->mutex); 4792 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4793 nvme_ctrlr->ana_log_page_updating) { 4794 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4795 return -EBUSY; 4796 } 4797 4798 nvme_ctrlr->ana_log_page_updating = true; 4799 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4800 4801 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4802 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4803 SPDK_NVME_GLOBAL_NS_TAG, 4804 nvme_ctrlr->ana_log_page, 4805 ana_log_page_size, 0, 4806 nvme_ctrlr_read_ana_log_page_done, 4807 nvme_ctrlr); 4808 if (rc != 0) { 4809 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4810 } 4811 4812 return rc; 4813 } 4814 4815 static void 4816 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4817 { 4818 } 4819 4820 struct bdev_nvme_set_preferred_path_ctx { 4821 struct spdk_bdev_desc *desc; 4822 struct nvme_ns *nvme_ns; 4823 bdev_nvme_set_preferred_path_cb cb_fn; 4824 void *cb_arg; 4825 }; 4826 4827 static void 4828 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4829 { 4830 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4831 4832 assert(ctx != NULL); 4833 assert(ctx->desc != NULL); 4834 assert(ctx->cb_fn != NULL); 4835 4836 spdk_bdev_close(ctx->desc); 4837 4838 ctx->cb_fn(ctx->cb_arg, status); 4839 4840 free(ctx); 4841 } 4842 4843 static void 4844 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4845 { 4846 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4847 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4848 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4849 struct nvme_io_path *io_path, *prev; 4850 4851 prev = NULL; 4852 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4853 if (io_path->nvme_ns == ctx->nvme_ns) { 4854 break; 4855 } 4856 prev = io_path; 4857 } 4858 4859 if (io_path != NULL) { 4860 if (prev != NULL) { 4861 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4862 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4863 } 4864 4865 /* We can set io_path to nbdev_ch->current_io_path directly here. 4866 * However, it needs to be conditional. To simplify the code, 4867 * just clear nbdev_ch->current_io_path and let find_io_path() 4868 * fill it. 4869 * 4870 * Automatic failback may be disabled. Hence even if the io_path is 4871 * already at the head, clear nbdev_ch->current_io_path. 4872 */ 4873 bdev_nvme_clear_current_io_path(nbdev_ch); 4874 } 4875 4876 spdk_for_each_channel_continue(i, 0); 4877 } 4878 4879 static struct nvme_ns * 4880 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4881 { 4882 struct nvme_ns *nvme_ns, *prev; 4883 const struct spdk_nvme_ctrlr_data *cdata; 4884 4885 prev = NULL; 4886 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4887 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4888 4889 if (cdata->cntlid == cntlid) { 4890 break; 4891 } 4892 prev = nvme_ns; 4893 } 4894 4895 if (nvme_ns != NULL && prev != NULL) { 4896 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4897 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4898 } 4899 4900 return nvme_ns; 4901 } 4902 4903 /* This function supports only multipath mode. There is only a single I/O path 4904 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4905 * head of the I/O path list for each NVMe bdev channel. 
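 * The head of the list is examined first by find_io_path(), which is what makes the path "preferred"; this is typically driven by the bdev_nvme_set_preferred_path RPC.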
4906 * 4907 * NVMe bdev channel may be acquired after completing this function. move the 4908 * matched namespace to the head of the namespace list for the NVMe bdev too. 4909 */ 4910 void 4911 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4912 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4913 { 4914 struct bdev_nvme_set_preferred_path_ctx *ctx; 4915 struct spdk_bdev *bdev; 4916 struct nvme_bdev *nbdev; 4917 int rc = 0; 4918 4919 assert(cb_fn != NULL); 4920 4921 ctx = calloc(1, sizeof(*ctx)); 4922 if (ctx == NULL) { 4923 SPDK_ERRLOG("Failed to alloc context.\n"); 4924 rc = -ENOMEM; 4925 goto err_alloc; 4926 } 4927 4928 ctx->cb_fn = cb_fn; 4929 ctx->cb_arg = cb_arg; 4930 4931 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4932 if (rc != 0) { 4933 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4934 goto err_open; 4935 } 4936 4937 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4938 4939 if (bdev->module != &nvme_if) { 4940 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4941 rc = -ENODEV; 4942 goto err_bdev; 4943 } 4944 4945 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4946 4947 pthread_mutex_lock(&nbdev->mutex); 4948 4949 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4950 if (ctx->nvme_ns == NULL) { 4951 pthread_mutex_unlock(&nbdev->mutex); 4952 4953 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4954 rc = -ENODEV; 4955 goto err_bdev; 4956 } 4957 4958 pthread_mutex_unlock(&nbdev->mutex); 4959 4960 spdk_for_each_channel(nbdev, 4961 _bdev_nvme_set_preferred_path, 4962 ctx, 4963 bdev_nvme_set_preferred_path_done); 4964 return; 4965 4966 err_bdev: 4967 spdk_bdev_close(ctx->desc); 4968 err_open: 4969 free(ctx); 4970 err_alloc: 4971 cb_fn(cb_arg, rc); 4972 } 4973 4974 struct bdev_nvme_set_multipath_policy_ctx { 4975 struct spdk_bdev_desc *desc; 4976 bdev_nvme_set_multipath_policy_cb cb_fn; 4977 void *cb_arg; 4978 }; 4979 4980 static void 4981 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4982 { 4983 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4984 4985 assert(ctx != NULL); 4986 assert(ctx->desc != NULL); 4987 assert(ctx->cb_fn != NULL); 4988 4989 spdk_bdev_close(ctx->desc); 4990 4991 ctx->cb_fn(ctx->cb_arg, status); 4992 4993 free(ctx); 4994 } 4995 4996 static void 4997 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4998 { 4999 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5000 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5001 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5002 5003 nbdev_ch->mp_policy = nbdev->mp_policy; 5004 nbdev_ch->mp_selector = nbdev->mp_selector; 5005 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5006 bdev_nvme_clear_current_io_path(nbdev_ch); 5007 5008 spdk_for_each_channel_continue(i, 0); 5009 } 5010 5011 void 5012 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5013 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5014 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5015 { 5016 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5017 struct spdk_bdev *bdev; 5018 struct nvme_bdev *nbdev; 5019 int rc; 5020 5021 assert(cb_fn != NULL); 5022 5023 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5024 if (rr_min_io == UINT32_MAX) { 5025 rr_min_io = 1; 5026 } else if (rr_min_io == 0) { 5027 rc = -EINVAL; 
5028 goto exit; 5029 } 5030 } else if (rr_min_io != UINT32_MAX) { 5031 rc = -EINVAL; 5032 goto exit; 5033 } 5034 5035 ctx = calloc(1, sizeof(*ctx)); 5036 if (ctx == NULL) { 5037 SPDK_ERRLOG("Failed to alloc context.\n"); 5038 rc = -ENOMEM; 5039 goto exit; 5040 } 5041 5042 ctx->cb_fn = cb_fn; 5043 ctx->cb_arg = cb_arg; 5044 5045 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5046 if (rc != 0) { 5047 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5048 rc = -ENODEV; 5049 goto err_open; 5050 } 5051 5052 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5053 if (bdev->module != &nvme_if) { 5054 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5055 rc = -ENODEV; 5056 goto err_module; 5057 } 5058 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5059 5060 pthread_mutex_lock(&nbdev->mutex); 5061 nbdev->mp_policy = policy; 5062 nbdev->mp_selector = selector; 5063 nbdev->rr_min_io = rr_min_io; 5064 pthread_mutex_unlock(&nbdev->mutex); 5065 5066 spdk_for_each_channel(nbdev, 5067 _bdev_nvme_set_multipath_policy, 5068 ctx, 5069 bdev_nvme_set_multipath_policy_done); 5070 return; 5071 5072 err_module: 5073 spdk_bdev_close(ctx->desc); 5074 err_open: 5075 free(ctx); 5076 exit: 5077 cb_fn(cb_arg, rc); 5078 } 5079 5080 static void 5081 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5082 { 5083 struct nvme_ctrlr *nvme_ctrlr = arg; 5084 union spdk_nvme_async_event_completion event; 5085 5086 if (spdk_nvme_cpl_is_error(cpl)) { 5087 SPDK_WARNLOG("AER request execute failed\n"); 5088 return; 5089 } 5090 5091 event.raw = cpl->cdw0; 5092 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5093 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5094 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5095 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5096 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5097 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5098 } 5099 } 5100 5101 static void 5102 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5103 { 5104 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5105 free(ctx); 5106 } 5107 5108 static void 5109 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5110 { 5111 if (ctx->cb_fn) { 5112 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5113 } 5114 5115 ctx->namespaces_populated = true; 5116 if (ctx->probe_done) { 5117 /* The probe was already completed, so we need to free the context 5118 * here. This can happen for cases like OCSSD, where we need to 5119 * send additional commands to the SSD after attach. 
5120 */ 5121 free_nvme_async_probe_ctx(ctx); 5122 } 5123 } 5124 5125 static void 5126 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5127 struct nvme_async_probe_ctx *ctx) 5128 { 5129 spdk_io_device_register(nvme_ctrlr, 5130 bdev_nvme_create_ctrlr_channel_cb, 5131 bdev_nvme_destroy_ctrlr_channel_cb, 5132 sizeof(struct nvme_ctrlr_channel), 5133 nvme_ctrlr->nbdev_ctrlr->name); 5134 5135 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5136 } 5137 5138 static void 5139 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5140 { 5141 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5142 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5143 5144 nvme_ctrlr->probe_ctx = NULL; 5145 5146 if (spdk_nvme_cpl_is_error(cpl)) { 5147 nvme_ctrlr_delete(nvme_ctrlr); 5148 5149 if (ctx != NULL) { 5150 ctx->reported_bdevs = 0; 5151 populate_namespaces_cb(ctx, -1); 5152 } 5153 return; 5154 } 5155 5156 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5157 } 5158 5159 static int 5160 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5161 struct nvme_async_probe_ctx *ctx) 5162 { 5163 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5164 const struct spdk_nvme_ctrlr_data *cdata; 5165 uint32_t ana_log_page_size; 5166 5167 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5168 5169 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5170 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5171 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5172 sizeof(uint32_t); 5173 5174 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5175 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5176 if (nvme_ctrlr->ana_log_page == NULL) { 5177 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5178 return -ENXIO; 5179 } 5180 5181 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5182 * Hence copy each descriptor to a temporary area when parsing it. 5183 * 5184 * Allocate a buffer whose size is as large as ANA log page buffer because 5185 * we do not know the size of a descriptor until actually reading it. 5186 */ 5187 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5188 if (nvme_ctrlr->copied_ana_desc == NULL) { 5189 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5190 return -ENOMEM; 5191 } 5192 5193 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5194 5195 nvme_ctrlr->probe_ctx = ctx; 5196 5197 /* Then, set the read size only to include the current active namespaces. */ 5198 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5199 5200 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5201 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5202 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5203 return -EINVAL; 5204 } 5205 5206 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5207 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5208 SPDK_NVME_GLOBAL_NS_TAG, 5209 nvme_ctrlr->ana_log_page, 5210 ana_log_page_size, 0, 5211 nvme_ctrlr_init_ana_log_page_done, 5212 nvme_ctrlr); 5213 } 5214 5215 /* hostnqn and subnqn were already verified before attaching a controller. 5216 * Hence check only the multipath capability and cntlid here. 
5217 */ 5218 static bool 5219 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5220 { 5221 struct nvme_ctrlr *tmp; 5222 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5223 5224 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5225 5226 if (!cdata->cmic.multi_ctrlr) { 5227 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5228 return false; 5229 } 5230 5231 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5232 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5233 5234 if (!tmp_cdata->cmic.multi_ctrlr) { 5235 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5236 return false; 5237 } 5238 if (cdata->cntlid == tmp_cdata->cntlid) { 5239 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5240 return false; 5241 } 5242 } 5243 5244 return true; 5245 } 5246 5247 static int 5248 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5249 { 5250 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5251 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5252 int rc = 0; 5253 5254 pthread_mutex_lock(&g_bdev_nvme_mutex); 5255 5256 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5257 if (nbdev_ctrlr != NULL) { 5258 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5259 rc = -EINVAL; 5260 goto exit; 5261 } 5262 } else { 5263 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5264 if (nbdev_ctrlr == NULL) { 5265 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5266 rc = -ENOMEM; 5267 goto exit; 5268 } 5269 nbdev_ctrlr->name = strdup(name); 5270 if (nbdev_ctrlr->name == NULL) { 5271 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5272 free(nbdev_ctrlr); 5273 goto exit; 5274 } 5275 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5276 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5277 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5278 } 5279 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5280 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5281 exit: 5282 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5283 return rc; 5284 } 5285 5286 static int 5287 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5288 const char *name, 5289 const struct spdk_nvme_transport_id *trid, 5290 struct nvme_async_probe_ctx *ctx) 5291 { 5292 struct nvme_ctrlr *nvme_ctrlr; 5293 struct nvme_path_id *path_id; 5294 const struct spdk_nvme_ctrlr_data *cdata; 5295 int rc; 5296 5297 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5298 if (nvme_ctrlr == NULL) { 5299 SPDK_ERRLOG("Failed to allocate device struct\n"); 5300 return -ENOMEM; 5301 } 5302 5303 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5304 if (rc != 0) { 5305 free(nvme_ctrlr); 5306 return rc; 5307 } 5308 5309 TAILQ_INIT(&nvme_ctrlr->trids); 5310 RB_INIT(&nvme_ctrlr->namespaces); 5311 5312 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5313 if (ctx != NULL && ctx->drv_opts.tls_psk != NULL) { 5314 nvme_ctrlr->psk = spdk_keyring_get_key(spdk_key_get_name(ctx->drv_opts.tls_psk)); 5315 if (nvme_ctrlr->psk == NULL) { 5316 /* Could only happen if the key was removed in the meantime */ 5317 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5318 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5319 rc = -ENOKEY; 5320 goto err; 5321 } 5322 } 5323 5324 path_id = calloc(1, sizeof(*path_id)); 5325 if (path_id == NULL) { 5326 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5327 rc = -ENOMEM; 5328 goto err; 5329 } 5330 5331 path_id->trid = *trid; 5332 if (ctx != NULL) { 5333 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, 
sizeof(path_id->hostid.hostaddr)); 5334 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5335 } 5336 nvme_ctrlr->active_path_id = path_id; 5337 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5338 5339 nvme_ctrlr->thread = spdk_get_thread(); 5340 nvme_ctrlr->ctrlr = ctrlr; 5341 nvme_ctrlr->ref = 1; 5342 5343 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5344 SPDK_ERRLOG("OCSSDs are not supported"); 5345 rc = -ENOTSUP; 5346 goto err; 5347 } 5348 5349 if (ctx != NULL) { 5350 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5351 } else { 5352 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5353 } 5354 5355 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5356 g_opts.nvme_adminq_poll_period_us); 5357 5358 if (g_opts.timeout_us > 0) { 5359 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5360 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5361 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5362 g_opts.timeout_us : g_opts.timeout_admin_us; 5363 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5364 adm_timeout_us, timeout_cb, nvme_ctrlr); 5365 } 5366 5367 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5368 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5369 5370 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5371 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5372 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5373 } 5374 5375 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5376 if (rc != 0) { 5377 goto err; 5378 } 5379 5380 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5381 5382 if (cdata->cmic.ana_reporting) { 5383 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5384 if (rc == 0) { 5385 return 0; 5386 } 5387 } else { 5388 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5389 return 0; 5390 } 5391 5392 err: 5393 nvme_ctrlr_delete(nvme_ctrlr); 5394 return rc; 5395 } 5396 5397 void 5398 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5399 { 5400 opts->prchk_flags = 0; 5401 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5402 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5403 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5404 } 5405 5406 static void 5407 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5408 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5409 { 5410 char *name; 5411 5412 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5413 if (!name) { 5414 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5415 return; 5416 } 5417 5418 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5419 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5420 } else { 5421 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5422 } 5423 5424 free(name); 5425 } 5426 5427 static void 5428 _nvme_ctrlr_destruct(void *ctx) 5429 { 5430 struct nvme_ctrlr *nvme_ctrlr = ctx; 5431 5432 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5433 nvme_ctrlr_release(nvme_ctrlr); 5434 } 5435 5436 static int 5437 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5438 { 5439 struct nvme_probe_skip_entry *entry; 5440 5441 /* The controller's destruction was already started */ 5442 if (nvme_ctrlr->destruct) { 5443 return -EALREADY; 5444 } 5445 5446 if (!hotplug && 5447 
nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5448 entry = calloc(1, sizeof(*entry)); 5449 if (!entry) { 5450 return -ENOMEM; 5451 } 5452 entry->trid = nvme_ctrlr->active_path_id->trid; 5453 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5454 } 5455 5456 nvme_ctrlr->destruct = true; 5457 return 0; 5458 } 5459 5460 static int 5461 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5462 { 5463 int rc; 5464 5465 pthread_mutex_lock(&nvme_ctrlr->mutex); 5466 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5467 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5468 5469 if (rc == 0) { 5470 _nvme_ctrlr_destruct(nvme_ctrlr); 5471 } else if (rc == -EALREADY) { 5472 rc = 0; 5473 } 5474 5475 return rc; 5476 } 5477 5478 static void 5479 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5480 { 5481 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5482 5483 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5484 } 5485 5486 static int 5487 bdev_nvme_hotplug_probe(void *arg) 5488 { 5489 if (g_hotplug_probe_ctx == NULL) { 5490 spdk_poller_unregister(&g_hotplug_probe_poller); 5491 return SPDK_POLLER_IDLE; 5492 } 5493 5494 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5495 g_hotplug_probe_ctx = NULL; 5496 spdk_poller_unregister(&g_hotplug_probe_poller); 5497 } 5498 5499 return SPDK_POLLER_BUSY; 5500 } 5501 5502 static int 5503 bdev_nvme_hotplug(void *arg) 5504 { 5505 struct spdk_nvme_transport_id trid_pcie; 5506 5507 if (g_hotplug_probe_ctx) { 5508 return SPDK_POLLER_BUSY; 5509 } 5510 5511 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5512 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5513 5514 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5515 hotplug_probe_cb, attach_cb, NULL); 5516 5517 if (g_hotplug_probe_ctx) { 5518 assert(g_hotplug_probe_poller == NULL); 5519 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5520 } 5521 5522 return SPDK_POLLER_BUSY; 5523 } 5524 5525 void 5526 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5527 { 5528 *opts = g_opts; 5529 } 5530 5531 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5532 uint32_t reconnect_delay_sec, 5533 uint32_t fast_io_fail_timeout_sec); 5534 5535 static int 5536 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5537 { 5538 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5539 /* Can't set timeout_admin_us without also setting timeout_us */ 5540 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5541 return -EINVAL; 5542 } 5543 5544 if (opts->bdev_retry_count < -1) { 5545 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5546 return -EINVAL; 5547 } 5548 5549 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5550 opts->reconnect_delay_sec, 5551 opts->fast_io_fail_timeout_sec)) { 5552 return -EINVAL; 5553 } 5554 5555 return 0; 5556 } 5557 5558 int 5559 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5560 { 5561 int ret; 5562 5563 ret = bdev_nvme_validate_opts(opts); 5564 if (ret) { 5565 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5566 return ret; 5567 } 5568 5569 if (g_bdev_nvme_init_thread != NULL) { 5570 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5571 return -EPERM; 5572 } 5573 } 5574 5575 if (opts->rdma_srq_size != 0 || 5576 opts->rdma_max_cq_size != 0 || 5577 opts->rdma_cm_event_timeout_ms != 0) { 5578 struct spdk_nvme_transport_opts drv_opts; 
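/* Read the current transport-level defaults and override only the RDMA fields that were explicitly set before writing them back. */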
5579 5580 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5581 if (opts->rdma_srq_size != 0) { 5582 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5583 } 5584 if (opts->rdma_max_cq_size != 0) { 5585 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5586 } 5587 if (opts->rdma_cm_event_timeout_ms != 0) { 5588 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5589 } 5590 5591 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5592 if (ret) { 5593 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5594 return ret; 5595 } 5596 } 5597 5598 g_opts = *opts; 5599 5600 return 0; 5601 } 5602 5603 struct set_nvme_hotplug_ctx { 5604 uint64_t period_us; 5605 bool enabled; 5606 spdk_msg_fn fn; 5607 void *fn_ctx; 5608 }; 5609 5610 static void 5611 set_nvme_hotplug_period_cb(void *_ctx) 5612 { 5613 struct set_nvme_hotplug_ctx *ctx = _ctx; 5614 5615 spdk_poller_unregister(&g_hotplug_poller); 5616 if (ctx->enabled) { 5617 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5618 } 5619 5620 g_nvme_hotplug_poll_period_us = ctx->period_us; 5621 g_nvme_hotplug_enabled = ctx->enabled; 5622 if (ctx->fn) { 5623 ctx->fn(ctx->fn_ctx); 5624 } 5625 5626 free(ctx); 5627 } 5628 5629 int 5630 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5631 { 5632 struct set_nvme_hotplug_ctx *ctx; 5633 5634 if (enabled == true && !spdk_process_is_primary()) { 5635 return -EPERM; 5636 } 5637 5638 ctx = calloc(1, sizeof(*ctx)); 5639 if (ctx == NULL) { 5640 return -ENOMEM; 5641 } 5642 5643 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5644 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5645 ctx->enabled = enabled; 5646 ctx->fn = cb; 5647 ctx->fn_ctx = cb_ctx; 5648 5649 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5650 return 0; 5651 } 5652 5653 static void 5654 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5655 struct nvme_async_probe_ctx *ctx) 5656 { 5657 struct nvme_ns *nvme_ns; 5658 struct nvme_bdev *nvme_bdev; 5659 size_t j; 5660 5661 assert(nvme_ctrlr != NULL); 5662 5663 if (ctx->names == NULL) { 5664 ctx->reported_bdevs = 0; 5665 populate_namespaces_cb(ctx, 0); 5666 return; 5667 } 5668 5669 /* 5670 * Report the new bdevs that were created in this call. 5671 * There can be more than one bdev per NVMe controller. 5672 */ 5673 j = 0; 5674 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5675 while (nvme_ns != NULL) { 5676 nvme_bdev = nvme_ns->bdev; 5677 if (j < ctx->max_bdevs) { 5678 ctx->names[j] = nvme_bdev->disk.name; 5679 j++; 5680 } else { 5681 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5682 ctx->max_bdevs); 5683 ctx->reported_bdevs = 0; 5684 populate_namespaces_cb(ctx, -ERANGE); 5685 return; 5686 } 5687 5688 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5689 } 5690 5691 ctx->reported_bdevs = j; 5692 populate_namespaces_cb(ctx, 0); 5693 } 5694 5695 static int 5696 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5697 struct spdk_nvme_ctrlr *new_ctrlr, 5698 struct spdk_nvme_transport_id *trid) 5699 { 5700 struct nvme_path_id *tmp_trid; 5701 5702 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5703 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5704 return -ENOTSUP; 5705 } 5706 5707 /* Currently we only support failover to the same transport type. 
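 * (e.g. a TCP path cannot be added to a controller whose active path is RDMA)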
*/ 5708 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5709 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5710 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5711 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5712 return -EINVAL; 5713 } 5714 5715 5716 /* Currently we only support failover to the same NQN. */ 5717 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5718 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5719 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5720 return -EINVAL; 5721 } 5722 5723 /* Skip all the other checks if we've already registered this path. */ 5724 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5725 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5726 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5727 trid->subnqn); 5728 return -EEXIST; 5729 } 5730 } 5731 5732 return 0; 5733 } 5734 5735 static int 5736 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5737 struct spdk_nvme_ctrlr *new_ctrlr) 5738 { 5739 struct nvme_ns *nvme_ns; 5740 struct spdk_nvme_ns *new_ns; 5741 5742 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5743 while (nvme_ns != NULL) { 5744 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5745 assert(new_ns != NULL); 5746 5747 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5748 return -EINVAL; 5749 } 5750 5751 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5752 } 5753 5754 return 0; 5755 } 5756 5757 static int 5758 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5759 struct spdk_nvme_transport_id *trid) 5760 { 5761 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5762 5763 new_trid = calloc(1, sizeof(*new_trid)); 5764 if (new_trid == NULL) { 5765 return -ENOMEM; 5766 } 5767 new_trid->trid = *trid; 5768 5769 active_id = nvme_ctrlr->active_path_id; 5770 assert(active_id != NULL); 5771 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5772 5773 /* Skip the active trid so that it is not replaced until it fails. */ 5774 tmp_trid = TAILQ_NEXT(active_id, link); 5775 if (tmp_trid == NULL) { 5776 goto add_tail; 5777 } 5778 5779 /* A trid is considered failed if its last_failed_tsc is non-zero. 5780 * Insert the new alternate trid before any failed trid. 5781 */ 5782 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5783 if (tmp_trid->last_failed_tsc != 0) { 5784 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5785 return 0; 5786 } 5787 } 5788 5789 add_tail: 5790 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5791 return 0; 5792 } 5793 5794 /* This is the case where a secondary path is added to an existing 5795 * nvme_ctrlr for failover. After checking that it can access the same 5796 * namespaces as the primary path, it is disconnected until failover occurs.
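 * The temporary controller handle passed in as new_ctrlr is always detached before returning; on success only its transport ID is kept on the nvme_ctrlr's trid list.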
5797 */ 5798 static int 5799 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5800 struct spdk_nvme_ctrlr *new_ctrlr, 5801 struct spdk_nvme_transport_id *trid) 5802 { 5803 int rc; 5804 5805 assert(nvme_ctrlr != NULL); 5806 5807 pthread_mutex_lock(&nvme_ctrlr->mutex); 5808 5809 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5810 if (rc != 0) { 5811 goto exit; 5812 } 5813 5814 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5815 if (rc != 0) { 5816 goto exit; 5817 } 5818 5819 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5820 5821 exit: 5822 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5823 5824 spdk_nvme_detach(new_ctrlr); 5825 5826 return rc; 5827 } 5828 5829 static void 5830 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5831 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5832 { 5833 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5834 struct nvme_async_probe_ctx *ctx; 5835 int rc; 5836 5837 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5838 ctx->ctrlr_attached = true; 5839 5840 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5841 if (rc != 0) { 5842 ctx->reported_bdevs = 0; 5843 populate_namespaces_cb(ctx, rc); 5844 } 5845 } 5846 5847 static void 5848 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5849 struct spdk_nvme_ctrlr *ctrlr, 5850 const struct spdk_nvme_ctrlr_opts *opts) 5851 { 5852 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5853 struct nvme_ctrlr *nvme_ctrlr; 5854 struct nvme_async_probe_ctx *ctx; 5855 int rc; 5856 5857 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5858 ctx->ctrlr_attached = true; 5859 5860 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5861 if (nvme_ctrlr) { 5862 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5863 } else { 5864 rc = -ENODEV; 5865 } 5866 5867 ctx->reported_bdevs = 0; 5868 populate_namespaces_cb(ctx, rc); 5869 } 5870 5871 static int 5872 bdev_nvme_async_poll(void *arg) 5873 { 5874 struct nvme_async_probe_ctx *ctx = arg; 5875 int rc; 5876 5877 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5878 if (spdk_unlikely(rc != -EAGAIN)) { 5879 ctx->probe_done = true; 5880 spdk_poller_unregister(&ctx->poller); 5881 if (!ctx->ctrlr_attached) { 5882 /* The probe is done, but no controller was attached. 5883 * That means we had a failure, so report -EIO back to 5884 * the caller (usually the RPC). populate_namespaces_cb() 5885 * will take care of freeing the nvme_async_probe_ctx. 5886 */ 5887 ctx->reported_bdevs = 0; 5888 populate_namespaces_cb(ctx, -EIO); 5889 } else if (ctx->namespaces_populated) { 5890 /* The namespaces for the attached controller were all 5891 * populated and the response was already sent to the 5892 * caller (usually the RPC). So free the context here. 
5893 */ 5894 free_nvme_async_probe_ctx(ctx); 5895 } 5896 } 5897 5898 return SPDK_POLLER_BUSY; 5899 } 5900 5901 static bool 5902 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5903 uint32_t reconnect_delay_sec, 5904 uint32_t fast_io_fail_timeout_sec) 5905 { 5906 if (ctrlr_loss_timeout_sec < -1) { 5907 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5908 return false; 5909 } else if (ctrlr_loss_timeout_sec == -1) { 5910 if (reconnect_delay_sec == 0) { 5911 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5912 return false; 5913 } else if (fast_io_fail_timeout_sec != 0 && 5914 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5915 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 5916 return false; 5917 } 5918 } else if (ctrlr_loss_timeout_sec != 0) { 5919 if (reconnect_delay_sec == 0) { 5920 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5921 return false; 5922 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5923 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5924 return false; 5925 } else if (fast_io_fail_timeout_sec != 0) { 5926 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5927 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5928 return false; 5929 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5930 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5931 return false; 5932 } 5933 } 5934 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5935 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5936 return false; 5937 } 5938 5939 return true; 5940 } 5941 5942 static int 5943 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 5944 { 5945 FILE *psk_file; 5946 struct stat statbuf; 5947 int rc; 5948 #define TCP_PSK_INVALID_PERMISSIONS 0177 5949 5950 if (stat(fname, &statbuf) != 0) { 5951 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 5952 return -EACCES; 5953 } 5954 5955 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 5956 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 5957 return -EPERM; 5958 } 5959 if ((size_t)statbuf.st_size >= bufsz) { 5960 SPDK_ERRLOG("Invalid PSK: too long\n"); 5961 return -EINVAL; 5962 } 5963 psk_file = fopen(fname, "r"); 5964 if (psk_file == NULL) { 5965 SPDK_ERRLOG("Could not open PSK file\n"); 5966 return -EINVAL; 5967 } 5968 5969 memset(buf, 0, bufsz); 5970 rc = fread(buf, 1, statbuf.st_size, psk_file); 5971 if (rc != statbuf.st_size) { 5972 SPDK_ERRLOG("Failed to read PSK\n"); 5973 fclose(psk_file); 5974 return -EINVAL; 5975 } 5976 5977 fclose(psk_file); 5978 return 0; 5979 } 5980 5981 int 5982 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5983 const char *base_name, 5984 const char **names, 5985 uint32_t count, 5986 spdk_bdev_create_nvme_fn cb_fn, 5987 void *cb_ctx, 5988 struct spdk_nvme_ctrlr_opts *drv_opts, 5989 struct nvme_ctrlr_opts *bdev_opts, 5990 bool multipath) 5991 { 5992 struct nvme_probe_skip_entry *entry, *tmp; 5993 struct nvme_async_probe_ctx *ctx; 5994 spdk_nvme_attach_cb attach_cb; 5995 int rc, len; 5996 5997 /* TODO expand this check to include both the host and target TRIDs. 5998 * Only if both are the same should we fail. 
5999 */ 6000 if (nvme_ctrlr_get(trid) != NULL) { 6001 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6002 return -EEXIST; 6003 } 6004 6005 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6006 6007 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6008 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6009 return -EINVAL; 6010 } 6011 6012 if (bdev_opts != NULL && 6013 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6014 bdev_opts->reconnect_delay_sec, 6015 bdev_opts->fast_io_fail_timeout_sec)) { 6016 return -EINVAL; 6017 } 6018 6019 ctx = calloc(1, sizeof(*ctx)); 6020 if (!ctx) { 6021 return -ENOMEM; 6022 } 6023 ctx->base_name = base_name; 6024 ctx->names = names; 6025 ctx->max_bdevs = count; 6026 ctx->cb_fn = cb_fn; 6027 ctx->cb_ctx = cb_ctx; 6028 ctx->trid = *trid; 6029 6030 if (bdev_opts) { 6031 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6032 } else { 6033 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6034 } 6035 6036 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6037 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6038 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6039 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6040 free(entry); 6041 break; 6042 } 6043 } 6044 } 6045 6046 if (drv_opts) { 6047 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6048 } else { 6049 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6050 } 6051 6052 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6053 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6054 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6055 ctx->drv_opts.disable_read_ana_log_page = true; 6056 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6057 6058 if (ctx->bdev_opts.psk[0] != '\0') { 6059 /* Try to use the keyring first */ 6060 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6061 if (ctx->drv_opts.tls_psk == NULL) { 6062 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6063 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6064 if (rc != 0) { 6065 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6066 free_nvme_async_probe_ctx(ctx); 6067 return rc; 6068 } 6069 } 6070 } 6071 6072 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6073 attach_cb = connect_attach_cb; 6074 } else { 6075 attach_cb = connect_set_failover_cb; 6076 } 6077 6078 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6079 if (ctx->probe_ctx == NULL) { 6080 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6081 free_nvme_async_probe_ctx(ctx); 6082 return -ENODEV; 6083 } 6084 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6085 6086 return 0; 6087 } 6088 6089 struct bdev_nvme_delete_ctx { 6090 char *name; 6091 struct nvme_path_id path_id; 6092 bdev_nvme_delete_done_fn delete_done; 6093 void *delete_done_ctx; 6094 uint64_t timeout_ticks; 6095 struct spdk_poller *poller; 6096 }; 6097 6098 static void 6099 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6100 { 6101 if (ctx != NULL) { 6102 free(ctx->name); 6103 free(ctx); 6104 } 6105 } 6106 6107 static bool 6108 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6109 { 6110 if (path_id->trid.trtype != 0) { 6111 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6112 if 
(strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6113 return false; 6114 } 6115 } else { 6116 if (path_id->trid.trtype != p->trid.trtype) { 6117 return false; 6118 } 6119 } 6120 } 6121 6122 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6123 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6124 return false; 6125 } 6126 } 6127 6128 if (path_id->trid.adrfam != 0) { 6129 if (path_id->trid.adrfam != p->trid.adrfam) { 6130 return false; 6131 } 6132 } 6133 6134 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6135 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6136 return false; 6137 } 6138 } 6139 6140 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6141 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6142 return false; 6143 } 6144 } 6145 6146 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6147 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6148 return false; 6149 } 6150 } 6151 6152 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6153 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6154 return false; 6155 } 6156 } 6157 6158 return true; 6159 } 6160 6161 static bool 6162 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6163 { 6164 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6165 struct nvme_ctrlr *ctrlr; 6166 struct nvme_path_id *p; 6167 6168 pthread_mutex_lock(&g_bdev_nvme_mutex); 6169 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6170 if (!nbdev_ctrlr) { 6171 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6172 return false; 6173 } 6174 6175 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6176 pthread_mutex_lock(&ctrlr->mutex); 6177 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6178 if (nvme_path_id_compare(p, path_id)) { 6179 pthread_mutex_unlock(&ctrlr->mutex); 6180 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6181 return true; 6182 } 6183 } 6184 pthread_mutex_unlock(&ctrlr->mutex); 6185 } 6186 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6187 6188 return false; 6189 } 6190 6191 static int 6192 bdev_nvme_delete_complete_poll(void *arg) 6193 { 6194 struct bdev_nvme_delete_ctx *ctx = arg; 6195 int rc = 0; 6196 6197 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6198 if (ctx->timeout_ticks > spdk_get_ticks()) { 6199 return SPDK_POLLER_BUSY; 6200 } 6201 6202 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6203 rc = -ETIMEDOUT; 6204 } 6205 6206 spdk_poller_unregister(&ctx->poller); 6207 6208 ctx->delete_done(ctx->delete_done_ctx, rc); 6209 free_bdev_nvme_delete_ctx(ctx); 6210 6211 return SPDK_POLLER_BUSY; 6212 } 6213 6214 static int 6215 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6216 { 6217 struct nvme_path_id *p, *t; 6218 spdk_msg_fn msg_fn; 6219 int rc = -ENXIO; 6220 6221 pthread_mutex_lock(&nvme_ctrlr->mutex); 6222 6223 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6224 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6225 break; 6226 } 6227 6228 if (!nvme_path_id_compare(p, path_id)) { 6229 continue; 6230 } 6231 6232 /* We are not using the specified path. */ 6233 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6234 free(p); 6235 rc = 0; 6236 } 6237 6238 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6239 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6240 return rc; 6241 } 6242 6243 /* If we made it here, then this path is a match! 
Now we need to remove it. */ 6244 6245 /* This is the active path in use right now. The active path is always the first in the list. */ 6246 assert(p == nvme_ctrlr->active_path_id); 6247 6248 if (!TAILQ_NEXT(p, link)) { 6249 /* The current path is the only path. */ 6250 msg_fn = _nvme_ctrlr_destruct; 6251 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6252 } else { 6253 /* There is an alternative path. */ 6254 msg_fn = _bdev_nvme_reset_ctrlr; 6255 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6256 } 6257 6258 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6259 6260 if (rc == 0) { 6261 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6262 } else if (rc == -EALREADY) { 6263 rc = 0; 6264 } 6265 6266 return rc; 6267 } 6268 6269 int 6270 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6271 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6272 { 6273 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6274 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6275 struct bdev_nvme_delete_ctx *ctx = NULL; 6276 int rc = -ENXIO, _rc; 6277 6278 if (name == NULL || path_id == NULL) { 6279 rc = -EINVAL; 6280 goto exit; 6281 } 6282 6283 pthread_mutex_lock(&g_bdev_nvme_mutex); 6284 6285 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6286 if (nbdev_ctrlr == NULL) { 6287 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6288 6289 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6290 rc = -ENODEV; 6291 goto exit; 6292 } 6293 6294 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6295 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6296 if (_rc < 0 && _rc != -ENXIO) { 6297 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6298 rc = _rc; 6299 goto exit; 6300 } else if (_rc == 0) { 6301 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6302 * was deleted successfully. To remember the successful deletion, 6303 * overwrite rc only if _rc is zero. 6304 */ 6305 rc = 0; 6306 } 6307 } 6308 6309 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6310 6311 if (rc != 0 || delete_done == NULL) { 6312 goto exit; 6313 } 6314 6315 ctx = calloc(1, sizeof(*ctx)); 6316 if (ctx == NULL) { 6317 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6318 rc = -ENOMEM; 6319 goto exit; 6320 } 6321 6322 ctx->name = strdup(name); 6323 if (ctx->name == NULL) { 6324 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6325 rc = -ENOMEM; 6326 goto exit; 6327 } 6328 6329 ctx->delete_done = delete_done; 6330 ctx->delete_done_ctx = delete_done_ctx; 6331 ctx->path_id = *path_id; 6332 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6333 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6334 if (ctx->poller == NULL) { 6335 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6336 rc = -ENOMEM; 6337 goto exit; 6338 } 6339 6340 exit: 6341 if (rc != 0) { 6342 free_bdev_nvme_delete_ctx(ctx); 6343 } 6344 6345 return rc; 6346 } 6347 6348 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6349 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6350 6351 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6352 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6353 6354 struct discovery_entry_ctx { 6355 char name[128]; 6356 struct spdk_nvme_transport_id trid; 6357 struct spdk_nvme_ctrlr_opts drv_opts; 6358 struct spdk_nvmf_discovery_log_page_entry entry; 6359 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6360 struct discovery_ctx *ctx; 6361 }; 6362 6363 struct discovery_ctx { 6364 char *name; 6365 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6366 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6367 void *cb_ctx; 6368 struct spdk_nvme_probe_ctx *probe_ctx; 6369 struct spdk_nvme_detach_ctx *detach_ctx; 6370 struct spdk_nvme_ctrlr *ctrlr; 6371 struct spdk_nvme_transport_id trid; 6372 struct discovery_entry_ctx *entry_ctx_in_use; 6373 struct spdk_poller *poller; 6374 struct spdk_nvme_ctrlr_opts drv_opts; 6375 struct nvme_ctrlr_opts bdev_opts; 6376 struct spdk_nvmf_discovery_log_page *log_page; 6377 TAILQ_ENTRY(discovery_ctx) tailq; 6378 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6379 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6380 int rc; 6381 bool wait_for_attach; 6382 uint64_t timeout_ticks; 6383 /* Denotes that the discovery service is being started. We're waiting 6384 * for the initial connection to the discovery controller to be 6385 * established and attach discovered NVM ctrlrs. 6386 */ 6387 bool initializing; 6388 /* Denotes if a discovery is currently in progress for this context. 6389 * That includes connecting to newly discovered subsystems. Used to 6390 * ensure we do not start a new discovery until an existing one is 6391 * complete. 6392 */ 6393 bool in_progress; 6394 6395 /* Denotes if another discovery is needed after the one in progress 6396 * completes. Set when we receive an AER completion while a discovery 6397 * is already in progress. 6398 */ 6399 bool pending; 6400 6401 /* Signal to the discovery context poller that it should stop the 6402 * discovery service, including detaching from the current discovery 6403 * controller. 6404 */ 6405 bool stop; 6406 6407 struct spdk_thread *calling_thread; 6408 uint32_t index; 6409 uint32_t attach_in_progress; 6410 char *hostnqn; 6411 6412 /* Denotes if the discovery service was started by the mdns discovery. 
6413 */ 6414 bool from_mdns_discovery_service; 6415 }; 6416 6417 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6418 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6419 6420 static void get_discovery_log_page(struct discovery_ctx *ctx); 6421 6422 static void 6423 free_discovery_ctx(struct discovery_ctx *ctx) 6424 { 6425 free(ctx->log_page); 6426 free(ctx->hostnqn); 6427 free(ctx->name); 6428 free(ctx); 6429 } 6430 6431 static void 6432 discovery_complete(struct discovery_ctx *ctx) 6433 { 6434 ctx->initializing = false; 6435 ctx->in_progress = false; 6436 if (ctx->pending) { 6437 ctx->pending = false; 6438 get_discovery_log_page(ctx); 6439 } 6440 } 6441 6442 static void 6443 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6444 struct spdk_nvmf_discovery_log_page_entry *entry) 6445 { 6446 char *space; 6447 6448 trid->trtype = entry->trtype; 6449 trid->adrfam = entry->adrfam; 6450 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6451 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6452 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6453 * before call to this function trid->subnqn is zeroed out, we need 6454 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6455 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6456 */ 6457 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6458 6459 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6460 * But the log page entries typically pad them with spaces, not zeroes. 6461 * So add a NULL terminator to each of these fields at the appropriate 6462 * location. 6463 */ 6464 space = strchr(trid->traddr, ' '); 6465 if (space) { 6466 *space = 0; 6467 } 6468 space = strchr(trid->trsvcid, ' '); 6469 if (space) { 6470 *space = 0; 6471 } 6472 space = strchr(trid->subnqn, ' '); 6473 if (space) { 6474 *space = 0; 6475 } 6476 } 6477 6478 static void 6479 _stop_discovery(void *_ctx) 6480 { 6481 struct discovery_ctx *ctx = _ctx; 6482 6483 if (ctx->attach_in_progress > 0) { 6484 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6485 return; 6486 } 6487 6488 ctx->stop = true; 6489 6490 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6491 struct discovery_entry_ctx *entry_ctx; 6492 struct nvme_path_id path = {}; 6493 6494 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6495 path.trid = entry_ctx->trid; 6496 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6497 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6498 free(entry_ctx); 6499 } 6500 6501 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6502 struct discovery_entry_ctx *entry_ctx; 6503 6504 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6505 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6506 free(entry_ctx); 6507 } 6508 6509 free(ctx->entry_ctx_in_use); 6510 ctx->entry_ctx_in_use = NULL; 6511 } 6512 6513 static void 6514 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6515 { 6516 ctx->stop_cb_fn = cb_fn; 6517 ctx->cb_ctx = cb_ctx; 6518 6519 if (ctx->attach_in_progress > 0) { 6520 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6521 ctx->attach_in_progress); 6522 } 6523 6524 _stop_discovery(ctx); 6525 } 6526 6527 static void 6528 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6529 { 6530 struct discovery_ctx *d_ctx; 6531 struct nvme_path_id *path_id; 6532 struct spdk_nvme_transport_id 
trid = {}; 6533 struct discovery_entry_ctx *entry_ctx, *tmp; 6534 6535 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6536 6537 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6538 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6539 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6540 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6541 continue; 6542 } 6543 6544 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6545 free(entry_ctx); 6546 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6547 trid.subnqn, trid.traddr, trid.trsvcid); 6548 6549 /* Fail discovery ctrlr to force reattach attempt */ 6550 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6551 } 6552 } 6553 } 6554 6555 static void 6556 discovery_remove_controllers(struct discovery_ctx *ctx) 6557 { 6558 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6559 struct discovery_entry_ctx *entry_ctx, *tmp; 6560 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6561 struct spdk_nvme_transport_id old_trid = {}; 6562 uint64_t numrec, i; 6563 bool found; 6564 6565 numrec = from_le64(&log_page->numrec); 6566 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6567 found = false; 6568 old_entry = &entry_ctx->entry; 6569 build_trid_from_log_page_entry(&old_trid, old_entry); 6570 for (i = 0; i < numrec; i++) { 6571 new_entry = &log_page->entries[i]; 6572 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6573 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6574 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6575 found = true; 6576 break; 6577 } 6578 } 6579 if (!found) { 6580 struct nvme_path_id path = {}; 6581 6582 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6583 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6584 6585 path.trid = entry_ctx->trid; 6586 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6587 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6588 free(entry_ctx); 6589 } 6590 } 6591 free(log_page); 6592 ctx->log_page = NULL; 6593 discovery_complete(ctx); 6594 } 6595 6596 static void 6597 complete_discovery_start(struct discovery_ctx *ctx, int status) 6598 { 6599 ctx->timeout_ticks = 0; 6600 ctx->rc = status; 6601 if (ctx->start_cb_fn) { 6602 ctx->start_cb_fn(ctx->cb_ctx, status); 6603 ctx->start_cb_fn = NULL; 6604 ctx->cb_ctx = NULL; 6605 } 6606 } 6607 6608 static void 6609 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6610 { 6611 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6612 struct discovery_ctx *ctx = entry_ctx->ctx; 6613 6614 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6615 ctx->attach_in_progress--; 6616 if (ctx->attach_in_progress == 0) { 6617 complete_discovery_start(ctx, ctx->rc); 6618 if (ctx->initializing && ctx->rc != 0) { 6619 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6620 stop_discovery(ctx, NULL, ctx->cb_ctx); 6621 } else { 6622 discovery_remove_controllers(ctx); 6623 } 6624 } 6625 } 6626 6627 static struct discovery_entry_ctx * 6628 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6629 { 6630 struct discovery_entry_ctx *new_ctx; 6631 6632 new_ctx = calloc(1, sizeof(*new_ctx)); 6633 if (new_ctx == NULL) { 6634 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6635 return NULL; 6636 } 6637 6638 new_ctx->ctx = ctx; 6639 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6640 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
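/* The new discovery entry inherits the hostnqn of the parent discovery context. */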
6641 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6642 return new_ctx; 6643 } 6644 6645 static void 6646 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6647 struct spdk_nvmf_discovery_log_page *log_page) 6648 { 6649 struct discovery_ctx *ctx = cb_arg; 6650 struct discovery_entry_ctx *entry_ctx, *tmp; 6651 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6652 uint64_t numrec, i; 6653 bool found; 6654 6655 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6656 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6657 return; 6658 } 6659 6660 ctx->log_page = log_page; 6661 assert(ctx->attach_in_progress == 0); 6662 numrec = from_le64(&log_page->numrec); 6663 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6664 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6665 free(entry_ctx); 6666 } 6667 for (i = 0; i < numrec; i++) { 6668 found = false; 6669 new_entry = &log_page->entries[i]; 6670 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6671 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6672 struct discovery_entry_ctx *new_ctx; 6673 struct spdk_nvme_transport_id trid = {}; 6674 6675 build_trid_from_log_page_entry(&trid, new_entry); 6676 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6677 if (new_ctx == NULL) { 6678 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6679 break; 6680 } 6681 6682 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6683 continue; 6684 } 6685 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6686 old_entry = &entry_ctx->entry; 6687 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6688 found = true; 6689 break; 6690 } 6691 } 6692 if (!found) { 6693 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6694 struct discovery_ctx *d_ctx; 6695 6696 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6697 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6698 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6699 sizeof(new_entry->subnqn))) { 6700 break; 6701 } 6702 } 6703 if (subnqn_ctx) { 6704 break; 6705 } 6706 } 6707 6708 new_ctx = calloc(1, sizeof(*new_ctx)); 6709 if (new_ctx == NULL) { 6710 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6711 break; 6712 } 6713 6714 new_ctx->ctx = ctx; 6715 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6716 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6717 if (subnqn_ctx) { 6718 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6719 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6720 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6721 new_ctx->name); 6722 } else { 6723 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6724 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6725 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6726 new_ctx->name); 6727 } 6728 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6729 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6730 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6731 discovery_attach_controller_done, new_ctx, 6732 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6733 if (rc == 0) { 6734 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6735 ctx->attach_in_progress++; 6736 } else { 6737 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6738 } 6739 } 6740 } 6741 6742 if (ctx->attach_in_progress == 0) { 6743 discovery_remove_controllers(ctx); 6744 } 6745 } 6746 6747 static void 6748 get_discovery_log_page(struct discovery_ctx *ctx) 6749 { 6750 int rc; 6751 6752 assert(ctx->in_progress == false); 6753 ctx->in_progress = true; 6754 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6755 if (rc != 0) { 6756 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6757 } 6758 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6759 } 6760 6761 static void 6762 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6763 { 6764 struct discovery_ctx *ctx = arg; 6765 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6766 6767 if (spdk_nvme_cpl_is_error(cpl)) { 6768 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6769 return; 6770 } 6771 6772 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6773 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6774 return; 6775 } 6776 6777 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6778 if (ctx->in_progress) { 6779 ctx->pending = true; 6780 return; 6781 } 6782 6783 get_discovery_log_page(ctx); 6784 } 6785 6786 static void 6787 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6788 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6789 { 6790 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6791 struct discovery_ctx *ctx; 6792 6793 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6794 6795 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6796 ctx->probe_ctx = NULL; 6797 ctx->ctrlr = ctrlr; 6798 6799 if (ctx->rc != 0) { 6800 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6801 ctx->rc); 6802 return; 6803 } 6804 6805 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6806 } 6807 6808 static int 6809 discovery_poller(void *arg) 6810 { 6811 struct discovery_ctx *ctx = arg; 6812 struct spdk_nvme_transport_id *trid; 6813 int rc; 6814 6815 if (ctx->detach_ctx) { 6816 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6817 if (rc != -EAGAIN) { 6818 ctx->detach_ctx = NULL; 6819 ctx->ctrlr = NULL; 6820 } 6821 } else if (ctx->stop) { 6822 if (ctx->ctrlr != NULL) { 6823 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6824 if (rc == 0) { 6825 return SPDK_POLLER_BUSY; 6826 } 6827 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6828 } 6829 spdk_poller_unregister(&ctx->poller); 6830 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6831 assert(ctx->start_cb_fn == NULL); 6832 if (ctx->stop_cb_fn != NULL) { 6833 ctx->stop_cb_fn(ctx->cb_ctx); 6834 } 6835 free_discovery_ctx(ctx); 6836 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6837 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6838 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6839 assert(ctx->initializing); 6840 spdk_poller_unregister(&ctx->poller); 6841 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6842 complete_discovery_start(ctx, -ETIMEDOUT); 6843 stop_discovery(ctx, NULL, NULL); 6844 free_discovery_ctx(ctx); 6845 return SPDK_POLLER_BUSY; 6846 } 6847 6848 assert(ctx->entry_ctx_in_use == NULL); 6849 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6850 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6851 trid = &ctx->entry_ctx_in_use->trid; 6852 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6853 if 
(ctx->probe_ctx) { 6854 spdk_poller_unregister(&ctx->poller); 6855 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6856 } else { 6857 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6858 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6859 ctx->entry_ctx_in_use = NULL; 6860 } 6861 } else if (ctx->probe_ctx) { 6862 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6863 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6864 complete_discovery_start(ctx, -ETIMEDOUT); 6865 return SPDK_POLLER_BUSY; 6866 } 6867 6868 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6869 if (rc != -EAGAIN) { 6870 if (ctx->rc != 0) { 6871 assert(ctx->initializing); 6872 stop_discovery(ctx, NULL, ctx->cb_ctx); 6873 } else { 6874 assert(rc == 0); 6875 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6876 ctx->rc = rc; 6877 get_discovery_log_page(ctx); 6878 } 6879 } 6880 } else { 6881 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6882 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6883 complete_discovery_start(ctx, -ETIMEDOUT); 6884 /* We need to wait until all NVM ctrlrs are attached before we stop the 6885 * discovery service to make sure we don't detach a ctrlr that is still 6886 * being attached. 6887 */ 6888 if (ctx->attach_in_progress == 0) { 6889 stop_discovery(ctx, NULL, ctx->cb_ctx); 6890 return SPDK_POLLER_BUSY; 6891 } 6892 } 6893 6894 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6895 if (rc < 0) { 6896 spdk_poller_unregister(&ctx->poller); 6897 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6898 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6899 ctx->entry_ctx_in_use = NULL; 6900 6901 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6902 if (rc != 0) { 6903 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6904 ctx->ctrlr = NULL; 6905 } 6906 } 6907 } 6908 6909 return SPDK_POLLER_BUSY; 6910 } 6911 6912 static void 6913 start_discovery_poller(void *arg) 6914 { 6915 struct discovery_ctx *ctx = arg; 6916 6917 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6918 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6919 } 6920 6921 int 6922 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6923 const char *base_name, 6924 struct spdk_nvme_ctrlr_opts *drv_opts, 6925 struct nvme_ctrlr_opts *bdev_opts, 6926 uint64_t attach_timeout, 6927 bool from_mdns, 6928 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6929 { 6930 struct discovery_ctx *ctx; 6931 struct discovery_entry_ctx *discovery_entry_ctx; 6932 6933 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6934 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6935 if (strcmp(ctx->name, base_name) == 0) { 6936 return -EEXIST; 6937 } 6938 6939 if (ctx->entry_ctx_in_use != NULL) { 6940 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6941 return -EEXIST; 6942 } 6943 } 6944 6945 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6946 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6947 return -EEXIST; 6948 } 6949 } 6950 } 6951 6952 ctx = calloc(1, sizeof(*ctx)); 6953 if (ctx == NULL) { 6954 return -ENOMEM; 6955 } 6956 6957 ctx->name = strdup(base_name); 6958 if (ctx->name == NULL) { 6959 free_discovery_ctx(ctx); 6960 return -ENOMEM; 6961 } 6962 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
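/* Also keep a copy of the bdev-level options; they are applied to every NVM subsystem attached through this discovery service. */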
6963 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6964 ctx->from_mdns_discovery_service = from_mdns; 6965 ctx->bdev_opts.from_discovery_service = true; 6966 ctx->calling_thread = spdk_get_thread(); 6967 ctx->start_cb_fn = cb_fn; 6968 ctx->cb_ctx = cb_ctx; 6969 ctx->initializing = true; 6970 if (ctx->start_cb_fn) { 6971 /* We can use this when dumping json to denote if this RPC parameter 6972 * was specified or not. 6973 */ 6974 ctx->wait_for_attach = true; 6975 } 6976 if (attach_timeout != 0) { 6977 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6978 spdk_get_ticks_hz() / 1000ull; 6979 } 6980 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6981 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6982 memcpy(&ctx->trid, trid, sizeof(*trid)); 6983 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6984 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6985 if (ctx->hostnqn == NULL) { 6986 free_discovery_ctx(ctx); 6987 return -ENOMEM; 6988 } 6989 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6990 if (discovery_entry_ctx == NULL) { 6991 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6992 free_discovery_ctx(ctx); 6993 return -ENOMEM; 6994 } 6995 6996 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6997 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6998 return 0; 6999 } 7000 7001 int 7002 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7003 { 7004 struct discovery_ctx *ctx; 7005 7006 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7007 if (strcmp(name, ctx->name) == 0) { 7008 if (ctx->stop) { 7009 return -EALREADY; 7010 } 7011 /* If we're still starting the discovery service and ->rc is non-zero, we're 7012 * going to stop it as soon as we can 7013 */ 7014 if (ctx->initializing && ctx->rc != 0) { 7015 return -EALREADY; 7016 } 7017 stop_discovery(ctx, cb_fn, cb_ctx); 7018 return 0; 7019 } 7020 } 7021 7022 return -ENOENT; 7023 } 7024 7025 static int 7026 bdev_nvme_library_init(void) 7027 { 7028 g_bdev_nvme_init_thread = spdk_get_thread(); 7029 7030 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7031 bdev_nvme_destroy_poll_group_cb, 7032 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7033 7034 return 0; 7035 } 7036 7037 static void 7038 bdev_nvme_fini_destruct_ctrlrs(void) 7039 { 7040 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7041 struct nvme_ctrlr *nvme_ctrlr; 7042 7043 pthread_mutex_lock(&g_bdev_nvme_mutex); 7044 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7045 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7046 pthread_mutex_lock(&nvme_ctrlr->mutex); 7047 if (nvme_ctrlr->destruct) { 7048 /* This controller's destruction was already started 7049 * before the application started shutting down 7050 */ 7051 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7052 continue; 7053 } 7054 nvme_ctrlr->destruct = true; 7055 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7056 7057 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7058 nvme_ctrlr); 7059 } 7060 } 7061 7062 g_bdev_nvme_module_finish = true; 7063 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7064 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7065 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7066 spdk_bdev_module_fini_done(); 7067 return; 7068 } 7069 7070 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7071 } 7072 7073 static void 7074 check_discovery_fini(void *arg) 7075 { 7076 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7077 bdev_nvme_fini_destruct_ctrlrs(); 
7078 } 7079 } 7080 7081 static void 7082 bdev_nvme_library_fini(void) 7083 { 7084 struct nvme_probe_skip_entry *entry, *entry_tmp; 7085 struct discovery_ctx *ctx; 7086 7087 spdk_poller_unregister(&g_hotplug_poller); 7088 free(g_hotplug_probe_ctx); 7089 g_hotplug_probe_ctx = NULL; 7090 7091 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7092 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7093 free(entry); 7094 } 7095 7096 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7097 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7098 bdev_nvme_fini_destruct_ctrlrs(); 7099 } else { 7100 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7101 stop_discovery(ctx, check_discovery_fini, NULL); 7102 } 7103 } 7104 } 7105 7106 static void 7107 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7108 { 7109 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7110 struct spdk_bdev *bdev = bdev_io->bdev; 7111 struct spdk_dif_ctx dif_ctx; 7112 struct spdk_dif_error err_blk = {}; 7113 int rc; 7114 struct spdk_dif_ctx_init_ext_opts dif_opts; 7115 7116 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7117 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7118 rc = spdk_dif_ctx_init(&dif_ctx, 7119 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7120 bdev->dif_is_head_of_md, bdev->dif_type, 7121 bdev_io->u.bdev.dif_check_flags, 7122 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7123 if (rc != 0) { 7124 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7125 return; 7126 } 7127 7128 if (bdev->md_interleave) { 7129 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7130 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7131 } else { 7132 struct iovec md_iov = { 7133 .iov_base = bdev_io->u.bdev.md_buf, 7134 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7135 }; 7136 7137 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7138 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7139 } 7140 7141 if (rc != 0) { 7142 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7143 err_blk.err_type, err_blk.err_offset); 7144 } else { 7145 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7146 } 7147 } 7148 7149 static void 7150 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7151 { 7152 struct nvme_bdev_io *bio = ref; 7153 7154 if (spdk_nvme_cpl_is_success(cpl)) { 7155 /* Run PI verification for read data buffer. */ 7156 bdev_nvme_verify_pi_error(bio); 7157 } 7158 7159 /* Return original completion status */ 7160 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7161 } 7162 7163 static void 7164 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7165 { 7166 struct nvme_bdev_io *bio = ref; 7167 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7168 int ret; 7169 7170 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7171 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7172 cpl->status.sct, cpl->status.sc); 7173 7174 /* Save completion status to use after verifying PI error. */ 7175 bio->cpl = *cpl; 7176 7177 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7178 /* Read without PI checking to verify PI error. 
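* The failed completion was saved in bio->cpl above and is returned to the bdev layer once this verification read finishes.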
*/ 7179 ret = bdev_nvme_no_pi_readv(bio, 7180 bdev_io->u.bdev.iovs, 7181 bdev_io->u.bdev.iovcnt, 7182 bdev_io->u.bdev.md_buf, 7183 bdev_io->u.bdev.num_blocks, 7184 bdev_io->u.bdev.offset_blocks); 7185 if (ret == 0) { 7186 return; 7187 } 7188 } 7189 } 7190 7191 bdev_nvme_io_complete_nvme_status(bio, cpl); 7192 } 7193 7194 static void 7195 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7196 { 7197 struct nvme_bdev_io *bio = ref; 7198 7199 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7200 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7201 cpl->status.sct, cpl->status.sc); 7202 /* Run PI verification for write data buffer if PI error is detected. */ 7203 bdev_nvme_verify_pi_error(bio); 7204 } 7205 7206 bdev_nvme_io_complete_nvme_status(bio, cpl); 7207 } 7208 7209 static void 7210 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7211 { 7212 struct nvme_bdev_io *bio = ref; 7213 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7214 7215 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7216 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7217 */ 7218 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7219 7220 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7221 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7222 cpl->status.sct, cpl->status.sc); 7223 /* Run PI verification for zone append data buffer if PI error is detected. */ 7224 bdev_nvme_verify_pi_error(bio); 7225 } 7226 7227 bdev_nvme_io_complete_nvme_status(bio, cpl); 7228 } 7229 7230 static void 7231 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7232 { 7233 struct nvme_bdev_io *bio = ref; 7234 7235 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7236 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7237 cpl->status.sct, cpl->status.sc); 7238 /* Run PI verification for compare data buffer if PI error is detected. */ 7239 bdev_nvme_verify_pi_error(bio); 7240 } 7241 7242 bdev_nvme_io_complete_nvme_status(bio, cpl); 7243 } 7244 7245 static void 7246 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7247 { 7248 struct nvme_bdev_io *bio = ref; 7249 7250 /* Compare operation completion */ 7251 if (!bio->first_fused_completed) { 7252 /* Save compare result for write callback */ 7253 bio->cpl = *cpl; 7254 bio->first_fused_completed = true; 7255 return; 7256 } 7257 7258 /* Write operation completion */ 7259 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7260 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7261 * complete the IO with the compare operation's status. 
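* A successful write completion after a failed compare is unexpected for a fused compare-and-write, hence the error log below.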
7262 */ 7263 if (!spdk_nvme_cpl_is_error(cpl)) { 7264 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7265 } 7266 7267 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7268 } else { 7269 bdev_nvme_io_complete_nvme_status(bio, cpl); 7270 } 7271 } 7272 7273 static void 7274 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7275 { 7276 struct nvme_bdev_io *bio = ref; 7277 7278 bdev_nvme_io_complete_nvme_status(bio, cpl); 7279 } 7280 7281 static int 7282 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7283 { 7284 switch (desc->zt) { 7285 case SPDK_NVME_ZONE_TYPE_SEQWR: 7286 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7287 break; 7288 default: 7289 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7290 return -EIO; 7291 } 7292 7293 switch (desc->zs) { 7294 case SPDK_NVME_ZONE_STATE_EMPTY: 7295 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7296 break; 7297 case SPDK_NVME_ZONE_STATE_IOPEN: 7298 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7299 break; 7300 case SPDK_NVME_ZONE_STATE_EOPEN: 7301 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7302 break; 7303 case SPDK_NVME_ZONE_STATE_CLOSED: 7304 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7305 break; 7306 case SPDK_NVME_ZONE_STATE_RONLY: 7307 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7308 break; 7309 case SPDK_NVME_ZONE_STATE_FULL: 7310 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7311 break; 7312 case SPDK_NVME_ZONE_STATE_OFFLINE: 7313 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7314 break; 7315 default: 7316 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7317 return -EIO; 7318 } 7319 7320 info->zone_id = desc->zslba; 7321 info->write_pointer = desc->wp; 7322 info->capacity = desc->zcap; 7323 7324 return 0; 7325 } 7326 7327 static void 7328 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7329 { 7330 struct nvme_bdev_io *bio = ref; 7331 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7332 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7333 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7334 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7335 uint64_t max_zones_per_buf, i; 7336 uint32_t zone_report_bufsize; 7337 struct spdk_nvme_ns *ns; 7338 struct spdk_nvme_qpair *qpair; 7339 int ret; 7340 7341 if (spdk_nvme_cpl_is_error(cpl)) { 7342 goto out_complete_io_nvme_cpl; 7343 } 7344 7345 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7346 ret = -ENXIO; 7347 goto out_complete_io_ret; 7348 } 7349 7350 ns = bio->io_path->nvme_ns->ns; 7351 qpair = bio->io_path->qpair->qpair; 7352 7353 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7354 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7355 sizeof(bio->zone_report_buf->descs[0]); 7356 7357 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7358 ret = -EINVAL; 7359 goto out_complete_io_ret; 7360 } 7361 7362 if (!bio->zone_report_buf->nr_zones) { 7363 ret = -EINVAL; 7364 goto out_complete_io_ret; 7365 } 7366 7367 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7368 ret = fill_zone_from_report(&info[bio->handled_zones], 7369 &bio->zone_report_buf->descs[i]); 7370 if (ret) { 7371 goto out_complete_io_ret; 7372 } 7373 bio->handled_zones++; 7374 } 7375 7376 if (bio->handled_zones < zones_to_copy) { 7377 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7378 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7379 
7380 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7381 ret = spdk_nvme_zns_report_zones(ns, qpair, 7382 bio->zone_report_buf, zone_report_bufsize, 7383 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7384 bdev_nvme_get_zone_info_done, bio); 7385 if (!ret) { 7386 return; 7387 } else { 7388 goto out_complete_io_ret; 7389 } 7390 } 7391 7392 out_complete_io_nvme_cpl: 7393 free(bio->zone_report_buf); 7394 bio->zone_report_buf = NULL; 7395 bdev_nvme_io_complete_nvme_status(bio, cpl); 7396 return; 7397 7398 out_complete_io_ret: 7399 free(bio->zone_report_buf); 7400 bio->zone_report_buf = NULL; 7401 bdev_nvme_io_complete(bio, ret); 7402 } 7403 7404 static void 7405 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7406 { 7407 struct nvme_bdev_io *bio = ref; 7408 7409 bdev_nvme_io_complete_nvme_status(bio, cpl); 7410 } 7411 7412 static void 7413 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7414 { 7415 struct nvme_bdev_io *bio = ctx; 7416 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7417 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7418 7419 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7420 7421 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7422 } 7423 7424 static void 7425 bdev_nvme_abort_complete(void *ctx) 7426 { 7427 struct nvme_bdev_io *bio = ctx; 7428 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7429 7430 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7431 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7432 } else { 7433 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7434 } 7435 } 7436 7437 static void 7438 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7439 { 7440 struct nvme_bdev_io *bio = ref; 7441 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7442 7443 bio->cpl = *cpl; 7444 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7445 } 7446 7447 static void 7448 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7449 { 7450 struct nvme_bdev_io *bio = ref; 7451 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7452 7453 bio->cpl = *cpl; 7454 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7455 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7456 } 7457 7458 static void 7459 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7460 { 7461 struct nvme_bdev_io *bio = ref; 7462 struct iovec *iov; 7463 7464 bio->iov_offset = sgl_offset; 7465 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7466 iov = &bio->iovs[bio->iovpos]; 7467 if (bio->iov_offset < iov->iov_len) { 7468 break; 7469 } 7470 7471 bio->iov_offset -= iov->iov_len; 7472 } 7473 } 7474 7475 static int 7476 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7477 { 7478 struct nvme_bdev_io *bio = ref; 7479 struct iovec *iov; 7480 7481 assert(bio->iovpos < bio->iovcnt); 7482 7483 iov = &bio->iovs[bio->iovpos]; 7484 7485 *address = iov->iov_base; 7486 *length = iov->iov_len; 7487 7488 if (bio->iov_offset) { 7489 assert(bio->iov_offset <= iov->iov_len); 7490 *address += bio->iov_offset; 7491 *length -= bio->iov_offset; 7492 } 7493 7494 bio->iov_offset += *length; 7495 if (bio->iov_offset == iov->iov_len) { 7496 bio->iovpos++; 7497 bio->iov_offset = 0; 7498 } 7499 7500 return 0; 7501 } 7502 7503 static void 7504 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7505 { 7506 struct nvme_bdev_io *bio = ref; 7507 struct iovec *iov; 7508 7509 bio->fused_iov_offset = sgl_offset; 
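/* Walk the fused (write) iovecs until the one containing sgl_offset is found. */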
7510 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7511 iov = &bio->fused_iovs[bio->fused_iovpos]; 7512 if (bio->fused_iov_offset < iov->iov_len) { 7513 break; 7514 } 7515 7516 bio->fused_iov_offset -= iov->iov_len; 7517 } 7518 } 7519 7520 static int 7521 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7522 { 7523 struct nvme_bdev_io *bio = ref; 7524 struct iovec *iov; 7525 7526 assert(bio->fused_iovpos < bio->fused_iovcnt); 7527 7528 iov = &bio->fused_iovs[bio->fused_iovpos]; 7529 7530 *address = iov->iov_base; 7531 *length = iov->iov_len; 7532 7533 if (bio->fused_iov_offset) { 7534 assert(bio->fused_iov_offset <= iov->iov_len); 7535 *address += bio->fused_iov_offset; 7536 *length -= bio->fused_iov_offset; 7537 } 7538 7539 bio->fused_iov_offset += *length; 7540 if (bio->fused_iov_offset == iov->iov_len) { 7541 bio->fused_iovpos++; 7542 bio->fused_iov_offset = 0; 7543 } 7544 7545 return 0; 7546 } 7547 7548 static int 7549 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7550 void *md, uint64_t lba_count, uint64_t lba) 7551 { 7552 int rc; 7553 7554 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7555 lba_count, lba); 7556 7557 bio->iovs = iov; 7558 bio->iovcnt = iovcnt; 7559 bio->iovpos = 0; 7560 bio->iov_offset = 0; 7561 7562 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7563 bio->io_path->qpair->qpair, 7564 lba, lba_count, 7565 bdev_nvme_no_pi_readv_done, bio, 0, 7566 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7567 md, 0, 0); 7568 7569 if (rc != 0 && rc != -ENOMEM) { 7570 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7571 } 7572 return rc; 7573 } 7574 7575 static int 7576 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7577 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7578 struct spdk_memory_domain *domain, void *domain_ctx, 7579 struct spdk_accel_sequence *seq) 7580 { 7581 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7582 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7583 int rc; 7584 7585 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7586 lba_count, lba); 7587 7588 bio->iovs = iov; 7589 bio->iovcnt = iovcnt; 7590 bio->iovpos = 0; 7591 bio->iov_offset = 0; 7592 7593 if (domain != NULL || seq != NULL) { 7594 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7595 bio->ext_opts.memory_domain = domain; 7596 bio->ext_opts.memory_domain_ctx = domain_ctx; 7597 bio->ext_opts.io_flags = flags; 7598 bio->ext_opts.metadata = md; 7599 bio->ext_opts.accel_sequence = seq; 7600 7601 if (iovcnt == 1) { 7602 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7603 bio, &bio->ext_opts); 7604 } else { 7605 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7606 bdev_nvme_readv_done, bio, 7607 bdev_nvme_queued_reset_sgl, 7608 bdev_nvme_queued_next_sge, 7609 &bio->ext_opts); 7610 } 7611 } else if (iovcnt == 1) { 7612 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7613 md, lba, lba_count, bdev_nvme_readv_done, 7614 bio, flags, 0, 0); 7615 } else { 7616 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7617 bdev_nvme_readv_done, bio, flags, 7618 bdev_nvme_queued_reset_sgl, 7619 bdev_nvme_queued_next_sge, md, 0, 0); 7620 } 7621 7622 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7623 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7624 } 7625 
return rc; 7626 } 7627 7628 static int 7629 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7630 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7631 struct spdk_memory_domain *domain, void *domain_ctx, 7632 struct spdk_accel_sequence *seq) 7633 { 7634 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7635 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7636 int rc; 7637 7638 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7639 lba_count, lba); 7640 7641 bio->iovs = iov; 7642 bio->iovcnt = iovcnt; 7643 bio->iovpos = 0; 7644 bio->iov_offset = 0; 7645 7646 if (domain != NULL || seq != NULL) { 7647 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7648 bio->ext_opts.memory_domain = domain; 7649 bio->ext_opts.memory_domain_ctx = domain_ctx; 7650 bio->ext_opts.io_flags = flags; 7651 bio->ext_opts.metadata = md; 7652 bio->ext_opts.accel_sequence = seq; 7653 7654 if (iovcnt == 1) { 7655 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7656 bio, &bio->ext_opts); 7657 } else { 7658 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7659 bdev_nvme_writev_done, bio, 7660 bdev_nvme_queued_reset_sgl, 7661 bdev_nvme_queued_next_sge, 7662 &bio->ext_opts); 7663 } 7664 } else if (iovcnt == 1) { 7665 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7666 md, lba, lba_count, bdev_nvme_writev_done, 7667 bio, flags, 0, 0); 7668 } else { 7669 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7670 bdev_nvme_writev_done, bio, flags, 7671 bdev_nvme_queued_reset_sgl, 7672 bdev_nvme_queued_next_sge, md, 0, 0); 7673 } 7674 7675 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7676 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7677 } 7678 return rc; 7679 } 7680 7681 static int 7682 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7683 void *md, uint64_t lba_count, uint64_t zslba, 7684 uint32_t flags) 7685 { 7686 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7687 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7688 int rc; 7689 7690 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7691 lba_count, zslba); 7692 7693 bio->iovs = iov; 7694 bio->iovcnt = iovcnt; 7695 bio->iovpos = 0; 7696 bio->iov_offset = 0; 7697 7698 if (iovcnt == 1) { 7699 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7700 lba_count, 7701 bdev_nvme_zone_appendv_done, bio, 7702 flags, 7703 0, 0); 7704 } else { 7705 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7706 bdev_nvme_zone_appendv_done, bio, flags, 7707 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7708 md, 0, 0); 7709 } 7710 7711 if (rc != 0 && rc != -ENOMEM) { 7712 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7713 } 7714 return rc; 7715 } 7716 7717 static int 7718 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7719 void *md, uint64_t lba_count, uint64_t lba, 7720 uint32_t flags) 7721 { 7722 int rc; 7723 7724 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7725 lba_count, lba); 7726 7727 bio->iovs = iov; 7728 bio->iovcnt = iovcnt; 7729 bio->iovpos = 0; 7730 bio->iov_offset = 0; 7731 7732 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7733 bio->io_path->qpair->qpair, 7734 lba, lba_count, 7735 bdev_nvme_comparev_done, bio, flags, 7736 bdev_nvme_queued_reset_sgl, 
static int
bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = cmp_iov;
	bio->iovcnt = cmp_iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;
	bio->fused_iovs = write_iov;
	bio->fused_iovcnt = write_iovcnt;
	bio->fused_iovpos = 0;
	bio->fused_iov_offset = 0;

	if (bdev_io->num_retries == 0) {
		bio->first_fused_submitted = false;
		bio->first_fused_completed = false;
	}

	if (!bio->first_fused_submitted) {
		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		memset(&bio->cpl, 0, sizeof(bio->cpl));

		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
						       bdev_nvme_comparev_and_writev_done, bio, flags,
						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
		if (rc == 0) {
			bio->first_fused_submitted = true;
			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		} else {
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
			}
			return rc;
		}
	}

	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;

	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
					     bdev_nvme_comparev_and_writev_done, bio, flags,
					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("write failed: rc = %d\n", rc);
		rc = 0;
	}

	return rc;
}

static int
bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64;
	uint16_t num_ranges;
	int rc;

	num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
			 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
	if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
		SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
		return -EINVAL;
	}
	num_ranges = (uint16_t)num_ranges_u64;

	offset = offset_blocks;
	remaining = num_blocks;
	range = &dsm_ranges[0];

	/* Fill max-size ranges until the remaining blocks fit into one range */
	while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
		range->attributes.raw = 0;
		range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range->starting_lba = offset;

		offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
		range++;
	}

	/* Final range describes the remaining blocks */
	range->attributes.raw = 0;
	range->length = remaining;
	range->starting_lba = offset;

	rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
						 bio->io_path->qpair->qpair,
						 SPDK_NVME_DSM_ATTR_DEALLOCATE,
						 dsm_ranges, num_ranges,
						 bdev_nvme_queued_done, bio);

	return rc;
}

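/* Write Zeroes carries its block count in a 16-bit, 0's-based NLB field, so requests
 * larger than UINT16_MAX + 1 blocks are rejected up front rather than split.
 */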
static int
bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	if (num_blocks > UINT16_MAX + 1) {
		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
					     bio->io_path->qpair->qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;

	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}

static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first ctrlr which is not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
			  struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
			  size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
	 * require a nsid, so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
		       ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
		       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
}

static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}

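/* Offload a copy to the controller as a single Simple Copy command with one source
 * range; nlb is 0's-based, hence num_blocks - 1.
 */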
static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat);
	spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
	spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size);
	spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

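/* Emit a bdev_nvme_start_discovery RPC that recreates this discovery context. The
 * subnqn is cleared from the copied trid so it is not emitted as part of the RPC
 * parameters.
 */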
static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

#ifdef SPDK_CONFIG_NVME_CUSE
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
					  cuse_name, &cuse_name_size) != 0) {
		return;
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
#endif

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);
	if (nvme_ctrlr->psk != NULL) {
		spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk));
	} else if (nvme_ctrlr->opts.psk[0] != '\0') {
		spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk);
	}

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);
	if (opts->src_addr[0] != '\0') {
		spdk_json_write_named_string(w, "hostaddr", opts->src_addr);
	}
	if (opts->src_svcid[0] != '\0') {
		spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

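/* Top-level JSON config dump for this module: global options first, then one
 * bdev_nvme_attach_controller (and optional CUSE register) entry per controller,
 * then non-mDNS discovery contexts and mDNS discovery, and finally the hotplug
 * settings.
 */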
static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);

#ifdef SPDK_CONFIG_NVME_CUSE
			nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
#endif
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump this last to give all NVMe bdevs a chance to be constructed
	 * before enabling the hotplug poller.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

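/* Register the bdev_nvme trace object and its I/O start/done tracepoints, and relate
 * them to the NVMe transport-level submit/complete tracepoints so bdev-level I/O can
 * be correlated with driver-level activity in trace captures.
 */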
SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}