/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Tracks whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keeps track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};

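/* Transport ID of a controller to be skipped by the hotplug monitor. */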
struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

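	/* Match on the controller ID (CNTLID) reported in each controller's identify data. */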
	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
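	/* Unlink this nvme_ctrlr from its parent nbdev_ctrlr. The nbdev_ctrlr itself is
	 * freed only when its last controller has been removed.
	 */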
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

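	/* A new path is available, so drop the cached I/O path and let the next I/O re-select one. */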
	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid a heap-use-after-free error in this case, do not free the
	 * io_path here; free it when the associated qpair is freed. It is ensured that all
	 * I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when a reset starts and is set to failed if the reset fails. Hence, if a ctrlr is
 * unfailed, it is likely that it either works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io_to_abort;

	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			__bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
			nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

			bdev_nvme_clear_current_io_path(nbdev_ch);
			bio->io_path = NULL;

			if (any_io_path_may_become_available(nbdev_ch)) {
				bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
				return;
			}
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

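/* for_each_channel callback that clears the cached I/O path on one controller channel. */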
static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

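/* Callback passed to spdk_io_device_unregister() to release an nvme_bdev's resources. */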
static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct spdk_bdev_io *bdev_io;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, status, NULL);
	}

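	/* Move the iteration on to the next controller channel. */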
	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then, if an alternate trid exists within the controller, makes that next trid
 * the active one.
 *
 * The boolean return value requests that the caller disconnect the current trid now
 * so that the next trid can be tried.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. A trid is considered failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() was just called, or the next trid has not failed
		 * or been used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds. */
	return false;
}

static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
	OP_FAILOVER,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;

static _bdev_nvme_op_after_reset
bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes. */
*/ 1941 return OP_COMPLETE_PENDING_DESTRUCT; 1942 } else if (nvme_ctrlr->pending_failover) { 1943 nvme_ctrlr->pending_failover = false; 1944 nvme_ctrlr->reset_start_tsc = 0; 1945 return OP_FAILOVER; 1946 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1947 nvme_ctrlr->reset_start_tsc = 0; 1948 return OP_NONE; 1949 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1950 return OP_DESTRUCT; 1951 } else { 1952 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1953 nvme_ctrlr->fast_io_fail_timedout = true; 1954 } 1955 return OP_DELAYED_RECONNECT; 1956 } 1957 } 1958 1959 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1960 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1961 1962 static int 1963 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1964 { 1965 struct nvme_ctrlr *nvme_ctrlr = ctx; 1966 1967 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1968 pthread_mutex_lock(&nvme_ctrlr->mutex); 1969 1970 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1971 1972 if (!nvme_ctrlr->reconnect_is_delayed) { 1973 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1974 return SPDK_POLLER_BUSY; 1975 } 1976 1977 nvme_ctrlr->reconnect_is_delayed = false; 1978 1979 if (nvme_ctrlr->destruct) { 1980 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1981 return SPDK_POLLER_BUSY; 1982 } 1983 1984 assert(nvme_ctrlr->resetting == false); 1985 nvme_ctrlr->resetting = true; 1986 1987 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1988 1989 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1990 1991 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1992 return SPDK_POLLER_BUSY; 1993 } 1994 1995 static void 1996 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1997 { 1998 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 1999 2000 assert(nvme_ctrlr->reconnect_is_delayed == false); 2001 nvme_ctrlr->reconnect_is_delayed = true; 2002 2003 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2004 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2005 nvme_ctrlr, 2006 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2007 } 2008 2009 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2010 2011 static void 2012 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2013 { 2014 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2015 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2016 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2017 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2018 enum bdev_nvme_op_after_reset op_after_reset; 2019 2020 assert(nvme_ctrlr->thread == spdk_get_thread()); 2021 2022 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2023 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2024 2025 if (!success) { 2026 SPDK_ERRLOG("Resetting controller failed.\n"); 2027 } else { 2028 SPDK_NOTICELOG("Resetting controller successful.\n"); 2029 } 2030 2031 pthread_mutex_lock(&nvme_ctrlr->mutex); 2032 nvme_ctrlr->resetting = false; 2033 nvme_ctrlr->dont_retry = false; 2034 nvme_ctrlr->in_failover = false; 2035 2036 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2037 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2038 2039 /* Delay callbacks when the next operation is a failover. */ 2040 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2041 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2042 } 2043 2044 switch (op_after_reset) { 2045 case OP_COMPLETE_PENDING_DESTRUCT: 2046 nvme_ctrlr_unregister(nvme_ctrlr); 2047 break; 2048 case OP_DESTRUCT: 2049 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2050 remove_discovery_entry(nvme_ctrlr); 2051 break; 2052 case OP_DELAYED_RECONNECT: 2053 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2054 break; 2055 case OP_FAILOVER: 2056 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2057 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2058 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2059 break; 2060 default: 2061 break; 2062 } 2063 } 2064 2065 static void 2066 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2067 { 2068 pthread_mutex_lock(&nvme_ctrlr->mutex); 2069 if (!success) { 2070 /* Connecting the active trid failed. Set the next alternate trid to the 2071 * active trid if it exists. 2072 */ 2073 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2074 /* The next alternate trid exists and is ready to try. Try it now. */ 2075 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2076 2077 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2078 return; 2079 } 2080 2081 /* We came here if there is no alternate trid or if the next trid exists but 2082 * is not ready to try. We will try the active trid after reconnect_delay_sec 2083 * seconds if it is non-zero or at the next reset call otherwise. 2084 */ 2085 } else { 2086 /* Connecting the active trid succeeded. Clear the last failed time because it 2087 * means the trid is failed if its last failed time is non-zero. 2088 */ 2089 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2090 } 2091 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2092 2093 /* Make sure we clear any pending resets before returning. */ 2094 spdk_for_each_channel(nvme_ctrlr, 2095 bdev_nvme_complete_pending_resets, 2096 success ? NULL : (void *)0x1, 2097 _bdev_nvme_reset_ctrlr_complete); 2098 } 2099 2100 static void 2101 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2102 { 2103 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2104 2105 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2106 } 2107 2108 static void 2109 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2110 { 2111 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2112 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2113 struct nvme_qpair *nvme_qpair; 2114 2115 nvme_qpair = ctrlr_ch->qpair; 2116 assert(nvme_qpair != NULL); 2117 2118 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2119 2120 if (nvme_qpair->qpair != NULL) { 2121 if (nvme_qpair->ctrlr->dont_retry) { 2122 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2123 } 2124 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2125 2126 /* The current full reset sequence will move to the next 2127 * ctrlr_channel after the qpair is actually disconnected. 2128 */ 2129 assert(ctrlr_ch->reset_iter == NULL); 2130 ctrlr_ch->reset_iter = i; 2131 } else { 2132 spdk_for_each_channel_continue(i, 0); 2133 } 2134 } 2135 2136 static void 2137 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2138 { 2139 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2140 2141 if (status == 0) { 2142 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2143 } else { 2144 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
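 * Any qpairs that were created in this pass are disconnected again and the reset
 * sequence is finished with failure via bdev_nvme_reset_create_qpairs_failed().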
*/ 2145 spdk_for_each_channel(nvme_ctrlr, 2146 bdev_nvme_reset_destroy_qpair, 2147 NULL, 2148 bdev_nvme_reset_create_qpairs_failed); 2149 } 2150 } 2151 2152 static int 2153 bdev_nvme_reset_check_qpair_connected(void *ctx) 2154 { 2155 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2156 2157 if (ctrlr_ch->reset_iter == NULL) { 2158 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2159 assert(ctrlr_ch->connect_poller == NULL); 2160 assert(ctrlr_ch->qpair->qpair == NULL); 2161 return SPDK_POLLER_BUSY; 2162 } 2163 2164 assert(ctrlr_ch->qpair->qpair != NULL); 2165 2166 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2167 return SPDK_POLLER_BUSY; 2168 } 2169 2170 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2171 2172 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2173 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2174 ctrlr_ch->reset_iter = NULL; 2175 2176 if (!g_opts.disable_auto_failback) { 2177 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2178 } 2179 2180 return SPDK_POLLER_BUSY; 2181 } 2182 2183 static void 2184 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2185 { 2186 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2187 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2188 int rc; 2189 2190 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2191 if (rc == 0) { 2192 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2193 ctrlr_ch, 0); 2194 2195 /* The current full reset sequence will move to the next 2196 * ctrlr_channel after the qpair is actually connected. 2197 */ 2198 assert(ctrlr_ch->reset_iter == NULL); 2199 ctrlr_ch->reset_iter = i; 2200 } else { 2201 spdk_for_each_channel_continue(i, rc); 2202 } 2203 } 2204 2205 static int 2206 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2207 { 2208 struct nvme_ctrlr *nvme_ctrlr = arg; 2209 int rc = -ETIMEDOUT; 2210 2211 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2212 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2213 if (rc == -EAGAIN) { 2214 return SPDK_POLLER_BUSY; 2215 } 2216 } 2217 2218 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2219 if (rc == 0) { 2220 /* Recreate all of the I/O queue pairs */ 2221 spdk_for_each_channel(nvme_ctrlr, 2222 bdev_nvme_reset_create_qpair, 2223 NULL, 2224 bdev_nvme_reset_create_qpairs_done); 2225 } else { 2226 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2227 } 2228 return SPDK_POLLER_BUSY; 2229 } 2230 2231 static void 2232 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2233 { 2234 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2235 2236 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2237 assert(nvme_ctrlr->reset_detach_poller == NULL); 2238 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2239 nvme_ctrlr, 0); 2240 } 2241 2242 static void 2243 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2244 { 2245 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2246 2247 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2248 assert(status == 0); 2249 2250 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2251 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2252 } else { 2253 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2254 } 2255 } 2256 2257 static void 2258 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2259 { 2260 
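/* Disconnect the I/O qpair of every ctrlr_channel as the first step of a full reset.
 * For reference, the reset sequence implemented in this file is roughly (fabrics shown;
 * for PCIe the ctrlr disconnect happens before the qpairs are destroyed):
 *
 *   bdev_nvme_reset_ctrlr()                 set the resetting flag under the mutex
 *     -> _bdev_nvme_reset_ctrlr()           on nvme_ctrlr->thread
 *     -> bdev_nvme_reset_destroy_qpairs()   disconnect every I/O qpair
 *     -> nvme_ctrlr_disconnect() + bdev_nvme_reconnect_ctrlr()   async ctrlr reconnect
 *     -> bdev_nvme_reset_create_qpair()     recreate one qpair per ctrlr_channel
 *     -> bdev_nvme_reset_ctrlr_complete()   pick the follow-up op (destruct/failover/retry)
 */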
spdk_for_each_channel(nvme_ctrlr, 2261 bdev_nvme_reset_destroy_qpair, 2262 NULL, 2263 bdev_nvme_reset_destroy_qpair_done); 2264 } 2265 2266 static void 2267 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2268 { 2269 struct nvme_ctrlr *nvme_ctrlr = ctx; 2270 2271 assert(nvme_ctrlr->resetting == true); 2272 assert(nvme_ctrlr->thread == spdk_get_thread()); 2273 2274 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2275 2276 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2277 2278 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2279 } 2280 2281 static void 2282 _bdev_nvme_reset_ctrlr(void *ctx) 2283 { 2284 struct nvme_ctrlr *nvme_ctrlr = ctx; 2285 2286 assert(nvme_ctrlr->resetting == true); 2287 assert(nvme_ctrlr->thread == spdk_get_thread()); 2288 2289 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2290 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2291 } else { 2292 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2293 } 2294 } 2295 2296 static int 2297 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2298 { 2299 spdk_msg_fn msg_fn; 2300 2301 pthread_mutex_lock(&nvme_ctrlr->mutex); 2302 if (nvme_ctrlr->destruct) { 2303 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2304 return -ENXIO; 2305 } 2306 2307 if (nvme_ctrlr->resetting) { 2308 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2309 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2310 return -EBUSY; 2311 } 2312 2313 if (nvme_ctrlr->disabled) { 2314 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2315 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2316 return -EALREADY; 2317 } 2318 2319 nvme_ctrlr->resetting = true; 2320 nvme_ctrlr->dont_retry = true; 2321 2322 if (nvme_ctrlr->reconnect_is_delayed) { 2323 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2324 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2325 nvme_ctrlr->reconnect_is_delayed = false; 2326 } else { 2327 msg_fn = _bdev_nvme_reset_ctrlr; 2328 assert(nvme_ctrlr->reset_start_tsc == 0); 2329 } 2330 2331 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2332 2333 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2334 2335 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2336 return 0; 2337 } 2338 2339 static int 2340 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2341 { 2342 pthread_mutex_lock(&nvme_ctrlr->mutex); 2343 if (nvme_ctrlr->destruct) { 2344 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2345 return -ENXIO; 2346 } 2347 2348 if (nvme_ctrlr->resetting) { 2349 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2350 return -EBUSY; 2351 } 2352 2353 if (!nvme_ctrlr->disabled) { 2354 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2355 return -EALREADY; 2356 } 2357 2358 nvme_ctrlr->disabled = false; 2359 nvme_ctrlr->resetting = true; 2360 2361 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2362 2363 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2364 2365 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2366 return 0; 2367 } 2368 2369 static void 2370 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2371 { 2372 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2373 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2374 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2375 enum bdev_nvme_op_after_reset op_after_disable; 2376 2377 assert(nvme_ctrlr->thread == spdk_get_thread()); 2378 2379 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2380 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2381 2382 pthread_mutex_lock(&nvme_ctrlr->mutex); 2383 2384 
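/* Clear the reset state under the lock, then mark the ctrlr disabled and pause its adminq
 * poller. A pending destruct, if any, is handled after the user callback fires below. */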
nvme_ctrlr->resetting = false; 2385 nvme_ctrlr->dont_retry = false; 2386 2387 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2388 2389 nvme_ctrlr->disabled = true; 2390 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2391 2392 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2393 2394 if (ctrlr_op_cb_fn) { 2395 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2396 } 2397 2398 switch (op_after_disable) { 2399 case OP_COMPLETE_PENDING_DESTRUCT: 2400 nvme_ctrlr_unregister(nvme_ctrlr); 2401 break; 2402 default: 2403 break; 2404 } 2405 2406 } 2407 2408 static void 2409 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2410 { 2411 /* Make sure we clear any pending resets before returning. */ 2412 spdk_for_each_channel(nvme_ctrlr, 2413 bdev_nvme_complete_pending_resets, 2414 NULL, 2415 _bdev_nvme_disable_ctrlr_complete); 2416 } 2417 2418 static void 2419 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2420 { 2421 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2422 2423 assert(status == 0); 2424 2425 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2426 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2427 } else { 2428 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2429 } 2430 } 2431 2432 static void 2433 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2434 { 2435 spdk_for_each_channel(nvme_ctrlr, 2436 bdev_nvme_reset_destroy_qpair, 2437 NULL, 2438 bdev_nvme_disable_destroy_qpairs_done); 2439 } 2440 2441 static void 2442 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2443 { 2444 struct nvme_ctrlr *nvme_ctrlr = ctx; 2445 2446 assert(nvme_ctrlr->resetting == true); 2447 assert(nvme_ctrlr->thread == spdk_get_thread()); 2448 2449 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2450 2451 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2452 } 2453 2454 static void 2455 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2456 { 2457 struct nvme_ctrlr *nvme_ctrlr = ctx; 2458 2459 assert(nvme_ctrlr->resetting == true); 2460 assert(nvme_ctrlr->thread == spdk_get_thread()); 2461 2462 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2463 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2464 } else { 2465 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2466 } 2467 } 2468 2469 static int 2470 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2471 { 2472 spdk_msg_fn msg_fn; 2473 2474 pthread_mutex_lock(&nvme_ctrlr->mutex); 2475 if (nvme_ctrlr->destruct) { 2476 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2477 return -ENXIO; 2478 } 2479 2480 if (nvme_ctrlr->resetting) { 2481 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2482 return -EBUSY; 2483 } 2484 2485 if (nvme_ctrlr->disabled) { 2486 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2487 return -EALREADY; 2488 } 2489 2490 nvme_ctrlr->resetting = true; 2491 nvme_ctrlr->dont_retry = true; 2492 2493 if (nvme_ctrlr->reconnect_is_delayed) { 2494 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2495 nvme_ctrlr->reconnect_is_delayed = false; 2496 } else { 2497 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2498 } 2499 2500 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2501 2502 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2503 2504 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2505 return 0; 2506 } 2507 2508 static int 2509 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2510 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2511 { 2512 int rc; 2513 2514 switch (op) { 2515 case 
NVME_CTRLR_OP_RESET: 2516 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2517 break; 2518 case NVME_CTRLR_OP_ENABLE: 2519 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2520 break; 2521 case NVME_CTRLR_OP_DISABLE: 2522 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2523 break; 2524 default: 2525 rc = -EINVAL; 2526 break; 2527 } 2528 2529 if (rc == 0) { 2530 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2531 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2532 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2533 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2534 } 2535 return rc; 2536 } 2537 2538 struct nvme_ctrlr_op_rpc_ctx { 2539 struct nvme_ctrlr *nvme_ctrlr; 2540 struct spdk_thread *orig_thread; 2541 enum nvme_ctrlr_op op; 2542 int rc; 2543 bdev_nvme_ctrlr_op_cb cb_fn; 2544 void *cb_arg; 2545 }; 2546 2547 static void 2548 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2549 { 2550 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2551 2552 assert(ctx != NULL); 2553 assert(ctx->cb_fn != NULL); 2554 2555 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2556 2557 free(ctx); 2558 } 2559 2560 static void 2561 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2562 { 2563 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2564 2565 ctx->rc = rc; 2566 2567 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2568 } 2569 2570 void 2571 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2572 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2573 { 2574 struct nvme_ctrlr_op_rpc_ctx *ctx; 2575 int rc; 2576 2577 assert(cb_fn != NULL); 2578 2579 ctx = calloc(1, sizeof(*ctx)); 2580 if (ctx == NULL) { 2581 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2582 cb_fn(cb_arg, -ENOMEM); 2583 return; 2584 } 2585 2586 ctx->orig_thread = spdk_get_thread(); 2587 ctx->cb_fn = cb_fn; 2588 ctx->cb_arg = cb_arg; 2589 2590 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2591 if (rc == 0) { 2592 return; 2593 } else if (rc == -EALREADY) { 2594 rc = 0; 2595 } 2596 2597 nvme_ctrlr_op_rpc_complete(ctx, rc); 2598 } 2599 2600 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2601 2602 static void 2603 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2604 { 2605 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2606 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2607 int rc; 2608 2609 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2610 ctx->nvme_ctrlr = NULL; 2611 2612 if (ctx->rc != 0) { 2613 goto complete; 2614 } 2615 2616 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2617 if (next_nvme_ctrlr == NULL) { 2618 goto complete; 2619 } 2620 2621 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2622 if (rc == 0) { 2623 ctx->nvme_ctrlr = next_nvme_ctrlr; 2624 return; 2625 } else if (rc == -EALREADY) { 2626 ctx->nvme_ctrlr = next_nvme_ctrlr; 2627 rc = 0; 2628 } 2629 2630 ctx->rc = rc; 2631 2632 complete: 2633 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2634 free(ctx); 2635 } 2636 2637 static void 2638 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2639 { 2640 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2641 2642 ctx->rc = rc; 2643 2644 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2645 } 2646 2647 void 2648 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2649 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2650 { 2651 struct nvme_ctrlr_op_rpc_ctx *ctx; 2652 struct nvme_ctrlr *nvme_ctrlr; 2653 int rc; 2654 2655 assert(cb_fn != NULL); 2656 2657 ctx = calloc(1, sizeof(*ctx)); 2658 if (ctx == NULL) { 2659 SPDK_ERRLOG("Failed to allocate 
nvme_ctrlr_op_rpc_ctx.\n"); 2660 cb_fn(cb_arg, -ENOMEM); 2661 return; 2662 } 2663 2664 ctx->orig_thread = spdk_get_thread(); 2665 ctx->op = op; 2666 ctx->cb_fn = cb_fn; 2667 ctx->cb_arg = cb_arg; 2668 2669 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2670 assert(nvme_ctrlr != NULL); 2671 2672 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2673 if (rc == 0) { 2674 ctx->nvme_ctrlr = nvme_ctrlr; 2675 return; 2676 } else if (rc == -EALREADY) { 2677 ctx->nvme_ctrlr = nvme_ctrlr; 2678 rc = 0; 2679 } 2680 2681 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2682 } 2683 2684 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2685 2686 static void 2687 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2688 { 2689 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2690 enum spdk_bdev_io_status io_status; 2691 2692 if (bio->cpl.cdw0 == 0) { 2693 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2694 } else { 2695 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2696 } 2697 2698 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2699 } 2700 2701 static void 2702 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2703 { 2704 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2705 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2706 2707 bdev_nvme_abort_retry_ios(nbdev_ch); 2708 2709 spdk_for_each_channel_continue(i, 0); 2710 } 2711 2712 static void 2713 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2714 { 2715 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2716 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2717 2718 /* Abort all queued I/Os for retry. */ 2719 spdk_for_each_channel(nbdev, 2720 bdev_nvme_abort_bdev_channel, 2721 bio, 2722 _bdev_nvme_reset_io_complete); 2723 } 2724 2725 static void 2726 _bdev_nvme_reset_io_continue(void *ctx) 2727 { 2728 struct nvme_bdev_io *bio = ctx; 2729 struct nvme_io_path *prev_io_path, *next_io_path; 2730 int rc; 2731 2732 prev_io_path = bio->io_path; 2733 bio->io_path = NULL; 2734 2735 if (bio->cpl.cdw0 != 0) { 2736 goto complete; 2737 } 2738 2739 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2740 if (next_io_path == NULL) { 2741 goto complete; 2742 } 2743 2744 rc = _bdev_nvme_reset_io(next_io_path, bio); 2745 if (rc == 0) { 2746 return; 2747 } 2748 2749 bio->cpl.cdw0 = 1; 2750 2751 complete: 2752 bdev_nvme_reset_io_complete(bio); 2753 } 2754 2755 static void 2756 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2757 { 2758 struct nvme_bdev_io *bio = cb_arg; 2759 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2760 2761 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2762 2763 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2764 } 2765 2766 static int 2767 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2768 { 2769 struct nvme_ctrlr_channel *ctrlr_ch; 2770 struct spdk_bdev_io *bdev_io; 2771 int rc; 2772 2773 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2774 bdev_nvme_reset_io_continue, bio); 2775 if (rc == 0) { 2776 assert(bio->io_path == NULL); 2777 bio->io_path = io_path; 2778 } else if (rc == -EBUSY) { 2779 ctrlr_ch = io_path->qpair->ctrlr_ch; 2780 assert(ctrlr_ch != NULL); 2781 /* 2782 * Reset call is queued only if it is from the app framework. This is on purpose so that 2783 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2784 * upper level. 
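 * The queued bdev_io is completed later by bdev_nvme_complete_pending_resets(), once the
 * in-flight reset sequence finishes.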
If they are in the middle of a reset, we won't try to schedule another one. 2785 */ 2786 bdev_io = spdk_bdev_io_from_ctx(bio); 2787 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2788 rc = 0; 2789 } 2790 2791 return rc; 2792 } 2793 2794 static void 2795 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2796 { 2797 struct nvme_io_path *io_path; 2798 int rc; 2799 2800 bio->cpl.cdw0 = 0; 2801 2802 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2803 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2804 assert(io_path != NULL); 2805 2806 rc = _bdev_nvme_reset_io(io_path, bio); 2807 if (rc != 0) { 2808 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2809 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2810 } 2811 } 2812 2813 static int 2814 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2815 { 2816 if (nvme_ctrlr->destruct) { 2817 /* Don't bother resetting if the controller is in the process of being destructed. */ 2818 return -ENXIO; 2819 } 2820 2821 if (nvme_ctrlr->resetting) { 2822 if (!nvme_ctrlr->in_failover) { 2823 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2824 2825 /* Defer failover until reset completes. */ 2826 nvme_ctrlr->pending_failover = true; 2827 return -EINPROGRESS; 2828 } else { 2829 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2830 return -EBUSY; 2831 } 2832 } 2833 2834 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2835 2836 if (nvme_ctrlr->reconnect_is_delayed) { 2837 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2838 2839 /* We rely on the next reconnect for the failover. */ 2840 return -EALREADY; 2841 } 2842 2843 if (nvme_ctrlr->disabled) { 2844 SPDK_NOTICELOG("Controller is disabled.\n"); 2845 2846 /* We rely on the enablement for the failover. 
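 * bdev_nvme_enable_ctrlr() will reconnect using the active trid, which bdev_nvme_failover_trid()
 * has already advanced above.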
*/ 2847 return -EALREADY; 2848 } 2849 2850 nvme_ctrlr->resetting = true; 2851 nvme_ctrlr->in_failover = true; 2852 2853 assert(nvme_ctrlr->reset_start_tsc == 0); 2854 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2855 2856 return 0; 2857 } 2858 2859 static int 2860 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2861 { 2862 int rc; 2863 2864 pthread_mutex_lock(&nvme_ctrlr->mutex); 2865 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2866 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2867 2868 if (rc == 0) { 2869 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2870 } else if (rc == -EALREADY) { 2871 rc = 0; 2872 } 2873 2874 return rc; 2875 } 2876 2877 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2878 uint64_t num_blocks); 2879 2880 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2881 uint64_t num_blocks); 2882 2883 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2884 uint64_t src_offset_blocks, 2885 uint64_t num_blocks); 2886 2887 static void 2888 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2889 bool success) 2890 { 2891 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2892 int ret; 2893 2894 if (!success) { 2895 ret = -EINVAL; 2896 goto exit; 2897 } 2898 2899 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2900 ret = -ENXIO; 2901 goto exit; 2902 } 2903 2904 ret = bdev_nvme_readv(bio, 2905 bdev_io->u.bdev.iovs, 2906 bdev_io->u.bdev.iovcnt, 2907 bdev_io->u.bdev.md_buf, 2908 bdev_io->u.bdev.num_blocks, 2909 bdev_io->u.bdev.offset_blocks, 2910 bdev_io->u.bdev.dif_check_flags, 2911 bdev_io->u.bdev.memory_domain, 2912 bdev_io->u.bdev.memory_domain_ctx, 2913 bdev_io->u.bdev.accel_sequence); 2914 2915 exit: 2916 if (spdk_unlikely(ret != 0)) { 2917 bdev_nvme_io_complete(bio, ret); 2918 } 2919 } 2920 2921 static inline void 2922 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2923 { 2924 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2925 struct spdk_bdev *bdev = bdev_io->bdev; 2926 struct nvme_bdev_io *nbdev_io_to_abort; 2927 int rc = 0; 2928 2929 switch (bdev_io->type) { 2930 case SPDK_BDEV_IO_TYPE_READ: 2931 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2932 2933 rc = bdev_nvme_readv(nbdev_io, 2934 bdev_io->u.bdev.iovs, 2935 bdev_io->u.bdev.iovcnt, 2936 bdev_io->u.bdev.md_buf, 2937 bdev_io->u.bdev.num_blocks, 2938 bdev_io->u.bdev.offset_blocks, 2939 bdev_io->u.bdev.dif_check_flags, 2940 bdev_io->u.bdev.memory_domain, 2941 bdev_io->u.bdev.memory_domain_ctx, 2942 bdev_io->u.bdev.accel_sequence); 2943 } else { 2944 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2945 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2946 rc = 0; 2947 } 2948 break; 2949 case SPDK_BDEV_IO_TYPE_WRITE: 2950 rc = bdev_nvme_writev(nbdev_io, 2951 bdev_io->u.bdev.iovs, 2952 bdev_io->u.bdev.iovcnt, 2953 bdev_io->u.bdev.md_buf, 2954 bdev_io->u.bdev.num_blocks, 2955 bdev_io->u.bdev.offset_blocks, 2956 bdev_io->u.bdev.dif_check_flags, 2957 bdev_io->u.bdev.memory_domain, 2958 bdev_io->u.bdev.memory_domain_ctx, 2959 bdev_io->u.bdev.accel_sequence); 2960 break; 2961 case SPDK_BDEV_IO_TYPE_COMPARE: 2962 rc = bdev_nvme_comparev(nbdev_io, 2963 bdev_io->u.bdev.iovs, 2964 bdev_io->u.bdev.iovcnt, 2965 bdev_io->u.bdev.md_buf, 2966 bdev_io->u.bdev.num_blocks, 2967 bdev_io->u.bdev.offset_blocks, 2968 bdev_io->u.bdev.dif_check_flags); 
2969 break; 2970 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2971 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2972 bdev_io->u.bdev.iovs, 2973 bdev_io->u.bdev.iovcnt, 2974 bdev_io->u.bdev.fused_iovs, 2975 bdev_io->u.bdev.fused_iovcnt, 2976 bdev_io->u.bdev.md_buf, 2977 bdev_io->u.bdev.num_blocks, 2978 bdev_io->u.bdev.offset_blocks, 2979 bdev_io->u.bdev.dif_check_flags); 2980 break; 2981 case SPDK_BDEV_IO_TYPE_UNMAP: 2982 rc = bdev_nvme_unmap(nbdev_io, 2983 bdev_io->u.bdev.offset_blocks, 2984 bdev_io->u.bdev.num_blocks); 2985 break; 2986 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2987 rc = bdev_nvme_write_zeroes(nbdev_io, 2988 bdev_io->u.bdev.offset_blocks, 2989 bdev_io->u.bdev.num_blocks); 2990 break; 2991 case SPDK_BDEV_IO_TYPE_RESET: 2992 nbdev_io->io_path = NULL; 2993 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2994 return; 2995 2996 case SPDK_BDEV_IO_TYPE_FLUSH: 2997 bdev_nvme_io_complete(nbdev_io, 0); 2998 return; 2999 3000 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3001 rc = bdev_nvme_zone_appendv(nbdev_io, 3002 bdev_io->u.bdev.iovs, 3003 bdev_io->u.bdev.iovcnt, 3004 bdev_io->u.bdev.md_buf, 3005 bdev_io->u.bdev.num_blocks, 3006 bdev_io->u.bdev.offset_blocks, 3007 bdev_io->u.bdev.dif_check_flags); 3008 break; 3009 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3010 rc = bdev_nvme_get_zone_info(nbdev_io, 3011 bdev_io->u.zone_mgmt.zone_id, 3012 bdev_io->u.zone_mgmt.num_zones, 3013 bdev_io->u.zone_mgmt.buf); 3014 break; 3015 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3016 rc = bdev_nvme_zone_management(nbdev_io, 3017 bdev_io->u.zone_mgmt.zone_id, 3018 bdev_io->u.zone_mgmt.zone_action); 3019 break; 3020 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3021 nbdev_io->io_path = NULL; 3022 bdev_nvme_admin_passthru(nbdev_ch, 3023 nbdev_io, 3024 &bdev_io->u.nvme_passthru.cmd, 3025 bdev_io->u.nvme_passthru.buf, 3026 bdev_io->u.nvme_passthru.nbytes); 3027 return; 3028 3029 case SPDK_BDEV_IO_TYPE_NVME_IO: 3030 rc = bdev_nvme_io_passthru(nbdev_io, 3031 &bdev_io->u.nvme_passthru.cmd, 3032 bdev_io->u.nvme_passthru.buf, 3033 bdev_io->u.nvme_passthru.nbytes); 3034 break; 3035 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3036 rc = bdev_nvme_io_passthru_md(nbdev_io, 3037 &bdev_io->u.nvme_passthru.cmd, 3038 bdev_io->u.nvme_passthru.buf, 3039 bdev_io->u.nvme_passthru.nbytes, 3040 bdev_io->u.nvme_passthru.md_buf, 3041 bdev_io->u.nvme_passthru.md_len); 3042 break; 3043 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3044 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3045 &bdev_io->u.nvme_passthru.cmd, 3046 bdev_io->u.nvme_passthru.iovs, 3047 bdev_io->u.nvme_passthru.iovcnt, 3048 bdev_io->u.nvme_passthru.nbytes, 3049 bdev_io->u.nvme_passthru.md_buf, 3050 bdev_io->u.nvme_passthru.md_len); 3051 break; 3052 case SPDK_BDEV_IO_TYPE_ABORT: 3053 nbdev_io->io_path = NULL; 3054 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3055 bdev_nvme_abort(nbdev_ch, 3056 nbdev_io, 3057 nbdev_io_to_abort); 3058 return; 3059 3060 case SPDK_BDEV_IO_TYPE_COPY: 3061 rc = bdev_nvme_copy(nbdev_io, 3062 bdev_io->u.bdev.offset_blocks, 3063 bdev_io->u.bdev.copy.src_offset_blocks, 3064 bdev_io->u.bdev.num_blocks); 3065 break; 3066 default: 3067 rc = -EINVAL; 3068 break; 3069 } 3070 3071 if (spdk_unlikely(rc != 0)) { 3072 bdev_nvme_io_complete(nbdev_io, rc); 3073 } 3074 } 3075 3076 static void 3077 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3078 { 3079 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3080 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3081 3082 if 
(spdk_likely(nbdev_io->submit_tsc == 0)) { 3083 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3084 } else { 3085 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3086 * We need to update submit_tsc here. 3087 */ 3088 nbdev_io->submit_tsc = spdk_get_ticks(); 3089 } 3090 3091 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3092 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3093 if (spdk_unlikely(!nbdev_io->io_path)) { 3094 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3095 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3096 return; 3097 } 3098 3099 /* Admin commands do not use the optimal I/O path. 3100 * Simply fall through even if it is not found. 3101 */ 3102 } 3103 3104 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3105 } 3106 3107 static bool 3108 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3109 { 3110 struct nvme_bdev *nbdev = ctx; 3111 struct nvme_ns *nvme_ns; 3112 struct spdk_nvme_ns *ns; 3113 struct spdk_nvme_ctrlr *ctrlr; 3114 const struct spdk_nvme_ctrlr_data *cdata; 3115 3116 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3117 assert(nvme_ns != NULL); 3118 ns = nvme_ns->ns; 3119 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3120 3121 switch (io_type) { 3122 case SPDK_BDEV_IO_TYPE_READ: 3123 case SPDK_BDEV_IO_TYPE_WRITE: 3124 case SPDK_BDEV_IO_TYPE_RESET: 3125 case SPDK_BDEV_IO_TYPE_FLUSH: 3126 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3127 case SPDK_BDEV_IO_TYPE_NVME_IO: 3128 case SPDK_BDEV_IO_TYPE_ABORT: 3129 return true; 3130 3131 case SPDK_BDEV_IO_TYPE_COMPARE: 3132 return spdk_nvme_ns_supports_compare(ns); 3133 3134 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3135 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3136 3137 case SPDK_BDEV_IO_TYPE_UNMAP: 3138 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3139 return cdata->oncs.dsm; 3140 3141 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3142 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3143 return cdata->oncs.write_zeroes; 3144 3145 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3146 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3147 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3148 return true; 3149 } 3150 return false; 3151 3152 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3153 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3154 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3155 3156 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3157 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3158 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3159 3160 case SPDK_BDEV_IO_TYPE_COPY: 3161 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3162 return cdata->oncs.copy; 3163 3164 default: 3165 return false; 3166 } 3167 } 3168 3169 static int 3170 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3171 { 3172 struct nvme_qpair *nvme_qpair; 3173 struct spdk_io_channel *pg_ch; 3174 int rc; 3175 3176 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3177 if (!nvme_qpair) { 3178 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3179 return -1; 3180 } 3181 3182 TAILQ_INIT(&nvme_qpair->io_path_list); 3183 3184 nvme_qpair->ctrlr = nvme_ctrlr; 3185 nvme_qpair->ctrlr_ch = ctrlr_ch; 3186 3187 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3188 if (!pg_ch) { 3189 free(nvme_qpair); 3190 return -1; 3191 } 3192 3193 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3194 3195 #ifdef SPDK_CONFIG_VTUNE 3196 nvme_qpair->group->collect_spin_stat = true; 3197 #else 3198 nvme_qpair->group->collect_spin_stat = false; 3199 #endif 3200 3201 if (!nvme_ctrlr->disabled) { 3202 /* If 
a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3203 * be created when it's enabled. 3204 */ 3205 rc = bdev_nvme_create_qpair(nvme_qpair); 3206 if (rc != 0) { 3207 /* nvme_ctrlr can't create IO qpair if connection is down. 3208 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3209 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3210 * submitted IO will be queued until IO qpair is successfully created. 3211 * 3212 * Hence, if both are satisfied, ignore the failure. 3213 */ 3214 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3215 spdk_put_io_channel(pg_ch); 3216 free(nvme_qpair); 3217 return rc; 3218 } 3219 } 3220 } 3221 3222 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3223 3224 ctrlr_ch->qpair = nvme_qpair; 3225 3226 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3227 nvme_qpair->ctrlr->ref++; 3228 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3229 3230 return 0; 3231 } 3232 3233 static int 3234 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3235 { 3236 struct nvme_ctrlr *nvme_ctrlr = io_device; 3237 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3238 3239 TAILQ_INIT(&ctrlr_ch->pending_resets); 3240 3241 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3242 } 3243 3244 static void 3245 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3246 { 3247 struct nvme_io_path *io_path, *next; 3248 3249 assert(nvme_qpair->group != NULL); 3250 3251 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3252 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3253 nvme_io_path_free(io_path); 3254 } 3255 3256 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3257 3258 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3259 3260 nvme_ctrlr_release(nvme_qpair->ctrlr); 3261 3262 free(nvme_qpair); 3263 } 3264 3265 static void 3266 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3267 { 3268 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3269 struct nvme_qpair *nvme_qpair; 3270 3271 nvme_qpair = ctrlr_ch->qpair; 3272 assert(nvme_qpair != NULL); 3273 3274 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3275 3276 if (nvme_qpair->qpair != NULL) { 3277 if (ctrlr_ch->reset_iter == NULL) { 3278 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3279 } else { 3280 /* Skip current ctrlr_channel in a full reset sequence because 3281 * it is being deleted now. The qpair is already being disconnected. 3282 * We do not have to restart disconnecting it. 3283 */ 3284 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3285 } 3286 3287 /* We cannot release a reference to the poll group now. 3288 * The qpair may be disconnected asynchronously later. 3289 * We need to poll it until it is actually disconnected. 3290 * Just detach the qpair from the deleting ctrlr_channel. 
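 * Once the disconnect actually completes, the now-orphaned nvme_qpair is expected to be freed
 * via nvme_qpair_delete() from the poll group's disconnect handling (not shown here).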
3291 */ 3292 nvme_qpair->ctrlr_ch = NULL; 3293 } else { 3294 assert(ctrlr_ch->reset_iter == NULL); 3295 3296 nvme_qpair_delete(nvme_qpair); 3297 } 3298 } 3299 3300 static inline struct spdk_io_channel * 3301 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3302 { 3303 if (spdk_unlikely(!group->accel_channel)) { 3304 group->accel_channel = spdk_accel_get_io_channel(); 3305 if (!group->accel_channel) { 3306 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3307 group); 3308 return NULL; 3309 } 3310 } 3311 3312 return group->accel_channel; 3313 } 3314 3315 static void 3316 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3317 uint32_t iov_cnt, uint32_t seed, 3318 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3319 { 3320 struct spdk_io_channel *accel_ch; 3321 struct nvme_poll_group *group = ctx; 3322 int rc; 3323 3324 assert(cb_fn != NULL); 3325 3326 accel_ch = bdev_nvme_get_accel_channel(group); 3327 if (spdk_unlikely(accel_ch == NULL)) { 3328 cb_fn(cb_arg, -ENOMEM); 3329 return; 3330 } 3331 3332 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3333 if (rc) { 3334 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3335 if (rc == -ENOMEM || rc == -EINVAL) { 3336 cb_fn(cb_arg, rc); 3337 } 3338 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3339 } 3340 } 3341 3342 static void 3343 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3344 { 3345 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3346 } 3347 3348 static void 3349 bdev_nvme_abort_sequence(void *seq) 3350 { 3351 spdk_accel_sequence_abort(seq); 3352 } 3353 3354 static void 3355 bdev_nvme_reverse_sequence(void *seq) 3356 { 3357 spdk_accel_sequence_reverse(seq); 3358 } 3359 3360 static int 3361 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3362 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3363 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3364 { 3365 struct spdk_io_channel *ch; 3366 struct nvme_poll_group *group = ctx; 3367 3368 ch = bdev_nvme_get_accel_channel(group); 3369 if (spdk_unlikely(ch == NULL)) { 3370 return -ENOMEM; 3371 } 3372 3373 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3374 domain, domain_ctx, seed, cb_fn, cb_arg); 3375 } 3376 3377 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3378 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3379 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3380 .append_crc32c = bdev_nvme_append_crc32c, 3381 .finish_sequence = bdev_nvme_finish_sequence, 3382 .reverse_sequence = bdev_nvme_reverse_sequence, 3383 .abort_sequence = bdev_nvme_abort_sequence, 3384 }; 3385 3386 static int 3387 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3388 { 3389 struct nvme_poll_group *group = ctx_buf; 3390 3391 TAILQ_INIT(&group->qpair_list); 3392 3393 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3394 if (group->group == NULL) { 3395 return -1; 3396 } 3397 3398 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3399 3400 if (group->poller == NULL) { 3401 spdk_nvme_poll_group_destroy(group->group); 3402 return -1; 3403 } 3404 3405 return 0; 3406 } 3407 3408 static void 3409 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3410 { 3411 struct 
nvme_poll_group *group = ctx_buf; 3412 3413 assert(TAILQ_EMPTY(&group->qpair_list)); 3414 3415 if (group->accel_channel) { 3416 spdk_put_io_channel(group->accel_channel); 3417 } 3418 3419 spdk_poller_unregister(&group->poller); 3420 if (spdk_nvme_poll_group_destroy(group->group)) { 3421 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3422 assert(false); 3423 } 3424 } 3425 3426 static struct spdk_io_channel * 3427 bdev_nvme_get_io_channel(void *ctx) 3428 { 3429 struct nvme_bdev *nvme_bdev = ctx; 3430 3431 return spdk_get_io_channel(nvme_bdev); 3432 } 3433 3434 static void * 3435 bdev_nvme_get_module_ctx(void *ctx) 3436 { 3437 struct nvme_bdev *nvme_bdev = ctx; 3438 struct nvme_ns *nvme_ns; 3439 3440 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3441 return NULL; 3442 } 3443 3444 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3445 if (!nvme_ns) { 3446 return NULL; 3447 } 3448 3449 return nvme_ns->ns; 3450 } 3451 3452 static const char * 3453 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3454 { 3455 switch (ana_state) { 3456 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3457 return "optimized"; 3458 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3459 return "non_optimized"; 3460 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3461 return "inaccessible"; 3462 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3463 return "persistent_loss"; 3464 case SPDK_NVME_ANA_CHANGE_STATE: 3465 return "change"; 3466 default: 3467 return NULL; 3468 } 3469 } 3470 3471 static int 3472 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3473 { 3474 struct spdk_memory_domain **_domains = NULL; 3475 struct nvme_bdev *nbdev = ctx; 3476 struct nvme_ns *nvme_ns; 3477 int i = 0, _array_size = array_size; 3478 int rc = 0; 3479 3480 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3481 if (domains && array_size >= i) { 3482 _domains = &domains[i]; 3483 } else { 3484 _domains = NULL; 3485 } 3486 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3487 if (rc > 0) { 3488 i += rc; 3489 if (_array_size >= rc) { 3490 _array_size -= rc; 3491 } else { 3492 _array_size = 0; 3493 } 3494 } else if (rc < 0) { 3495 return rc; 3496 } 3497 } 3498 3499 return i; 3500 } 3501 3502 static const char * 3503 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3504 { 3505 if (nvme_ctrlr->destruct) { 3506 return "deleting"; 3507 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3508 return "failed"; 3509 } else if (nvme_ctrlr->resetting) { 3510 return "resetting"; 3511 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3512 return "reconnect_is_delayed"; 3513 } else if (nvme_ctrlr->disabled) { 3514 return "disabled"; 3515 } else { 3516 return "enabled"; 3517 } 3518 } 3519 3520 void 3521 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3522 { 3523 struct spdk_nvme_transport_id *trid; 3524 const struct spdk_nvme_ctrlr_opts *opts; 3525 const struct spdk_nvme_ctrlr_data *cdata; 3526 struct nvme_path_id *path_id; 3527 3528 spdk_json_write_object_begin(w); 3529 3530 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3531 3532 #ifdef SPDK_CONFIG_NVME_CUSE 3533 size_t cuse_name_size = 128; 3534 char cuse_name[cuse_name_size]; 3535 3536 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3537 if (rc == 0) { 3538 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3539 } 3540 #endif 3541 trid = &nvme_ctrlr->active_path_id->trid; 3542 
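/* Dump the currently active trid first, followed by any alternate trids kept for failover. */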
spdk_json_write_named_object_begin(w, "trid"); 3543 nvme_bdev_dump_trid_json(trid, w); 3544 spdk_json_write_object_end(w); 3545 3546 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3547 if (path_id != NULL) { 3548 spdk_json_write_named_array_begin(w, "alternate_trids"); 3549 do { 3550 trid = &path_id->trid; 3551 spdk_json_write_object_begin(w); 3552 nvme_bdev_dump_trid_json(trid, w); 3553 spdk_json_write_object_end(w); 3554 3555 path_id = TAILQ_NEXT(path_id, link); 3556 } while (path_id != NULL); 3557 spdk_json_write_array_end(w); 3558 } 3559 3560 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3561 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3562 3563 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3564 spdk_json_write_named_object_begin(w, "host"); 3565 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3566 spdk_json_write_named_string(w, "addr", opts->src_addr); 3567 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3568 spdk_json_write_object_end(w); 3569 3570 spdk_json_write_object_end(w); 3571 } 3572 3573 static void 3574 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3575 struct nvme_ns *nvme_ns) 3576 { 3577 struct spdk_nvme_ns *ns; 3578 struct spdk_nvme_ctrlr *ctrlr; 3579 const struct spdk_nvme_ctrlr_data *cdata; 3580 const struct spdk_nvme_transport_id *trid; 3581 union spdk_nvme_vs_register vs; 3582 const struct spdk_nvme_ns_data *nsdata; 3583 char buf[128]; 3584 3585 ns = nvme_ns->ns; 3586 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3587 3588 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3589 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3590 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3591 3592 spdk_json_write_object_begin(w); 3593 3594 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3595 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3596 } 3597 3598 spdk_json_write_named_object_begin(w, "trid"); 3599 3600 nvme_bdev_dump_trid_json(trid, w); 3601 3602 spdk_json_write_object_end(w); 3603 3604 #ifdef SPDK_CONFIG_NVME_CUSE 3605 size_t cuse_name_size = 128; 3606 char cuse_name[cuse_name_size]; 3607 3608 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3609 cuse_name, &cuse_name_size); 3610 if (rc == 0) { 3611 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3612 } 3613 #endif 3614 3615 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3616 3617 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3618 3619 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3620 3621 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3622 spdk_str_trim(buf); 3623 spdk_json_write_named_string(w, "model_number", buf); 3624 3625 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3626 spdk_str_trim(buf); 3627 spdk_json_write_named_string(w, "serial_number", buf); 3628 3629 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3630 spdk_str_trim(buf); 3631 spdk_json_write_named_string(w, "firmware_revision", buf); 3632 3633 if (cdata->subnqn[0] != '\0') { 3634 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3635 } 3636 3637 spdk_json_write_named_object_begin(w, "oacs"); 3638 3639 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3640 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3641 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3642 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3643 3644 spdk_json_write_object_end(w); 3645 3646 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
3647 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3648 3649 spdk_json_write_object_end(w); 3650 3651 spdk_json_write_named_object_begin(w, "vs"); 3652 3653 spdk_json_write_name(w, "nvme_version"); 3654 if (vs.bits.ter) { 3655 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3656 } else { 3657 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3658 } 3659 3660 spdk_json_write_object_end(w); 3661 3662 nsdata = spdk_nvme_ns_get_data(ns); 3663 3664 spdk_json_write_named_object_begin(w, "ns_data"); 3665 3666 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3667 3668 if (cdata->cmic.ana_reporting) { 3669 spdk_json_write_named_string(w, "ana_state", 3670 _nvme_ana_state_str(nvme_ns->ana_state)); 3671 } 3672 3673 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3674 3675 spdk_json_write_object_end(w); 3676 3677 if (cdata->oacs.security) { 3678 spdk_json_write_named_object_begin(w, "security"); 3679 3680 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3681 3682 spdk_json_write_object_end(w); 3683 } 3684 3685 spdk_json_write_object_end(w); 3686 } 3687 3688 static const char * 3689 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3690 { 3691 switch (nbdev->mp_policy) { 3692 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3693 return "active_passive"; 3694 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3695 return "active_active"; 3696 default: 3697 assert(false); 3698 return "invalid"; 3699 } 3700 } 3701 3702 static int 3703 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3704 { 3705 struct nvme_bdev *nvme_bdev = ctx; 3706 struct nvme_ns *nvme_ns; 3707 3708 pthread_mutex_lock(&nvme_bdev->mutex); 3709 spdk_json_write_named_array_begin(w, "nvme"); 3710 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3711 nvme_namespace_info_json(w, nvme_ns); 3712 } 3713 spdk_json_write_array_end(w); 3714 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3715 pthread_mutex_unlock(&nvme_bdev->mutex); 3716 3717 return 0; 3718 } 3719 3720 static void 3721 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3722 { 3723 /* No config per bdev needed */ 3724 } 3725 3726 static uint64_t 3727 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3728 { 3729 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3730 struct nvme_io_path *io_path; 3731 struct nvme_poll_group *group; 3732 uint64_t spin_time = 0; 3733 3734 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3735 group = io_path->qpair->group; 3736 3737 if (!group || !group->collect_spin_stat) { 3738 continue; 3739 } 3740 3741 if (group->end_ticks != 0) { 3742 group->spin_ticks += (group->end_ticks - group->start_ticks); 3743 group->end_ticks = 0; 3744 } 3745 3746 spin_time += group->spin_ticks; 3747 group->start_ticks = 0; 3748 group->spin_ticks = 0; 3749 } 3750 3751 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3752 } 3753 3754 static void 3755 bdev_nvme_reset_device_stat(void *ctx) 3756 { 3757 struct nvme_bdev *nbdev = ctx; 3758 3759 if (nbdev->err_stat != NULL) { 3760 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3761 } 3762 } 3763 3764 /* JSON string should be lowercases and underscore delimited string. 
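 * e.g. a status string such as "INVALID FIELD" would be emitted as the JSON key "invalid_field".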
*/ 3765 static void 3766 bdev_nvme_format_nvme_status(char *dst, const char *src) 3767 { 3768 char tmp[256]; 3769 3770 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3771 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3772 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3773 spdk_strlwr(dst); 3774 } 3775 3776 static void 3777 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3778 { 3779 struct nvme_bdev *nbdev = ctx; 3780 struct spdk_nvme_status status = {}; 3781 uint16_t sct, sc; 3782 char status_json[256]; 3783 const char *status_str; 3784 3785 if (nbdev->err_stat == NULL) { 3786 return; 3787 } 3788 3789 spdk_json_write_named_object_begin(w, "nvme_error"); 3790 3791 spdk_json_write_named_object_begin(w, "status_type"); 3792 for (sct = 0; sct < 8; sct++) { 3793 if (nbdev->err_stat->status_type[sct] == 0) { 3794 continue; 3795 } 3796 status.sct = sct; 3797 3798 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3799 assert(status_str != NULL); 3800 bdev_nvme_format_nvme_status(status_json, status_str); 3801 3802 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3803 } 3804 spdk_json_write_object_end(w); 3805 3806 spdk_json_write_named_object_begin(w, "status_code"); 3807 for (sct = 0; sct < 4; sct++) { 3808 status.sct = sct; 3809 for (sc = 0; sc < 256; sc++) { 3810 if (nbdev->err_stat->status[sct][sc] == 0) { 3811 continue; 3812 } 3813 status.sc = sc; 3814 3815 status_str = spdk_nvme_cpl_get_status_string(&status); 3816 assert(status_str != NULL); 3817 bdev_nvme_format_nvme_status(status_json, status_str); 3818 3819 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3820 } 3821 } 3822 spdk_json_write_object_end(w); 3823 3824 spdk_json_write_object_end(w); 3825 } 3826 3827 static bool 3828 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3829 { 3830 struct nvme_bdev *nbdev = ctx; 3831 struct spdk_nvme_ctrlr *ctrlr; 3832 3833 if (!g_opts.allow_accel_sequence) { 3834 return false; 3835 } 3836 3837 switch (type) { 3838 case SPDK_BDEV_IO_TYPE_WRITE: 3839 case SPDK_BDEV_IO_TYPE_READ: 3840 break; 3841 default: 3842 return false; 3843 } 3844 3845 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3846 assert(ctrlr != NULL); 3847 3848 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3849 } 3850 3851 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3852 .destruct = bdev_nvme_destruct, 3853 .submit_request = bdev_nvme_submit_request, 3854 .io_type_supported = bdev_nvme_io_type_supported, 3855 .get_io_channel = bdev_nvme_get_io_channel, 3856 .dump_info_json = bdev_nvme_dump_info_json, 3857 .write_config_json = bdev_nvme_write_config_json, 3858 .get_spin_time = bdev_nvme_get_spin_time, 3859 .get_module_ctx = bdev_nvme_get_module_ctx, 3860 .get_memory_domains = bdev_nvme_get_memory_domains, 3861 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3862 .reset_device_stat = bdev_nvme_reset_device_stat, 3863 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3864 }; 3865 3866 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3867 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3868 3869 static int 3870 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3871 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3872 { 3873 struct spdk_nvme_ana_group_descriptor *copied_desc; 3874 uint8_t *orig_desc; 3875 uint32_t i, desc_size, copy_len; 3876 int rc = 0; 3877 3878 if (nvme_ctrlr->ana_log_page == NULL) { 3879 return 
-EINVAL; 3880 } 3881 3882 copied_desc = nvme_ctrlr->copied_ana_desc; 3883 3884 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3885 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3886 3887 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3888 memcpy(copied_desc, orig_desc, copy_len); 3889 3890 rc = cb_fn(copied_desc, cb_arg); 3891 if (rc != 0) { 3892 break; 3893 } 3894 3895 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3896 copied_desc->num_of_nsid * sizeof(uint32_t); 3897 orig_desc += desc_size; 3898 copy_len -= desc_size; 3899 } 3900 3901 return rc; 3902 } 3903 3904 static int 3905 nvme_ns_ana_transition_timedout(void *ctx) 3906 { 3907 struct nvme_ns *nvme_ns = ctx; 3908 3909 spdk_poller_unregister(&nvme_ns->anatt_timer); 3910 nvme_ns->ana_transition_timedout = true; 3911 3912 return SPDK_POLLER_BUSY; 3913 } 3914 3915 static void 3916 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3917 const struct spdk_nvme_ana_group_descriptor *desc) 3918 { 3919 const struct spdk_nvme_ctrlr_data *cdata; 3920 3921 nvme_ns->ana_group_id = desc->ana_group_id; 3922 nvme_ns->ana_state = desc->ana_state; 3923 nvme_ns->ana_state_updating = false; 3924 3925 switch (nvme_ns->ana_state) { 3926 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3927 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3928 nvme_ns->ana_transition_timedout = false; 3929 spdk_poller_unregister(&nvme_ns->anatt_timer); 3930 break; 3931 3932 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3933 case SPDK_NVME_ANA_CHANGE_STATE: 3934 if (nvme_ns->anatt_timer != NULL) { 3935 break; 3936 } 3937 3938 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3939 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3940 nvme_ns, 3941 cdata->anatt * SPDK_SEC_TO_USEC); 3942 break; 3943 default: 3944 break; 3945 } 3946 } 3947 3948 static int 3949 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3950 { 3951 struct nvme_ns *nvme_ns = cb_arg; 3952 uint32_t i; 3953 3954 for (i = 0; i < desc->num_of_nsid; i++) { 3955 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3956 continue; 3957 } 3958 3959 _nvme_ns_set_ana_state(nvme_ns, desc); 3960 return 1; 3961 } 3962 3963 return 0; 3964 } 3965 3966 static struct spdk_uuid 3967 nvme_generate_uuid(const char *sn, uint32_t nsid) 3968 { 3969 struct spdk_uuid new_uuid, namespace_uuid; 3970 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3971 /* This namespace UUID was generated using uuid_generate() method. 
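 * The constant below is arbitrary; it only has to stay fixed so that
 * nvme_generate_uuid() is deterministic. The resulting UUID is a SHA-1
 * (name-based) hash of "<serial number><nsid>" in that namespace, so an
 * illustrative call such as nvme_generate_uuid("S3W8NX0MADEUP", 1) (the
 * serial number here is made up) always yields the same UUID for the same
 * controller/namespace pair. This keeps bdev UUIDs stable across restarts
 * when the generate_uuids option is enabled.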
*/ 3972 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3973 int size; 3974 3975 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3976 3977 spdk_uuid_set_null(&new_uuid); 3978 spdk_uuid_set_null(&namespace_uuid); 3979 3980 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3981 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3982 3983 spdk_uuid_parse(&namespace_uuid, namespace_str); 3984 3985 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3986 3987 return new_uuid; 3988 } 3989 3990 static int 3991 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3992 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3993 uint32_t prchk_flags, void *ctx) 3994 { 3995 const struct spdk_uuid *uuid; 3996 const uint8_t *nguid; 3997 const struct spdk_nvme_ctrlr_data *cdata; 3998 const struct spdk_nvme_ns_data *nsdata; 3999 const struct spdk_nvme_ctrlr_opts *opts; 4000 enum spdk_nvme_csi csi; 4001 uint32_t atomic_bs, phys_bs, bs; 4002 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4003 4004 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4005 csi = spdk_nvme_ns_get_csi(ns); 4006 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4007 4008 switch (csi) { 4009 case SPDK_NVME_CSI_NVM: 4010 disk->product_name = "NVMe disk"; 4011 break; 4012 case SPDK_NVME_CSI_ZNS: 4013 disk->product_name = "NVMe ZNS disk"; 4014 disk->zoned = true; 4015 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4016 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4017 spdk_nvme_ns_get_extended_sector_size(ns); 4018 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4019 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4020 break; 4021 default: 4022 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4023 return -ENOTSUP; 4024 } 4025 4026 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4027 if (!disk->name) { 4028 return -ENOMEM; 4029 } 4030 4031 disk->write_cache = 0; 4032 if (cdata->vwc.present) { 4033 /* Enable if the Volatile Write Cache exists */ 4034 disk->write_cache = 1; 4035 } 4036 if (cdata->oncs.write_zeroes) { 4037 disk->max_write_zeroes = UINT16_MAX + 1; 4038 } 4039 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4040 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4041 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4042 /* NVMe driver will split one request into multiple requests 4043 * based on MDTS and stripe boundary, the bdev layer will use 4044 * max_segment_size and max_num_segments to split one big IO 4045 * into multiple requests, then small request can't run out 4046 * of NVMe internal requests data structure. 4047 */ 4048 if (opts && opts->io_queue_requests) { 4049 disk->max_num_segments = opts->io_queue_requests / 2; 4050 } 4051 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4052 /* The nvme driver will try to split I/O that have too many 4053 * SGEs, but it doesn't work if that last SGE doesn't end on 4054 * an aggregate total that is block aligned. The bdev layer has 4055 * a more robust splitting framework, so use that instead for 4056 * this case. (See issue #3269.) 
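 * As an illustration with made-up numbers: io_queue_requests = 512 caps
 * max_num_segments at 256 above, and if the controller then reports
 * max_sges = 128, the spdk_min() below lowers the cap to 128 so a single
 * bdev I/O never needs more SGEs than one NVMe command can carry.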
4057 */ 4058 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4059 4060 if (disk->max_num_segments == 0) { 4061 disk->max_num_segments = max_sges; 4062 } else { 4063 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4064 } 4065 } 4066 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4067 4068 nguid = spdk_nvme_ns_get_nguid(ns); 4069 if (!nguid) { 4070 uuid = spdk_nvme_ns_get_uuid(ns); 4071 if (uuid) { 4072 disk->uuid = *uuid; 4073 } else if (g_opts.generate_uuids) { 4074 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4075 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4076 } 4077 } else { 4078 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4079 } 4080 4081 nsdata = spdk_nvme_ns_get_data(ns); 4082 bs = spdk_nvme_ns_get_sector_size(ns); 4083 atomic_bs = bs; 4084 phys_bs = bs; 4085 if (nsdata->nabo == 0) { 4086 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4087 atomic_bs = bs * (1 + nsdata->nawupf); 4088 } else { 4089 atomic_bs = bs * (1 + cdata->awupf); 4090 } 4091 } 4092 if (nsdata->nsfeat.optperf) { 4093 phys_bs = bs * (1 + nsdata->npwg); 4094 } 4095 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4096 4097 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4098 if (disk->md_len != 0) { 4099 disk->md_interleave = nsdata->flbas.extended; 4100 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4101 if (disk->dif_type != SPDK_DIF_DISABLE) { 4102 disk->dif_is_head_of_md = nsdata->dps.md_start; 4103 disk->dif_check_flags = prchk_flags; 4104 } 4105 } 4106 4107 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4108 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4109 disk->acwu = 0; 4110 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4111 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4112 } else { 4113 disk->acwu = cdata->acwu + 1; /* 0-based */ 4114 } 4115 4116 if (cdata->oncs.copy) { 4117 /* For now bdev interface allows only single segment copy */ 4118 disk->max_copy = nsdata->mssrl; 4119 } 4120 4121 disk->ctxt = ctx; 4122 disk->fn_table = &nvmelib_fn_table; 4123 disk->module = &nvme_if; 4124 4125 return 0; 4126 } 4127 4128 static struct nvme_bdev * 4129 nvme_bdev_alloc(void) 4130 { 4131 struct nvme_bdev *bdev; 4132 int rc; 4133 4134 bdev = calloc(1, sizeof(*bdev)); 4135 if (!bdev) { 4136 SPDK_ERRLOG("bdev calloc() failed\n"); 4137 return NULL; 4138 } 4139 4140 if (g_opts.nvme_error_stat) { 4141 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4142 if (!bdev->err_stat) { 4143 SPDK_ERRLOG("err_stat calloc() failed\n"); 4144 free(bdev); 4145 return NULL; 4146 } 4147 } 4148 4149 rc = pthread_mutex_init(&bdev->mutex, NULL); 4150 if (rc != 0) { 4151 free(bdev->err_stat); 4152 free(bdev); 4153 return NULL; 4154 } 4155 4156 bdev->ref = 1; 4157 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4158 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4159 bdev->rr_min_io = UINT32_MAX; 4160 TAILQ_INIT(&bdev->nvme_ns_list); 4161 4162 return bdev; 4163 } 4164 4165 static int 4166 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4167 { 4168 struct nvme_bdev *bdev; 4169 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4170 int rc; 4171 4172 bdev = nvme_bdev_alloc(); 4173 if (bdev == NULL) { 4174 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4175 return -ENOMEM; 4176 } 4177 4178 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4179 4180 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4181 nvme_ns->ns, 
nvme_ctrlr->opts.prchk_flags, bdev); 4182 if (rc != 0) { 4183 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4184 nvme_bdev_free(bdev); 4185 return rc; 4186 } 4187 4188 spdk_io_device_register(bdev, 4189 bdev_nvme_create_bdev_channel_cb, 4190 bdev_nvme_destroy_bdev_channel_cb, 4191 sizeof(struct nvme_bdev_channel), 4192 bdev->disk.name); 4193 4194 nvme_ns->bdev = bdev; 4195 bdev->nsid = nvme_ns->id; 4196 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4197 4198 bdev->nbdev_ctrlr = nbdev_ctrlr; 4199 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4200 4201 rc = spdk_bdev_register(&bdev->disk); 4202 if (rc != 0) { 4203 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4204 spdk_io_device_unregister(bdev, NULL); 4205 nvme_ns->bdev = NULL; 4206 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4207 nvme_bdev_free(bdev); 4208 return rc; 4209 } 4210 4211 return 0; 4212 } 4213 4214 static bool 4215 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4216 { 4217 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4218 const struct spdk_uuid *uuid1, *uuid2; 4219 4220 nsdata1 = spdk_nvme_ns_get_data(ns1); 4221 nsdata2 = spdk_nvme_ns_get_data(ns2); 4222 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4223 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4224 4225 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4226 nsdata1->eui64 == nsdata2->eui64 && 4227 ((uuid1 == NULL && uuid2 == NULL) || 4228 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4229 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4230 } 4231 4232 static bool 4233 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4234 struct spdk_nvme_ctrlr_opts *opts) 4235 { 4236 struct nvme_probe_skip_entry *entry; 4237 4238 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4239 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4240 return false; 4241 } 4242 } 4243 4244 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4245 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4246 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4247 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4248 opts->disable_read_ana_log_page = true; 4249 4250 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4251 4252 return true; 4253 } 4254 4255 static void 4256 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4257 { 4258 struct nvme_ctrlr *nvme_ctrlr = ctx; 4259 4260 if (spdk_nvme_cpl_is_error(cpl)) { 4261 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4262 cpl->status.sct); 4263 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4264 } else if (cpl->cdw0 & 0x1) { 4265 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4266 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4267 } 4268 } 4269 4270 static void 4271 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4272 struct spdk_nvme_qpair *qpair, uint16_t cid) 4273 { 4274 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4275 union spdk_nvme_csts_register csts; 4276 int rc; 4277 4278 assert(nvme_ctrlr->ctrlr == ctrlr); 4279 4280 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4281 4282 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4283 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4284 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4285 * completion recursively. 
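 * For example, if an admin command times out on a TCP or RDMA controller
 * (qpair == NULL), reading CSTS would itself require a Fabrics Property Get
 * on that same stuck admin queue, so the CFS check is skipped and we fall
 * through to the configured timeout action below.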
4286 */ 4287 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4288 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4289 if (csts.bits.cfs) { 4290 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4291 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4292 return; 4293 } 4294 } 4295 4296 switch (g_opts.action_on_timeout) { 4297 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4298 if (qpair) { 4299 /* Don't send abort to ctrlr when ctrlr is not available. */ 4300 pthread_mutex_lock(&nvme_ctrlr->mutex); 4301 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4302 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4303 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4304 return; 4305 } 4306 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4307 4308 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4309 nvme_abort_cpl, nvme_ctrlr); 4310 if (rc == 0) { 4311 return; 4312 } 4313 4314 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4315 } 4316 4317 /* FALLTHROUGH */ 4318 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4319 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4320 break; 4321 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4322 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4323 break; 4324 default: 4325 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4326 break; 4327 } 4328 } 4329 4330 static struct nvme_ns * 4331 nvme_ns_alloc(void) 4332 { 4333 struct nvme_ns *nvme_ns; 4334 4335 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4336 if (nvme_ns == NULL) { 4337 return NULL; 4338 } 4339 4340 if (g_opts.io_path_stat) { 4341 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4342 if (nvme_ns->stat == NULL) { 4343 free(nvme_ns); 4344 return NULL; 4345 } 4346 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4347 } 4348 4349 return nvme_ns; 4350 } 4351 4352 static void 4353 nvme_ns_free(struct nvme_ns *nvme_ns) 4354 { 4355 free(nvme_ns->stat); 4356 free(nvme_ns); 4357 } 4358 4359 static void 4360 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4361 { 4362 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4363 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4364 4365 if (rc == 0) { 4366 nvme_ns->probe_ctx = NULL; 4367 pthread_mutex_lock(&nvme_ctrlr->mutex); 4368 nvme_ctrlr->ref++; 4369 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4370 } else { 4371 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4372 nvme_ns_free(nvme_ns); 4373 } 4374 4375 if (ctx) { 4376 ctx->populates_in_progress--; 4377 if (ctx->populates_in_progress == 0) { 4378 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4379 } 4380 } 4381 } 4382 4383 static void 4384 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4385 { 4386 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4387 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4388 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4389 int rc; 4390 4391 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4392 if (rc != 0) { 4393 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4394 } 4395 4396 spdk_for_each_channel_continue(i, rc); 4397 } 4398 4399 static void 4400 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4401 { 4402 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4403 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4404 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4405 struct nvme_io_path *io_path; 4406 4407 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4408 if (io_path != NULL) { 4409 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4410 } 4411 4412 spdk_for_each_channel_continue(i, 0); 4413 } 4414 4415 static void 4416 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4417 { 4418 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4419 4420 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4421 } 4422 4423 static void 4424 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4425 { 4426 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4427 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4428 4429 if (status == 0) { 4430 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4431 } else { 4432 /* Delete the added io_paths and fail populating the namespace. */ 4433 spdk_for_each_channel(bdev, 4434 bdev_nvme_delete_io_path, 4435 nvme_ns, 4436 bdev_nvme_add_io_path_failed); 4437 } 4438 } 4439 4440 static int 4441 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4442 { 4443 struct nvme_ns *tmp_ns; 4444 const struct spdk_nvme_ns_data *nsdata; 4445 4446 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4447 if (!nsdata->nmic.can_share) { 4448 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4449 return -EINVAL; 4450 } 4451 4452 pthread_mutex_lock(&bdev->mutex); 4453 4454 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4455 assert(tmp_ns != NULL); 4456 4457 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4458 pthread_mutex_unlock(&bdev->mutex); 4459 SPDK_ERRLOG("Namespaces are not identical.\n"); 4460 return -EINVAL; 4461 } 4462 4463 bdev->ref++; 4464 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4465 nvme_ns->bdev = bdev; 4466 4467 pthread_mutex_unlock(&bdev->mutex); 4468 4469 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
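 * bdev_nvme_add_io_path() runs once on each thread that owns an I/O channel
 * for this bdev, and bdev_nvme_add_io_path_done() runs afterwards on the
 * thread that started the iteration. If any channel fails to add the path,
 * the done callback walks the channels again with bdev_nvme_delete_io_path()
 * to roll back before reporting the failure.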
*/ 4470 spdk_for_each_channel(bdev, 4471 bdev_nvme_add_io_path, 4472 nvme_ns, 4473 bdev_nvme_add_io_path_done); 4474 4475 return 0; 4476 } 4477 4478 static void 4479 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4480 { 4481 struct spdk_nvme_ns *ns; 4482 struct nvme_bdev *bdev; 4483 int rc = 0; 4484 4485 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4486 if (!ns) { 4487 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4488 rc = -EINVAL; 4489 goto done; 4490 } 4491 4492 nvme_ns->ns = ns; 4493 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4494 4495 if (nvme_ctrlr->ana_log_page != NULL) { 4496 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4497 } 4498 4499 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4500 if (bdev == NULL) { 4501 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4502 } else { 4503 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4504 if (rc == 0) { 4505 return; 4506 } 4507 } 4508 done: 4509 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4510 } 4511 4512 static void 4513 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4514 { 4515 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4516 4517 assert(nvme_ctrlr != NULL); 4518 4519 pthread_mutex_lock(&nvme_ctrlr->mutex); 4520 4521 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4522 4523 if (nvme_ns->bdev != NULL) { 4524 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4525 return; 4526 } 4527 4528 nvme_ns_free(nvme_ns); 4529 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4530 4531 nvme_ctrlr_release(nvme_ctrlr); 4532 } 4533 4534 static void 4535 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4536 { 4537 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4538 4539 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4540 } 4541 4542 static void 4543 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4544 { 4545 struct nvme_bdev *bdev; 4546 4547 spdk_poller_unregister(&nvme_ns->anatt_timer); 4548 4549 bdev = nvme_ns->bdev; 4550 if (bdev != NULL) { 4551 pthread_mutex_lock(&bdev->mutex); 4552 4553 assert(bdev->ref > 0); 4554 bdev->ref--; 4555 if (bdev->ref == 0) { 4556 pthread_mutex_unlock(&bdev->mutex); 4557 4558 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4559 } else { 4560 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4561 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4562 * and clear nvme_ns->bdev here. 4563 */ 4564 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4565 nvme_ns->bdev = NULL; 4566 4567 pthread_mutex_unlock(&bdev->mutex); 4568 4569 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4570 * we call depopulate_namespace_done() to avoid use-after-free. 4571 */ 4572 spdk_for_each_channel(bdev, 4573 bdev_nvme_delete_io_path, 4574 nvme_ns, 4575 bdev_nvme_delete_io_path_done); 4576 return; 4577 } 4578 } 4579 4580 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4581 } 4582 4583 static void 4584 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4585 struct nvme_async_probe_ctx *ctx) 4586 { 4587 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4588 struct nvme_ns *nvme_ns, *next; 4589 struct spdk_nvme_ns *ns; 4590 struct nvme_bdev *bdev; 4591 uint32_t nsid; 4592 int rc; 4593 uint64_t num_sectors; 4594 4595 if (ctx) { 4596 /* Initialize this count to 1 to handle the populate functions 4597 * calling nvme_ctrlr_populate_namespace_done() immediately. 
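 * For example, with two new namespaces whose populate callbacks complete
 * synchronously the count goes 1 -> 2 -> 1 -> 2 -> 1 and only reaches 0 at
 * the final decrement after the loop, so nvme_ctrlr_populate_namespaces_done()
 * cannot run while the namespace list is still being walked.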
4598 */ 4599 ctx->populates_in_progress = 1; 4600 } 4601 4602 /* First loop over our existing namespaces and see if they have been 4603 * removed. */ 4604 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4605 while (nvme_ns != NULL) { 4606 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4607 4608 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4609 /* NS is still there but attributes may have changed */ 4610 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4611 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4612 bdev = nvme_ns->bdev; 4613 assert(bdev != NULL); 4614 if (bdev->disk.blockcnt != num_sectors) { 4615 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4616 nvme_ns->id, 4617 bdev->disk.name, 4618 bdev->disk.blockcnt, 4619 num_sectors); 4620 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4621 if (rc != 0) { 4622 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4623 bdev->disk.name, rc); 4624 } 4625 } 4626 } else { 4627 /* Namespace was removed */ 4628 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4629 } 4630 4631 nvme_ns = next; 4632 } 4633 4634 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4635 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4636 while (nsid != 0) { 4637 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4638 4639 if (nvme_ns == NULL) { 4640 /* Found a new one */ 4641 nvme_ns = nvme_ns_alloc(); 4642 if (nvme_ns == NULL) { 4643 SPDK_ERRLOG("Failed to allocate namespace\n"); 4644 /* This just fails to attach the namespace. It may work on a future attempt. */ 4645 continue; 4646 } 4647 4648 nvme_ns->id = nsid; 4649 nvme_ns->ctrlr = nvme_ctrlr; 4650 4651 nvme_ns->bdev = NULL; 4652 4653 if (ctx) { 4654 ctx->populates_in_progress++; 4655 } 4656 nvme_ns->probe_ctx = ctx; 4657 4658 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4659 4660 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4661 } 4662 4663 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4664 } 4665 4666 if (ctx) { 4667 /* Decrement this count now that the loop is over to account 4668 * for the one we started with. If the count is then 0, we 4669 * know any populate_namespace functions completed immediately, 4670 * so we'll kick the callback here. 
4671 */ 4672 ctx->populates_in_progress--; 4673 if (ctx->populates_in_progress == 0) { 4674 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4675 } 4676 } 4677 4678 } 4679 4680 static void 4681 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4682 { 4683 struct nvme_ns *nvme_ns, *tmp; 4684 4685 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4686 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4687 } 4688 } 4689 4690 static uint32_t 4691 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4692 { 4693 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4694 const struct spdk_nvme_ctrlr_data *cdata; 4695 uint32_t nsid, ns_count = 0; 4696 4697 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4698 4699 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4700 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4701 ns_count++; 4702 } 4703 4704 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4705 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4706 sizeof(uint32_t); 4707 } 4708 4709 static int 4710 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4711 void *cb_arg) 4712 { 4713 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4714 struct nvme_ns *nvme_ns; 4715 uint32_t i, nsid; 4716 4717 for (i = 0; i < desc->num_of_nsid; i++) { 4718 nsid = desc->nsid[i]; 4719 if (nsid == 0) { 4720 continue; 4721 } 4722 4723 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4724 4725 assert(nvme_ns != NULL); 4726 if (nvme_ns == NULL) { 4727 /* Target told us that an inactive namespace had an ANA change */ 4728 continue; 4729 } 4730 4731 _nvme_ns_set_ana_state(nvme_ns, desc); 4732 } 4733 4734 return 0; 4735 } 4736 4737 static void 4738 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4739 { 4740 struct nvme_ns *nvme_ns; 4741 4742 spdk_free(nvme_ctrlr->ana_log_page); 4743 nvme_ctrlr->ana_log_page = NULL; 4744 4745 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4746 nvme_ns != NULL; 4747 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4748 nvme_ns->ana_state_updating = false; 4749 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4750 } 4751 } 4752 4753 static void 4754 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4755 { 4756 struct nvme_ctrlr *nvme_ctrlr = ctx; 4757 4758 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4759 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4760 nvme_ctrlr); 4761 } else { 4762 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4763 } 4764 4765 pthread_mutex_lock(&nvme_ctrlr->mutex); 4766 4767 assert(nvme_ctrlr->ana_log_page_updating == true); 4768 nvme_ctrlr->ana_log_page_updating = false; 4769 4770 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4771 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4772 4773 nvme_ctrlr_unregister(nvme_ctrlr); 4774 } else { 4775 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4776 4777 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4778 } 4779 } 4780 4781 static int 4782 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4783 { 4784 uint32_t ana_log_page_size; 4785 int rc; 4786 4787 if (nvme_ctrlr->ana_log_page == NULL) { 4788 return -EINVAL; 4789 } 4790 4791 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4792 4793 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4794 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4795 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4796 
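/* max_ana_log_page_size was sized at controller initialization from
 * cdata->mnan (see nvme_ctrlr_init_ana_log_page()), so this should only
 * happen if the target now reports more active namespaces than it
 * originally advertised. Fail this read rather than reallocating the DMA
 * buffer here.
 */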
return -EINVAL; 4797 } 4798 4799 pthread_mutex_lock(&nvme_ctrlr->mutex); 4800 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4801 nvme_ctrlr->ana_log_page_updating) { 4802 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4803 return -EBUSY; 4804 } 4805 4806 nvme_ctrlr->ana_log_page_updating = true; 4807 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4808 4809 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4810 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4811 SPDK_NVME_GLOBAL_NS_TAG, 4812 nvme_ctrlr->ana_log_page, 4813 ana_log_page_size, 0, 4814 nvme_ctrlr_read_ana_log_page_done, 4815 nvme_ctrlr); 4816 if (rc != 0) { 4817 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4818 } 4819 4820 return rc; 4821 } 4822 4823 static void 4824 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4825 { 4826 } 4827 4828 struct bdev_nvme_set_preferred_path_ctx { 4829 struct spdk_bdev_desc *desc; 4830 struct nvme_ns *nvme_ns; 4831 bdev_nvme_set_preferred_path_cb cb_fn; 4832 void *cb_arg; 4833 }; 4834 4835 static void 4836 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4837 { 4838 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4839 4840 assert(ctx != NULL); 4841 assert(ctx->desc != NULL); 4842 assert(ctx->cb_fn != NULL); 4843 4844 spdk_bdev_close(ctx->desc); 4845 4846 ctx->cb_fn(ctx->cb_arg, status); 4847 4848 free(ctx); 4849 } 4850 4851 static void 4852 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4853 { 4854 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4855 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4856 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4857 struct nvme_io_path *io_path, *prev; 4858 4859 prev = NULL; 4860 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4861 if (io_path->nvme_ns == ctx->nvme_ns) { 4862 break; 4863 } 4864 prev = io_path; 4865 } 4866 4867 if (io_path != NULL) { 4868 if (prev != NULL) { 4869 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4870 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4871 } 4872 4873 /* We can set io_path to nbdev_ch->current_io_path directly here. 4874 * However, it needs to be conditional. To simplify the code, 4875 * just clear nbdev_ch->current_io_path and let find_io_path() 4876 * fill it. 4877 * 4878 * Automatic failback may be disabled. Hence even if the io_path is 4879 * already at the head, clear nbdev_ch->current_io_path. 4880 */ 4881 bdev_nvme_clear_current_io_path(nbdev_ch); 4882 } 4883 4884 spdk_for_each_channel_continue(i, 0); 4885 } 4886 4887 static struct nvme_ns * 4888 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4889 { 4890 struct nvme_ns *nvme_ns, *prev; 4891 const struct spdk_nvme_ctrlr_data *cdata; 4892 4893 prev = NULL; 4894 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4895 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4896 4897 if (cdata->cntlid == cntlid) { 4898 break; 4899 } 4900 prev = nvme_ns; 4901 } 4902 4903 if (nvme_ns != NULL && prev != NULL) { 4904 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4905 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4906 } 4907 4908 return nvme_ns; 4909 } 4910 4911 /* This function supports only multipath mode. There is only a single I/O path 4912 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4913 * head of the I/O path list for each NVMe bdev channel. 
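 * For example (the bdev name is illustrative),
 * bdev_nvme_set_preferred_path("Nvme0n1", 1, cb, cb_arg) moves the I/O path
 * whose controller reports cntlid 1 to the head of every channel's list, so
 * the active_passive policy keeps using that path while it stays available.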
4914 * 4915 * NVMe bdev channel may be acquired after completing this function. move the 4916 * matched namespace to the head of the namespace list for the NVMe bdev too. 4917 */ 4918 void 4919 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4920 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4921 { 4922 struct bdev_nvme_set_preferred_path_ctx *ctx; 4923 struct spdk_bdev *bdev; 4924 struct nvme_bdev *nbdev; 4925 int rc = 0; 4926 4927 assert(cb_fn != NULL); 4928 4929 ctx = calloc(1, sizeof(*ctx)); 4930 if (ctx == NULL) { 4931 SPDK_ERRLOG("Failed to alloc context.\n"); 4932 rc = -ENOMEM; 4933 goto err_alloc; 4934 } 4935 4936 ctx->cb_fn = cb_fn; 4937 ctx->cb_arg = cb_arg; 4938 4939 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4940 if (rc != 0) { 4941 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4942 goto err_open; 4943 } 4944 4945 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4946 4947 if (bdev->module != &nvme_if) { 4948 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4949 rc = -ENODEV; 4950 goto err_bdev; 4951 } 4952 4953 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4954 4955 pthread_mutex_lock(&nbdev->mutex); 4956 4957 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4958 if (ctx->nvme_ns == NULL) { 4959 pthread_mutex_unlock(&nbdev->mutex); 4960 4961 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4962 rc = -ENODEV; 4963 goto err_bdev; 4964 } 4965 4966 pthread_mutex_unlock(&nbdev->mutex); 4967 4968 spdk_for_each_channel(nbdev, 4969 _bdev_nvme_set_preferred_path, 4970 ctx, 4971 bdev_nvme_set_preferred_path_done); 4972 return; 4973 4974 err_bdev: 4975 spdk_bdev_close(ctx->desc); 4976 err_open: 4977 free(ctx); 4978 err_alloc: 4979 cb_fn(cb_arg, rc); 4980 } 4981 4982 struct bdev_nvme_set_multipath_policy_ctx { 4983 struct spdk_bdev_desc *desc; 4984 bdev_nvme_set_multipath_policy_cb cb_fn; 4985 void *cb_arg; 4986 }; 4987 4988 static void 4989 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4990 { 4991 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4992 4993 assert(ctx != NULL); 4994 assert(ctx->desc != NULL); 4995 assert(ctx->cb_fn != NULL); 4996 4997 spdk_bdev_close(ctx->desc); 4998 4999 ctx->cb_fn(ctx->cb_arg, status); 5000 5001 free(ctx); 5002 } 5003 5004 static void 5005 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5006 { 5007 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5008 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5009 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5010 5011 nbdev_ch->mp_policy = nbdev->mp_policy; 5012 nbdev_ch->mp_selector = nbdev->mp_selector; 5013 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5014 bdev_nvme_clear_current_io_path(nbdev_ch); 5015 5016 spdk_for_each_channel_continue(i, 0); 5017 } 5018 5019 void 5020 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5021 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5022 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5023 { 5024 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5025 struct spdk_bdev *bdev; 5026 struct nvme_bdev *nbdev; 5027 int rc; 5028 5029 assert(cb_fn != NULL); 5030 5031 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5032 if (rr_min_io == UINT32_MAX) { 5033 rr_min_io = 1; 5034 } else if (rr_min_io == 0) { 5035 rc = -EINVAL; 
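/* rr_min_io is the number of I/Os issued to the current path before
 * round-robin moves to the next one, so 0 is meaningless; UINT32_MAX is
 * the "not specified" sentinel (see nvme_bdev_alloc()) and is translated
 * to the default of 1 above.
 */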
5036 goto exit; 5037 } 5038 } else if (rr_min_io != UINT32_MAX) { 5039 rc = -EINVAL; 5040 goto exit; 5041 } 5042 5043 ctx = calloc(1, sizeof(*ctx)); 5044 if (ctx == NULL) { 5045 SPDK_ERRLOG("Failed to alloc context.\n"); 5046 rc = -ENOMEM; 5047 goto exit; 5048 } 5049 5050 ctx->cb_fn = cb_fn; 5051 ctx->cb_arg = cb_arg; 5052 5053 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5054 if (rc != 0) { 5055 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5056 rc = -ENODEV; 5057 goto err_open; 5058 } 5059 5060 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5061 if (bdev->module != &nvme_if) { 5062 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5063 rc = -ENODEV; 5064 goto err_module; 5065 } 5066 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5067 5068 pthread_mutex_lock(&nbdev->mutex); 5069 nbdev->mp_policy = policy; 5070 nbdev->mp_selector = selector; 5071 nbdev->rr_min_io = rr_min_io; 5072 pthread_mutex_unlock(&nbdev->mutex); 5073 5074 spdk_for_each_channel(nbdev, 5075 _bdev_nvme_set_multipath_policy, 5076 ctx, 5077 bdev_nvme_set_multipath_policy_done); 5078 return; 5079 5080 err_module: 5081 spdk_bdev_close(ctx->desc); 5082 err_open: 5083 free(ctx); 5084 exit: 5085 cb_fn(cb_arg, rc); 5086 } 5087 5088 static void 5089 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5090 { 5091 struct nvme_ctrlr *nvme_ctrlr = arg; 5092 union spdk_nvme_async_event_completion event; 5093 5094 if (spdk_nvme_cpl_is_error(cpl)) { 5095 SPDK_WARNLOG("AER request execute failed\n"); 5096 return; 5097 } 5098 5099 event.raw = cpl->cdw0; 5100 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5101 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5102 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5103 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5104 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5105 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5106 } 5107 } 5108 5109 static void 5110 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5111 { 5112 if (ctx->cb_fn) { 5113 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5114 } 5115 5116 ctx->namespaces_populated = true; 5117 if (ctx->probe_done) { 5118 /* The probe was already completed, so we need to free the context 5119 * here. This can happen for cases like OCSSD, where we need to 5120 * send additional commands to the SSD after attach. 
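 * The two flags form a simple handshake: this function sets
 * namespaces_populated, bdev_nvme_async_poll() sets probe_done, and
 * whichever of the two runs second performs the single free() of the
 * context.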
5121 */ 5122 free(ctx); 5123 } 5124 } 5125 5126 static void 5127 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5128 struct nvme_async_probe_ctx *ctx) 5129 { 5130 spdk_io_device_register(nvme_ctrlr, 5131 bdev_nvme_create_ctrlr_channel_cb, 5132 bdev_nvme_destroy_ctrlr_channel_cb, 5133 sizeof(struct nvme_ctrlr_channel), 5134 nvme_ctrlr->nbdev_ctrlr->name); 5135 5136 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5137 } 5138 5139 static void 5140 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5141 { 5142 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5143 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5144 5145 nvme_ctrlr->probe_ctx = NULL; 5146 5147 if (spdk_nvme_cpl_is_error(cpl)) { 5148 nvme_ctrlr_delete(nvme_ctrlr); 5149 5150 if (ctx != NULL) { 5151 ctx->reported_bdevs = 0; 5152 populate_namespaces_cb(ctx, -1); 5153 } 5154 return; 5155 } 5156 5157 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5158 } 5159 5160 static int 5161 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5162 struct nvme_async_probe_ctx *ctx) 5163 { 5164 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5165 const struct spdk_nvme_ctrlr_data *cdata; 5166 uint32_t ana_log_page_size; 5167 5168 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5169 5170 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5171 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5172 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5173 sizeof(uint32_t); 5174 5175 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5176 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5177 if (nvme_ctrlr->ana_log_page == NULL) { 5178 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5179 return -ENXIO; 5180 } 5181 5182 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5183 * Hence copy each descriptor to a temporary area when parsing it. 5184 * 5185 * Allocate a buffer whose size is as large as ANA log page buffer because 5186 * we do not know the size of a descriptor until actually reading it. 5187 */ 5188 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5189 if (nvme_ctrlr->copied_ana_desc == NULL) { 5190 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5191 return -ENOMEM; 5192 } 5193 5194 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5195 5196 nvme_ctrlr->probe_ctx = ctx; 5197 5198 /* Then, set the read size only to include the current active namespaces. */ 5199 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5200 5201 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5202 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5203 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5204 return -EINVAL; 5205 } 5206 5207 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5208 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5209 SPDK_NVME_GLOBAL_NS_TAG, 5210 nvme_ctrlr->ana_log_page, 5211 ana_log_page_size, 0, 5212 nvme_ctrlr_init_ana_log_page_done, 5213 nvme_ctrlr); 5214 } 5215 5216 /* hostnqn and subnqn were already verified before attaching a controller. 5217 * Hence check only the multipath capability and cntlid here. 
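 * Every controller grouped under one nvme_bdev_ctrlr must report
 * CMIC.multi_ctrlr = 1 and a CNTLID that is unique within the group; e.g.
 * two paths to the same subsystem reporting cntlid 1 and cntlid 2 are
 * accepted, while a second attach that reports an already-registered cntlid
 * is rejected.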
*/ 5219 static bool 5220 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5221 { 5222 struct nvme_ctrlr *tmp; 5223 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5224 5225 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5226 5227 if (!cdata->cmic.multi_ctrlr) { 5228 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5229 return false; 5230 } 5231 5232 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5233 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5234 5235 if (!tmp_cdata->cmic.multi_ctrlr) { 5236 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5237 return false; 5238 } 5239 if (cdata->cntlid == tmp_cdata->cntlid) { 5240 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 5241 return false; 5242 } 5243 } 5244 5245 return true; 5246 } 5247 5248 static int 5249 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5250 { 5251 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5252 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5253 int rc = 0; 5254 5255 pthread_mutex_lock(&g_bdev_nvme_mutex); 5256 5257 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5258 if (nbdev_ctrlr != NULL) { 5259 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5260 rc = -EINVAL; 5261 goto exit; 5262 } 5263 } else { 5264 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5265 if (nbdev_ctrlr == NULL) { 5266 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5267 rc = -ENOMEM; 5268 goto exit; 5269 } 5270 nbdev_ctrlr->name = strdup(name); 5271 if (nbdev_ctrlr->name == NULL) { 5272 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5273 free(nbdev_ctrlr); rc = -ENOMEM; 5274 goto exit; 5275 } 5276 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5277 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5278 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5279 } 5280 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5281 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5282 exit: 5283 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5284 return rc; 5285 } 5286 5287 static int 5288 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5289 const char *name, 5290 const struct spdk_nvme_transport_id *trid, 5291 struct nvme_async_probe_ctx *ctx) 5292 { 5293 struct nvme_ctrlr *nvme_ctrlr; 5294 struct nvme_path_id *path_id; 5295 const struct spdk_nvme_ctrlr_data *cdata; 5296 int rc; 5297 5298 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5299 if (nvme_ctrlr == NULL) { 5300 SPDK_ERRLOG("Failed to allocate device struct\n"); 5301 return -ENOMEM; 5302 } 5303 5304 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5305 if (rc != 0) { 5306 free(nvme_ctrlr); 5307 return rc; 5308 } 5309 5310 TAILQ_INIT(&nvme_ctrlr->trids); 5311 5312 RB_INIT(&nvme_ctrlr->namespaces); 5313 5314 path_id = calloc(1, sizeof(*path_id)); 5315 if (path_id == NULL) { 5316 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5317 rc = -ENOMEM; 5318 goto err; 5319 } 5320 5321 path_id->trid = *trid; 5322 if (ctx != NULL) { 5323 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5324 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5325 } 5326 nvme_ctrlr->active_path_id = path_id; 5327 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5328 5329 nvme_ctrlr->thread = spdk_get_thread(); 5330 nvme_ctrlr->ctrlr = ctrlr; 5331 nvme_ctrlr->ref = 1; 5332 5333 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5334 SPDK_ERRLOG("OCSSDs are not supported\n"); 5335 rc = -ENOTSUP; 5336 goto err; 5337 } 5338 5339
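/* Per-controller options: when this create is driven by an attach RPC, ctx
 * carries the user-supplied options (ctx->bdev_opts); controllers found by
 * the PCIe hotplug poller are created via attach_cb() with ctx == NULL and
 * fall back to the global defaults below.
 */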
if (ctx != NULL) { 5340 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5341 } else { 5342 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5343 } 5344 5345 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5346 g_opts.nvme_adminq_poll_period_us); 5347 5348 if (g_opts.timeout_us > 0) { 5349 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5350 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5351 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5352 g_opts.timeout_us : g_opts.timeout_admin_us; 5353 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5354 adm_timeout_us, timeout_cb, nvme_ctrlr); 5355 } 5356 5357 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5358 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5359 5360 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5361 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5362 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5363 } 5364 5365 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5366 if (rc != 0) { 5367 goto err; 5368 } 5369 5370 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5371 5372 if (cdata->cmic.ana_reporting) { 5373 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5374 if (rc == 0) { 5375 return 0; 5376 } 5377 } else { 5378 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5379 return 0; 5380 } 5381 5382 err: 5383 nvme_ctrlr_delete(nvme_ctrlr); 5384 return rc; 5385 } 5386 5387 void 5388 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5389 { 5390 opts->prchk_flags = 0; 5391 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5392 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5393 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5394 } 5395 5396 static void 5397 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5398 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5399 { 5400 char *name; 5401 5402 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5403 if (!name) { 5404 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5405 return; 5406 } 5407 5408 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5409 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5410 } else { 5411 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5412 } 5413 5414 free(name); 5415 } 5416 5417 static void 5418 _nvme_ctrlr_destruct(void *ctx) 5419 { 5420 struct nvme_ctrlr *nvme_ctrlr = ctx; 5421 5422 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5423 nvme_ctrlr_release(nvme_ctrlr); 5424 } 5425 5426 static int 5427 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5428 { 5429 struct nvme_probe_skip_entry *entry; 5430 5431 /* The controller's destruction was already started */ 5432 if (nvme_ctrlr->destruct) { 5433 return -EALREADY; 5434 } 5435 5436 if (!hotplug && 5437 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5438 entry = calloc(1, sizeof(*entry)); 5439 if (!entry) { 5440 return -ENOMEM; 5441 } 5442 entry->trid = nvme_ctrlr->active_path_id->trid; 5443 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5444 } 5445 5446 nvme_ctrlr->destruct = true; 5447 return 0; 5448 } 5449 5450 static int 5451 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5452 { 5453 int rc; 5454 5455 pthread_mutex_lock(&nvme_ctrlr->mutex); 5456 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5457 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5458 5459 if (rc == 0) { 5460 _nvme_ctrlr_destruct(nvme_ctrlr); 5461 } else if (rc == -EALREADY) { 5462 rc = 0; 5463 } 5464 5465 return rc; 5466 } 5467 5468 static void 5469 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5470 { 5471 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5472 5473 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5474 } 5475 5476 static int 5477 bdev_nvme_hotplug_probe(void *arg) 5478 { 5479 if (g_hotplug_probe_ctx == NULL) { 5480 spdk_poller_unregister(&g_hotplug_probe_poller); 5481 return SPDK_POLLER_IDLE; 5482 } 5483 5484 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5485 g_hotplug_probe_ctx = NULL; 5486 spdk_poller_unregister(&g_hotplug_probe_poller); 5487 } 5488 5489 return SPDK_POLLER_BUSY; 5490 } 5491 5492 static int 5493 bdev_nvme_hotplug(void *arg) 5494 { 5495 struct spdk_nvme_transport_id trid_pcie; 5496 5497 if (g_hotplug_probe_ctx) { 5498 return SPDK_POLLER_BUSY; 5499 } 5500 5501 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5502 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5503 5504 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5505 hotplug_probe_cb, attach_cb, NULL); 5506 5507 if (g_hotplug_probe_ctx) { 5508 assert(g_hotplug_probe_poller == NULL); 5509 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5510 } 5511 5512 return SPDK_POLLER_BUSY; 5513 } 5514 5515 void 5516 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5517 { 5518 *opts = g_opts; 5519 } 5520 5521 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5522 uint32_t reconnect_delay_sec, 5523 uint32_t fast_io_fail_timeout_sec); 5524 5525 static int 5526 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5527 { 5528 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5529 /* Can't set timeout_admin_us without also setting timeout_us */ 5530 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5531 return -EINVAL; 5532 } 5533 5534 if (opts->bdev_retry_count < -1) { 5535 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5536 return -EINVAL; 5537 } 5538 5539 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5540 opts->reconnect_delay_sec, 5541 opts->fast_io_fail_timeout_sec)) { 5542 return -EINVAL; 5543 } 5544 5545 return 0; 5546 } 5547 5548 int 5549 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5550 { 5551 int ret; 5552 5553 ret = bdev_nvme_validate_opts(opts); 5554 if (ret) { 5555 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5556 return ret; 5557 } 5558 5559 if (g_bdev_nvme_init_thread != NULL) { 5560 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5561 return -EPERM; 5562 } 5563 } 5564 5565 if (opts->rdma_srq_size != 0 || 5566 opts->rdma_max_cq_size != 0 || 5567 opts->rdma_cm_event_timeout_ms != 0) { 5568 struct spdk_nvme_transport_opts drv_opts; 5569 5570 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5571 if (opts->rdma_srq_size != 0) { 5572 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5573 } 5574 if (opts->rdma_max_cq_size != 0) { 5575 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5576 } 5577 if (opts->rdma_cm_event_timeout_ms != 0) { 5578 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5579 } 5580 5581 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5582 if (ret) { 5583 SPDK_ERRLOG("Failed to 
set NVMe transport opts.\n"); 5584 return ret; 5585 } 5586 } 5587 5588 g_opts = *opts; 5589 5590 return 0; 5591 } 5592 5593 struct set_nvme_hotplug_ctx { 5594 uint64_t period_us; 5595 bool enabled; 5596 spdk_msg_fn fn; 5597 void *fn_ctx; 5598 }; 5599 5600 static void 5601 set_nvme_hotplug_period_cb(void *_ctx) 5602 { 5603 struct set_nvme_hotplug_ctx *ctx = _ctx; 5604 5605 spdk_poller_unregister(&g_hotplug_poller); 5606 if (ctx->enabled) { 5607 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5608 } 5609 5610 g_nvme_hotplug_poll_period_us = ctx->period_us; 5611 g_nvme_hotplug_enabled = ctx->enabled; 5612 if (ctx->fn) { 5613 ctx->fn(ctx->fn_ctx); 5614 } 5615 5616 free(ctx); 5617 } 5618 5619 int 5620 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5621 { 5622 struct set_nvme_hotplug_ctx *ctx; 5623 5624 if (enabled == true && !spdk_process_is_primary()) { 5625 return -EPERM; 5626 } 5627 5628 ctx = calloc(1, sizeof(*ctx)); 5629 if (ctx == NULL) { 5630 return -ENOMEM; 5631 } 5632 5633 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5634 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5635 ctx->enabled = enabled; 5636 ctx->fn = cb; 5637 ctx->fn_ctx = cb_ctx; 5638 5639 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5640 return 0; 5641 } 5642 5643 static void 5644 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5645 struct nvme_async_probe_ctx *ctx) 5646 { 5647 struct nvme_ns *nvme_ns; 5648 struct nvme_bdev *nvme_bdev; 5649 size_t j; 5650 5651 assert(nvme_ctrlr != NULL); 5652 5653 if (ctx->names == NULL) { 5654 ctx->reported_bdevs = 0; 5655 populate_namespaces_cb(ctx, 0); 5656 return; 5657 } 5658 5659 /* 5660 * Report the new bdevs that were created in this call. 5661 * There can be more than one bdev per NVMe controller. 5662 */ 5663 j = 0; 5664 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5665 while (nvme_ns != NULL) { 5666 nvme_bdev = nvme_ns->bdev; 5667 if (j < ctx->max_bdevs) { 5668 ctx->names[j] = nvme_bdev->disk.name; 5669 j++; 5670 } else { 5671 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5672 ctx->max_bdevs); 5673 ctx->reported_bdevs = 0; 5674 populate_namespaces_cb(ctx, -ERANGE); 5675 return; 5676 } 5677 5678 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5679 } 5680 5681 ctx->reported_bdevs = j; 5682 populate_namespaces_cb(ctx, 0); 5683 } 5684 5685 static int 5686 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5687 struct spdk_nvme_ctrlr *new_ctrlr, 5688 struct spdk_nvme_transport_id *trid) 5689 { 5690 struct nvme_path_id *tmp_trid; 5691 5692 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5693 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5694 return -ENOTSUP; 5695 } 5696 5697 /* Currently we only support failover to the same transport type. */ 5698 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5699 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5700 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5701 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5702 return -EINVAL; 5703 } 5704 5705 5706 /* Currently we only support failover to the same NQN. 
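 * For example (addresses are illustrative), a second path at traddr
 * 192.168.1.2 may be added to a controller whose active path is at
 * 192.168.1.1 as long as both report the same subnqn; a path exposing a
 * different subnqn has to be attached as a separate controller instead.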
*/ 5707 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5708 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5709 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5710 return -EINVAL; 5711 } 5712 5713 /* Skip all the other checks if we've already registered this path. */ 5714 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5715 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5716 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5717 trid->subnqn); 5718 return -EEXIST; 5719 } 5720 } 5721 5722 return 0; 5723 } 5724 5725 static int 5726 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5727 struct spdk_nvme_ctrlr *new_ctrlr) 5728 { 5729 struct nvme_ns *nvme_ns; 5730 struct spdk_nvme_ns *new_ns; 5731 5732 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5733 while (nvme_ns != NULL) { 5734 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5735 assert(new_ns != NULL); 5736 5737 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5738 return -EINVAL; 5739 } 5740 5741 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5742 } 5743 5744 return 0; 5745 } 5746 5747 static int 5748 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5749 struct spdk_nvme_transport_id *trid) 5750 { 5751 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5752 5753 new_trid = calloc(1, sizeof(*new_trid)); 5754 if (new_trid == NULL) { 5755 return -ENOMEM; 5756 } 5757 new_trid->trid = *trid; 5758 5759 active_id = nvme_ctrlr->active_path_id; 5760 assert(active_id != NULL); 5761 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5762 5763 /* Skip the active trid so that it is not replaced until it fails. */ 5764 tmp_trid = TAILQ_NEXT(active_id, link); 5765 if (tmp_trid == NULL) { 5766 goto add_tail; 5767 } 5768 5769 /* A trid is considered failed if its last failed time is non-zero. 5770 * Insert the new alternate trid before any failed trid. 5771 */ 5772 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5773 if (tmp_trid->last_failed_tsc != 0) { 5774 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5775 return 0; 5776 } 5777 } 5778 5779 add_tail: 5780 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5781 return 0; 5782 } 5783 5784 /* This handles the case where a secondary path is added to an existing 5785 * nvme_ctrlr for failover. After checking that it can access the same 5786 * namespaces as the primary path, it is disconnected until failover occurs.
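 * Only the transport ID is recorded (via _bdev_nvme_add_secondary_trid()
 * above); the probe-time controller handle passed in as new_ctrlr is
 * detached again below, and a fresh connection is made from the stored trid
 * when failover later switches to this path.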
5787 */ 5788 static int 5789 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5790 struct spdk_nvme_ctrlr *new_ctrlr, 5791 struct spdk_nvme_transport_id *trid) 5792 { 5793 int rc; 5794 5795 assert(nvme_ctrlr != NULL); 5796 5797 pthread_mutex_lock(&nvme_ctrlr->mutex); 5798 5799 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5800 if (rc != 0) { 5801 goto exit; 5802 } 5803 5804 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5805 if (rc != 0) { 5806 goto exit; 5807 } 5808 5809 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5810 5811 exit: 5812 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5813 5814 spdk_nvme_detach(new_ctrlr); 5815 5816 return rc; 5817 } 5818 5819 static void 5820 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5821 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5822 { 5823 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5824 struct nvme_async_probe_ctx *ctx; 5825 int rc; 5826 5827 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5828 ctx->ctrlr_attached = true; 5829 5830 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5831 if (rc != 0) { 5832 ctx->reported_bdevs = 0; 5833 populate_namespaces_cb(ctx, rc); 5834 } 5835 } 5836 5837 static void 5838 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5839 struct spdk_nvme_ctrlr *ctrlr, 5840 const struct spdk_nvme_ctrlr_opts *opts) 5841 { 5842 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5843 struct nvme_ctrlr *nvme_ctrlr; 5844 struct nvme_async_probe_ctx *ctx; 5845 int rc; 5846 5847 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5848 ctx->ctrlr_attached = true; 5849 5850 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5851 if (nvme_ctrlr) { 5852 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5853 } else { 5854 rc = -ENODEV; 5855 } 5856 5857 ctx->reported_bdevs = 0; 5858 populate_namespaces_cb(ctx, rc); 5859 } 5860 5861 static int 5862 bdev_nvme_async_poll(void *arg) 5863 { 5864 struct nvme_async_probe_ctx *ctx = arg; 5865 int rc; 5866 5867 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5868 if (spdk_unlikely(rc != -EAGAIN)) { 5869 ctx->probe_done = true; 5870 spdk_poller_unregister(&ctx->poller); 5871 if (!ctx->ctrlr_attached) { 5872 /* The probe is done, but no controller was attached. 5873 * That means we had a failure, so report -EIO back to 5874 * the caller (usually the RPC). populate_namespaces_cb() 5875 * will take care of freeing the nvme_async_probe_ctx. 5876 */ 5877 ctx->reported_bdevs = 0; 5878 populate_namespaces_cb(ctx, -EIO); 5879 } else if (ctx->namespaces_populated) { 5880 /* The namespaces for the attached controller were all 5881 * populated and the response was already sent to the 5882 * caller (usually the RPC). So free the context here. 
5883 */ 5884 free(ctx); 5885 } 5886 } 5887 5888 return SPDK_POLLER_BUSY; 5889 } 5890 5891 static bool 5892 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5893 uint32_t reconnect_delay_sec, 5894 uint32_t fast_io_fail_timeout_sec) 5895 { 5896 if (ctrlr_loss_timeout_sec < -1) { 5897 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5898 return false; 5899 } else if (ctrlr_loss_timeout_sec == -1) { 5900 if (reconnect_delay_sec == 0) { 5901 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5902 return false; 5903 } else if (fast_io_fail_timeout_sec != 0 && 5904 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5905 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 5906 return false; 5907 } 5908 } else if (ctrlr_loss_timeout_sec != 0) { 5909 if (reconnect_delay_sec == 0) { 5910 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5911 return false; 5912 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5913 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5914 return false; 5915 } else if (fast_io_fail_timeout_sec != 0) { 5916 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5917 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5918 return false; 5919 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5920 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5921 return false; 5922 } 5923 } 5924 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5925 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5926 return false; 5927 } 5928 5929 return true; 5930 } 5931 5932 int 5933 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 5934 const char *base_name, 5935 const char **names, 5936 uint32_t count, 5937 spdk_bdev_create_nvme_fn cb_fn, 5938 void *cb_ctx, 5939 struct spdk_nvme_ctrlr_opts *drv_opts, 5940 struct nvme_ctrlr_opts *bdev_opts, 5941 bool multipath) 5942 { 5943 struct nvme_probe_skip_entry *entry, *tmp; 5944 struct nvme_async_probe_ctx *ctx; 5945 spdk_nvme_attach_cb attach_cb; 5946 int len; 5947 5948 /* TODO expand this check to include both the host and target TRIDs. 5949 * Only if both are the same should we fail. 
5950 */ 5951 if (nvme_ctrlr_get(trid) != NULL) { 5952 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5953 return -EEXIST; 5954 } 5955 5956 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 5957 5958 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 5959 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 5960 return -EINVAL; 5961 } 5962 5963 if (bdev_opts != NULL && 5964 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5965 bdev_opts->reconnect_delay_sec, 5966 bdev_opts->fast_io_fail_timeout_sec)) { 5967 return -EINVAL; 5968 } 5969 5970 ctx = calloc(1, sizeof(*ctx)); 5971 if (!ctx) { 5972 return -ENOMEM; 5973 } 5974 ctx->base_name = base_name; 5975 ctx->names = names; 5976 ctx->max_bdevs = count; 5977 ctx->cb_fn = cb_fn; 5978 ctx->cb_ctx = cb_ctx; 5979 ctx->trid = *trid; 5980 5981 if (bdev_opts) { 5982 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5983 } else { 5984 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5985 } 5986 5987 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5988 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5989 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5990 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5991 free(entry); 5992 break; 5993 } 5994 } 5995 } 5996 5997 if (drv_opts) { 5998 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5999 } else { 6000 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6001 } 6002 6003 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6004 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6005 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6006 ctx->drv_opts.disable_read_ana_log_page = true; 6007 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6008 6009 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6010 attach_cb = connect_attach_cb; 6011 } else { 6012 attach_cb = connect_set_failover_cb; 6013 } 6014 6015 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6016 if (ctx->probe_ctx == NULL) { 6017 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6018 free(ctx); 6019 return -ENODEV; 6020 } 6021 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6022 6023 return 0; 6024 } 6025 6026 struct bdev_nvme_delete_ctx { 6027 char *name; 6028 struct nvme_path_id path_id; 6029 bdev_nvme_delete_done_fn delete_done; 6030 void *delete_done_ctx; 6031 uint64_t timeout_ticks; 6032 struct spdk_poller *poller; 6033 }; 6034 6035 static void 6036 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6037 { 6038 if (ctx != NULL) { 6039 free(ctx->name); 6040 free(ctx); 6041 } 6042 } 6043 6044 static bool 6045 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6046 { 6047 if (path_id->trid.trtype != 0) { 6048 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6049 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6050 return false; 6051 } 6052 } else { 6053 if (path_id->trid.trtype != p->trid.trtype) { 6054 return false; 6055 } 6056 } 6057 } 6058 6059 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6060 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6061 return false; 6062 } 6063 } 6064 6065 if (path_id->trid.adrfam != 0) { 6066 if (path_id->trid.adrfam != p->trid.adrfam) { 6067 return 
false; 6068 } 6069 } 6070 6071 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6072 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6073 return false; 6074 } 6075 } 6076 6077 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6078 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6079 return false; 6080 } 6081 } 6082 6083 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6084 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6085 return false; 6086 } 6087 } 6088 6089 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6090 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6091 return false; 6092 } 6093 } 6094 6095 return true; 6096 } 6097 6098 static bool 6099 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6100 { 6101 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6102 struct nvme_ctrlr *ctrlr; 6103 struct nvme_path_id *p; 6104 6105 pthread_mutex_lock(&g_bdev_nvme_mutex); 6106 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6107 if (!nbdev_ctrlr) { 6108 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6109 return false; 6110 } 6111 6112 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6113 pthread_mutex_lock(&ctrlr->mutex); 6114 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6115 if (nvme_path_id_compare(p, path_id)) { 6116 pthread_mutex_unlock(&ctrlr->mutex); 6117 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6118 return true; 6119 } 6120 } 6121 pthread_mutex_unlock(&ctrlr->mutex); 6122 } 6123 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6124 6125 return false; 6126 } 6127 6128 static int 6129 bdev_nvme_delete_complete_poll(void *arg) 6130 { 6131 struct bdev_nvme_delete_ctx *ctx = arg; 6132 int rc = 0; 6133 6134 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6135 if (ctx->timeout_ticks > spdk_get_ticks()) { 6136 return SPDK_POLLER_BUSY; 6137 } 6138 6139 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6140 rc = -ETIMEDOUT; 6141 } 6142 6143 spdk_poller_unregister(&ctx->poller); 6144 6145 ctx->delete_done(ctx->delete_done_ctx, rc); 6146 free_bdev_nvme_delete_ctx(ctx); 6147 6148 return SPDK_POLLER_BUSY; 6149 } 6150 6151 static int 6152 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6153 { 6154 struct nvme_path_id *p, *t; 6155 spdk_msg_fn msg_fn; 6156 int rc = -ENXIO; 6157 6158 pthread_mutex_lock(&nvme_ctrlr->mutex); 6159 6160 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6161 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6162 break; 6163 } 6164 6165 if (!nvme_path_id_compare(p, path_id)) { 6166 continue; 6167 } 6168 6169 /* We are not using the specified path. */ 6170 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6171 free(p); 6172 rc = 0; 6173 } 6174 6175 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6176 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6177 return rc; 6178 } 6179 6180 /* If we made it here, then this path is a match! Now we need to remove it. */ 6181 6182 /* This is the active path in use right now. The active path is always the first in the list. */ 6183 assert(p == nvme_ctrlr->active_path_id); 6184 6185 if (!TAILQ_NEXT(p, link)) { 6186 /* The current path is the only path. */ 6187 msg_fn = _nvme_ctrlr_destruct; 6188 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6189 } else { 6190 /* There is an alternative path. 
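 * Request a failover so the controller is reset and reconnects over one of the remaining paths.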
*/ 6191 msg_fn = _bdev_nvme_reset_ctrlr; 6192 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6193 } 6194 6195 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6196 6197 if (rc == 0) { 6198 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6199 } else if (rc == -EALREADY) { 6200 rc = 0; 6201 } 6202 6203 return rc; 6204 } 6205 6206 int 6207 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6208 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6209 { 6210 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6211 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6212 struct bdev_nvme_delete_ctx *ctx = NULL; 6213 int rc = -ENXIO, _rc; 6214 6215 if (name == NULL || path_id == NULL) { 6216 rc = -EINVAL; 6217 goto exit; 6218 } 6219 6220 pthread_mutex_lock(&g_bdev_nvme_mutex); 6221 6222 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6223 if (nbdev_ctrlr == NULL) { 6224 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6225 6226 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6227 rc = -ENODEV; 6228 goto exit; 6229 } 6230 6231 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6232 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6233 if (_rc < 0 && _rc != -ENXIO) { 6234 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6235 rc = _rc; 6236 goto exit; 6237 } else if (_rc == 0) { 6238 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6239 * was deleted successfully. To remember the successful deletion, 6240 * overwrite rc only if _rc is zero. 6241 */ 6242 rc = 0; 6243 } 6244 } 6245 6246 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6247 6248 if (rc != 0 || delete_done == NULL) { 6249 goto exit; 6250 } 6251 6252 ctx = calloc(1, sizeof(*ctx)); 6253 if (ctx == NULL) { 6254 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6255 rc = -ENOMEM; 6256 goto exit; 6257 } 6258 6259 ctx->name = strdup(name); 6260 if (ctx->name == NULL) { 6261 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6262 rc = -ENOMEM; 6263 goto exit; 6264 } 6265 6266 ctx->delete_done = delete_done; 6267 ctx->delete_done_ctx = delete_done_ctx; 6268 ctx->path_id = *path_id; 6269 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6270 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6271 if (ctx->poller == NULL) { 6272 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6273 rc = -ENOMEM; 6274 goto exit; 6275 } 6276 6277 exit: 6278 if (rc != 0) { 6279 free_bdev_nvme_delete_ctx(ctx); 6280 } 6281 6282 return rc; 6283 } 6284 6285 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6286 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6287 6288 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6289 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6290 6291 struct discovery_entry_ctx { 6292 char name[128]; 6293 struct spdk_nvme_transport_id trid; 6294 struct spdk_nvme_ctrlr_opts drv_opts; 6295 struct spdk_nvmf_discovery_log_page_entry entry; 6296 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6297 struct discovery_ctx *ctx; 6298 }; 6299 6300 struct discovery_ctx { 6301 char *name; 6302 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6303 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6304 void *cb_ctx; 6305 struct spdk_nvme_probe_ctx *probe_ctx; 6306 struct spdk_nvme_detach_ctx *detach_ctx; 6307 struct spdk_nvme_ctrlr *ctrlr; 6308 struct spdk_nvme_transport_id trid; 6309 struct discovery_entry_ctx *entry_ctx_in_use; 6310 struct spdk_poller *poller; 6311 struct spdk_nvme_ctrlr_opts drv_opts; 6312 struct nvme_ctrlr_opts bdev_opts; 6313 struct spdk_nvmf_discovery_log_page *log_page; 6314 TAILQ_ENTRY(discovery_ctx) tailq; 6315 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6316 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6317 int rc; 6318 bool wait_for_attach; 6319 uint64_t timeout_ticks; 6320 /* Denotes that the discovery service is being started. We're waiting 6321 * for the initial connection to the discovery controller to be 6322 * established and attach discovered NVM ctrlrs. 6323 */ 6324 bool initializing; 6325 /* Denotes if a discovery is currently in progress for this context. 6326 * That includes connecting to newly discovered subsystems. Used to 6327 * ensure we do not start a new discovery until an existing one is 6328 * complete. 6329 */ 6330 bool in_progress; 6331 6332 /* Denotes if another discovery is needed after the one in progress 6333 * completes. Set when we receive an AER completion while a discovery 6334 * is already in progress. 6335 */ 6336 bool pending; 6337 6338 /* Signal to the discovery context poller that it should stop the 6339 * discovery service, including detaching from the current discovery 6340 * controller. 6341 */ 6342 bool stop; 6343 6344 struct spdk_thread *calling_thread; 6345 uint32_t index; 6346 uint32_t attach_in_progress; 6347 char *hostnqn; 6348 6349 /* Denotes if the discovery service was started by the mdns discovery. 
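 * service on the user's behalf rather than by an explicit start-discovery request.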
6350 */ 6351 bool from_mdns_discovery_service; 6352 }; 6353 6354 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6355 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6356 6357 static void get_discovery_log_page(struct discovery_ctx *ctx); 6358 6359 static void 6360 free_discovery_ctx(struct discovery_ctx *ctx) 6361 { 6362 free(ctx->log_page); 6363 free(ctx->hostnqn); 6364 free(ctx->name); 6365 free(ctx); 6366 } 6367 6368 static void 6369 discovery_complete(struct discovery_ctx *ctx) 6370 { 6371 ctx->initializing = false; 6372 ctx->in_progress = false; 6373 if (ctx->pending) { 6374 ctx->pending = false; 6375 get_discovery_log_page(ctx); 6376 } 6377 } 6378 6379 static void 6380 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6381 struct spdk_nvmf_discovery_log_page_entry *entry) 6382 { 6383 char *space; 6384 6385 trid->trtype = entry->trtype; 6386 trid->adrfam = entry->adrfam; 6387 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6388 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6389 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6390 * before call to this function trid->subnqn is zeroed out, we need 6391 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6392 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6393 */ 6394 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6395 6396 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6397 * But the log page entries typically pad them with spaces, not zeroes. 6398 * So add a NULL terminator to each of these fields at the appropriate 6399 * location. 6400 */ 6401 space = strchr(trid->traddr, ' '); 6402 if (space) { 6403 *space = 0; 6404 } 6405 space = strchr(trid->trsvcid, ' '); 6406 if (space) { 6407 *space = 0; 6408 } 6409 space = strchr(trid->subnqn, ' '); 6410 if (space) { 6411 *space = 0; 6412 } 6413 } 6414 6415 static void 6416 _stop_discovery(void *_ctx) 6417 { 6418 struct discovery_ctx *ctx = _ctx; 6419 6420 if (ctx->attach_in_progress > 0) { 6421 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6422 return; 6423 } 6424 6425 ctx->stop = true; 6426 6427 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6428 struct discovery_entry_ctx *entry_ctx; 6429 struct nvme_path_id path = {}; 6430 6431 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6432 path.trid = entry_ctx->trid; 6433 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6434 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6435 free(entry_ctx); 6436 } 6437 6438 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6439 struct discovery_entry_ctx *entry_ctx; 6440 6441 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6442 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6443 free(entry_ctx); 6444 } 6445 6446 free(ctx->entry_ctx_in_use); 6447 ctx->entry_ctx_in_use = NULL; 6448 } 6449 6450 static void 6451 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6452 { 6453 ctx->stop_cb_fn = cb_fn; 6454 ctx->cb_ctx = cb_ctx; 6455 6456 if (ctx->attach_in_progress > 0) { 6457 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6458 ctx->attach_in_progress); 6459 } 6460 6461 _stop_discovery(ctx); 6462 } 6463 6464 static void 6465 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6466 { 6467 struct discovery_ctx *d_ctx; 6468 struct nvme_path_id *path_id; 6469 struct spdk_nvme_transport_id 
trid = {}; 6470 struct discovery_entry_ctx *entry_ctx, *tmp; 6471 6472 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6473 6474 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6475 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6476 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6477 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6478 continue; 6479 } 6480 6481 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6482 free(entry_ctx); 6483 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6484 trid.subnqn, trid.traddr, trid.trsvcid); 6485 6486 /* Fail discovery ctrlr to force reattach attempt */ 6487 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6488 } 6489 } 6490 } 6491 6492 static void 6493 discovery_remove_controllers(struct discovery_ctx *ctx) 6494 { 6495 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6496 struct discovery_entry_ctx *entry_ctx, *tmp; 6497 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6498 struct spdk_nvme_transport_id old_trid = {}; 6499 uint64_t numrec, i; 6500 bool found; 6501 6502 numrec = from_le64(&log_page->numrec); 6503 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6504 found = false; 6505 old_entry = &entry_ctx->entry; 6506 build_trid_from_log_page_entry(&old_trid, old_entry); 6507 for (i = 0; i < numrec; i++) { 6508 new_entry = &log_page->entries[i]; 6509 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6510 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6511 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6512 found = true; 6513 break; 6514 } 6515 } 6516 if (!found) { 6517 struct nvme_path_id path = {}; 6518 6519 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6520 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6521 6522 path.trid = entry_ctx->trid; 6523 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6524 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6525 free(entry_ctx); 6526 } 6527 } 6528 free(log_page); 6529 ctx->log_page = NULL; 6530 discovery_complete(ctx); 6531 } 6532 6533 static void 6534 complete_discovery_start(struct discovery_ctx *ctx, int status) 6535 { 6536 ctx->timeout_ticks = 0; 6537 ctx->rc = status; 6538 if (ctx->start_cb_fn) { 6539 ctx->start_cb_fn(ctx->cb_ctx, status); 6540 ctx->start_cb_fn = NULL; 6541 ctx->cb_ctx = NULL; 6542 } 6543 } 6544 6545 static void 6546 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6547 { 6548 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6549 struct discovery_ctx *ctx = entry_ctx->ctx; 6550 6551 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6552 ctx->attach_in_progress--; 6553 if (ctx->attach_in_progress == 0) { 6554 complete_discovery_start(ctx, ctx->rc); 6555 if (ctx->initializing && ctx->rc != 0) { 6556 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6557 stop_discovery(ctx, NULL, ctx->cb_ctx); 6558 } else { 6559 discovery_remove_controllers(ctx); 6560 } 6561 } 6562 } 6563 6564 static struct discovery_entry_ctx * 6565 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6566 { 6567 struct discovery_entry_ctx *new_ctx; 6568 6569 new_ctx = calloc(1, sizeof(*new_ctx)); 6570 if (new_ctx == NULL) { 6571 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6572 return NULL; 6573 } 6574 6575 new_ctx->ctx = ctx; 6576 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6577 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
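/* Connections made from this entry use the parent discovery context's hostnqn. */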
6578 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6579 return new_ctx; 6580 } 6581 6582 static void 6583 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6584 struct spdk_nvmf_discovery_log_page *log_page) 6585 { 6586 struct discovery_ctx *ctx = cb_arg; 6587 struct discovery_entry_ctx *entry_ctx, *tmp; 6588 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6589 uint64_t numrec, i; 6590 bool found; 6591 6592 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6593 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6594 return; 6595 } 6596 6597 ctx->log_page = log_page; 6598 assert(ctx->attach_in_progress == 0); 6599 numrec = from_le64(&log_page->numrec); 6600 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6601 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6602 free(entry_ctx); 6603 } 6604 for (i = 0; i < numrec; i++) { 6605 found = false; 6606 new_entry = &log_page->entries[i]; 6607 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6608 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6609 struct discovery_entry_ctx *new_ctx; 6610 struct spdk_nvme_transport_id trid = {}; 6611 6612 build_trid_from_log_page_entry(&trid, new_entry); 6613 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6614 if (new_ctx == NULL) { 6615 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6616 break; 6617 } 6618 6619 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6620 continue; 6621 } 6622 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6623 old_entry = &entry_ctx->entry; 6624 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6625 found = true; 6626 break; 6627 } 6628 } 6629 if (!found) { 6630 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6631 struct discovery_ctx *d_ctx; 6632 6633 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6634 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6635 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6636 sizeof(new_entry->subnqn))) { 6637 break; 6638 } 6639 } 6640 if (subnqn_ctx) { 6641 break; 6642 } 6643 } 6644 6645 new_ctx = calloc(1, sizeof(*new_ctx)); 6646 if (new_ctx == NULL) { 6647 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6648 break; 6649 } 6650 6651 new_ctx->ctx = ctx; 6652 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6653 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6654 if (subnqn_ctx) { 6655 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6656 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6657 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6658 new_ctx->name); 6659 } else { 6660 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6661 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6662 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6663 new_ctx->name); 6664 } 6665 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6666 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6667 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6668 discovery_attach_controller_done, new_ctx, 6669 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6670 if (rc == 0) { 6671 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6672 ctx->attach_in_progress++; 6673 } else { 6674 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6675 } 6676 } 6677 } 6678 6679 if (ctx->attach_in_progress == 0) { 6680 discovery_remove_controllers(ctx); 6681 } 6682 } 6683 6684 static void 6685 get_discovery_log_page(struct discovery_ctx *ctx) 6686 { 6687 int rc; 6688 6689 assert(ctx->in_progress == false); 6690 ctx->in_progress = true; 6691 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6692 if (rc != 0) { 6693 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6694 } 6695 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6696 } 6697 6698 static void 6699 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6700 { 6701 struct discovery_ctx *ctx = arg; 6702 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6703 6704 if (spdk_nvme_cpl_is_error(cpl)) { 6705 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6706 return; 6707 } 6708 6709 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6710 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6711 return; 6712 } 6713 6714 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6715 if (ctx->in_progress) { 6716 ctx->pending = true; 6717 return; 6718 } 6719 6720 get_discovery_log_page(ctx); 6721 } 6722 6723 static void 6724 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6725 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6726 { 6727 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6728 struct discovery_ctx *ctx; 6729 6730 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6731 6732 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6733 ctx->probe_ctx = NULL; 6734 ctx->ctrlr = ctrlr; 6735 6736 if (ctx->rc != 0) { 6737 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6738 ctx->rc); 6739 return; 6740 } 6741 6742 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6743 } 6744 6745 static int 6746 discovery_poller(void *arg) 6747 { 6748 struct discovery_ctx *ctx = arg; 6749 struct spdk_nvme_transport_id *trid; 6750 int rc; 6751 6752 if (ctx->detach_ctx) { 6753 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6754 if (rc != -EAGAIN) { 6755 ctx->detach_ctx = NULL; 6756 ctx->ctrlr = NULL; 6757 } 6758 } else if (ctx->stop) { 6759 if (ctx->ctrlr != NULL) { 6760 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6761 if (rc == 0) { 6762 return SPDK_POLLER_BUSY; 6763 } 6764 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6765 } 6766 spdk_poller_unregister(&ctx->poller); 6767 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6768 assert(ctx->start_cb_fn == NULL); 6769 if (ctx->stop_cb_fn != NULL) { 6770 ctx->stop_cb_fn(ctx->cb_ctx); 6771 } 6772 free_discovery_ctx(ctx); 6773 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6774 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6775 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6776 assert(ctx->initializing); 6777 spdk_poller_unregister(&ctx->poller); 6778 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6779 complete_discovery_start(ctx, -ETIMEDOUT); 6780 stop_discovery(ctx, NULL, NULL); 6781 free_discovery_ctx(ctx); 6782 return SPDK_POLLER_BUSY; 6783 } 6784 6785 assert(ctx->entry_ctx_in_use == NULL); 6786 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6787 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6788 trid = &ctx->entry_ctx_in_use->trid; 6789 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6790 if 
(ctx->probe_ctx) { 6791 spdk_poller_unregister(&ctx->poller); 6792 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6793 } else { 6794 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6795 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6796 ctx->entry_ctx_in_use = NULL; 6797 } 6798 } else if (ctx->probe_ctx) { 6799 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6800 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6801 complete_discovery_start(ctx, -ETIMEDOUT); 6802 return SPDK_POLLER_BUSY; 6803 } 6804 6805 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6806 if (rc != -EAGAIN) { 6807 if (ctx->rc != 0) { 6808 assert(ctx->initializing); 6809 stop_discovery(ctx, NULL, ctx->cb_ctx); 6810 } else { 6811 assert(rc == 0); 6812 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6813 ctx->rc = rc; 6814 get_discovery_log_page(ctx); 6815 } 6816 } 6817 } else { 6818 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6819 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6820 complete_discovery_start(ctx, -ETIMEDOUT); 6821 /* We need to wait until all NVM ctrlrs are attached before we stop the 6822 * discovery service to make sure we don't detach a ctrlr that is still 6823 * being attached. 6824 */ 6825 if (ctx->attach_in_progress == 0) { 6826 stop_discovery(ctx, NULL, ctx->cb_ctx); 6827 return SPDK_POLLER_BUSY; 6828 } 6829 } 6830 6831 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6832 if (rc < 0) { 6833 spdk_poller_unregister(&ctx->poller); 6834 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6835 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6836 ctx->entry_ctx_in_use = NULL; 6837 6838 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6839 if (rc != 0) { 6840 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6841 ctx->ctrlr = NULL; 6842 } 6843 } 6844 } 6845 6846 return SPDK_POLLER_BUSY; 6847 } 6848 6849 static void 6850 start_discovery_poller(void *arg) 6851 { 6852 struct discovery_ctx *ctx = arg; 6853 6854 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6855 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6856 } 6857 6858 int 6859 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6860 const char *base_name, 6861 struct spdk_nvme_ctrlr_opts *drv_opts, 6862 struct nvme_ctrlr_opts *bdev_opts, 6863 uint64_t attach_timeout, 6864 bool from_mdns, 6865 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6866 { 6867 struct discovery_ctx *ctx; 6868 struct discovery_entry_ctx *discovery_entry_ctx; 6869 6870 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6871 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6872 if (strcmp(ctx->name, base_name) == 0) { 6873 return -EEXIST; 6874 } 6875 6876 if (ctx->entry_ctx_in_use != NULL) { 6877 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6878 return -EEXIST; 6879 } 6880 } 6881 6882 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6883 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6884 return -EEXIST; 6885 } 6886 } 6887 } 6888 6889 ctx = calloc(1, sizeof(*ctx)); 6890 if (ctx == NULL) { 6891 return -ENOMEM; 6892 } 6893 6894 ctx->name = strdup(base_name); 6895 if (ctx->name == NULL) { 6896 free_discovery_ctx(ctx); 6897 return -ENOMEM; 6898 } 6899 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
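/* Like drv_opts above, bdev_opts is copied by value; the caller keeps ownership of both. */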
6900 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6901 ctx->from_mdns_discovery_service = from_mdns; 6902 ctx->bdev_opts.from_discovery_service = true; 6903 ctx->calling_thread = spdk_get_thread(); 6904 ctx->start_cb_fn = cb_fn; 6905 ctx->cb_ctx = cb_ctx; 6906 ctx->initializing = true; 6907 if (ctx->start_cb_fn) { 6908 /* We can use this when dumping json to denote if this RPC parameter 6909 * was specified or not. 6910 */ 6911 ctx->wait_for_attach = true; 6912 } 6913 if (attach_timeout != 0) { 6914 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6915 spdk_get_ticks_hz() / 1000ull; 6916 } 6917 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6918 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6919 memcpy(&ctx->trid, trid, sizeof(*trid)); 6920 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6921 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6922 if (ctx->hostnqn == NULL) { 6923 free_discovery_ctx(ctx); 6924 return -ENOMEM; 6925 } 6926 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6927 if (discovery_entry_ctx == NULL) { 6928 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6929 free_discovery_ctx(ctx); 6930 return -ENOMEM; 6931 } 6932 6933 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6934 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6935 return 0; 6936 } 6937 6938 int 6939 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6940 { 6941 struct discovery_ctx *ctx; 6942 6943 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6944 if (strcmp(name, ctx->name) == 0) { 6945 if (ctx->stop) { 6946 return -EALREADY; 6947 } 6948 /* If we're still starting the discovery service and ->rc is non-zero, we're 6949 * going to stop it as soon as we can 6950 */ 6951 if (ctx->initializing && ctx->rc != 0) { 6952 return -EALREADY; 6953 } 6954 stop_discovery(ctx, cb_fn, cb_ctx); 6955 return 0; 6956 } 6957 } 6958 6959 return -ENOENT; 6960 } 6961 6962 static int 6963 bdev_nvme_library_init(void) 6964 { 6965 g_bdev_nvme_init_thread = spdk_get_thread(); 6966 6967 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6968 bdev_nvme_destroy_poll_group_cb, 6969 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6970 6971 return 0; 6972 } 6973 6974 static void 6975 bdev_nvme_fini_destruct_ctrlrs(void) 6976 { 6977 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6978 struct nvme_ctrlr *nvme_ctrlr; 6979 6980 pthread_mutex_lock(&g_bdev_nvme_mutex); 6981 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6982 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6983 pthread_mutex_lock(&nvme_ctrlr->mutex); 6984 if (nvme_ctrlr->destruct) { 6985 /* This controller's destruction was already started 6986 * before the application started shutting down 6987 */ 6988 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6989 continue; 6990 } 6991 nvme_ctrlr->destruct = true; 6992 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6993 6994 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6995 nvme_ctrlr); 6996 } 6997 } 6998 6999 g_bdev_nvme_module_finish = true; 7000 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7001 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7002 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7003 spdk_bdev_module_fini_done(); 7004 return; 7005 } 7006 7007 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7008 } 7009 7010 static void 7011 check_discovery_fini(void *arg) 7012 { 7013 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7014 bdev_nvme_fini_destruct_ctrlrs(); 
7015 } 7016 } 7017 7018 static void 7019 bdev_nvme_library_fini(void) 7020 { 7021 struct nvme_probe_skip_entry *entry, *entry_tmp; 7022 struct discovery_ctx *ctx; 7023 7024 spdk_poller_unregister(&g_hotplug_poller); 7025 free(g_hotplug_probe_ctx); 7026 g_hotplug_probe_ctx = NULL; 7027 7028 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7029 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7030 free(entry); 7031 } 7032 7033 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7034 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7035 bdev_nvme_fini_destruct_ctrlrs(); 7036 } else { 7037 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7038 stop_discovery(ctx, check_discovery_fini, NULL); 7039 } 7040 } 7041 } 7042 7043 static void 7044 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7045 { 7046 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7047 struct spdk_bdev *bdev = bdev_io->bdev; 7048 struct spdk_dif_ctx dif_ctx; 7049 struct spdk_dif_error err_blk = {}; 7050 int rc; 7051 struct spdk_dif_ctx_init_ext_opts dif_opts; 7052 7053 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7054 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7055 rc = spdk_dif_ctx_init(&dif_ctx, 7056 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7057 bdev->dif_is_head_of_md, bdev->dif_type, 7058 bdev_io->u.bdev.dif_check_flags, 7059 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7060 if (rc != 0) { 7061 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7062 return; 7063 } 7064 7065 if (bdev->md_interleave) { 7066 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7067 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7068 } else { 7069 struct iovec md_iov = { 7070 .iov_base = bdev_io->u.bdev.md_buf, 7071 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7072 }; 7073 7074 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7075 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7076 } 7077 7078 if (rc != 0) { 7079 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7080 err_blk.err_type, err_blk.err_offset); 7081 } else { 7082 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7083 } 7084 } 7085 7086 static void 7087 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7088 { 7089 struct nvme_bdev_io *bio = ref; 7090 7091 if (spdk_nvme_cpl_is_success(cpl)) { 7092 /* Run PI verification for read data buffer. */ 7093 bdev_nvme_verify_pi_error(bio); 7094 } 7095 7096 /* Return original completion status */ 7097 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7098 } 7099 7100 static void 7101 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7102 { 7103 struct nvme_bdev_io *bio = ref; 7104 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7105 int ret; 7106 7107 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7108 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7109 cpl->status.sct, cpl->status.sc); 7110 7111 /* Save completion status to use after verifying PI error. */ 7112 bio->cpl = *cpl; 7113 7114 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7115 /* Read without PI checking to verify PI error. 
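 * The failed completion was saved in bio->cpl above and is returned to the bdev layer once the
 * verification read finishes.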
*/ 7116 ret = bdev_nvme_no_pi_readv(bio, 7117 bdev_io->u.bdev.iovs, 7118 bdev_io->u.bdev.iovcnt, 7119 bdev_io->u.bdev.md_buf, 7120 bdev_io->u.bdev.num_blocks, 7121 bdev_io->u.bdev.offset_blocks); 7122 if (ret == 0) { 7123 return; 7124 } 7125 } 7126 } 7127 7128 bdev_nvme_io_complete_nvme_status(bio, cpl); 7129 } 7130 7131 static void 7132 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7133 { 7134 struct nvme_bdev_io *bio = ref; 7135 7136 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7137 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7138 cpl->status.sct, cpl->status.sc); 7139 /* Run PI verification for write data buffer if PI error is detected. */ 7140 bdev_nvme_verify_pi_error(bio); 7141 } 7142 7143 bdev_nvme_io_complete_nvme_status(bio, cpl); 7144 } 7145 7146 static void 7147 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7148 { 7149 struct nvme_bdev_io *bio = ref; 7150 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7151 7152 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7153 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7154 */ 7155 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7156 7157 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7158 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7159 cpl->status.sct, cpl->status.sc); 7160 /* Run PI verification for zone append data buffer if PI error is detected. */ 7161 bdev_nvme_verify_pi_error(bio); 7162 } 7163 7164 bdev_nvme_io_complete_nvme_status(bio, cpl); 7165 } 7166 7167 static void 7168 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7169 { 7170 struct nvme_bdev_io *bio = ref; 7171 7172 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7173 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7174 cpl->status.sct, cpl->status.sc); 7175 /* Run PI verification for compare data buffer if PI error is detected. */ 7176 bdev_nvme_verify_pi_error(bio); 7177 } 7178 7179 bdev_nvme_io_complete_nvme_status(bio, cpl); 7180 } 7181 7182 static void 7183 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7184 { 7185 struct nvme_bdev_io *bio = ref; 7186 7187 /* Compare operation completion */ 7188 if (!bio->first_fused_completed) { 7189 /* Save compare result for write callback */ 7190 bio->cpl = *cpl; 7191 bio->first_fused_completed = true; 7192 return; 7193 } 7194 7195 /* Write operation completion */ 7196 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7197 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7198 * complete the IO with the compare operation's status. 
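 * A successful write completion is unexpected here, since the controller should abort the second
 * command of a fused pair when the first one fails.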
7199 */ 7200 if (!spdk_nvme_cpl_is_error(cpl)) { 7201 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7202 } 7203 7204 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7205 } else { 7206 bdev_nvme_io_complete_nvme_status(bio, cpl); 7207 } 7208 } 7209 7210 static void 7211 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7212 { 7213 struct nvme_bdev_io *bio = ref; 7214 7215 bdev_nvme_io_complete_nvme_status(bio, cpl); 7216 } 7217 7218 static int 7219 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7220 { 7221 switch (desc->zt) { 7222 case SPDK_NVME_ZONE_TYPE_SEQWR: 7223 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7224 break; 7225 default: 7226 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7227 return -EIO; 7228 } 7229 7230 switch (desc->zs) { 7231 case SPDK_NVME_ZONE_STATE_EMPTY: 7232 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7233 break; 7234 case SPDK_NVME_ZONE_STATE_IOPEN: 7235 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7236 break; 7237 case SPDK_NVME_ZONE_STATE_EOPEN: 7238 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7239 break; 7240 case SPDK_NVME_ZONE_STATE_CLOSED: 7241 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7242 break; 7243 case SPDK_NVME_ZONE_STATE_RONLY: 7244 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7245 break; 7246 case SPDK_NVME_ZONE_STATE_FULL: 7247 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7248 break; 7249 case SPDK_NVME_ZONE_STATE_OFFLINE: 7250 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7251 break; 7252 default: 7253 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7254 return -EIO; 7255 } 7256 7257 info->zone_id = desc->zslba; 7258 info->write_pointer = desc->wp; 7259 info->capacity = desc->zcap; 7260 7261 return 0; 7262 } 7263 7264 static void 7265 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7266 { 7267 struct nvme_bdev_io *bio = ref; 7268 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7269 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7270 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7271 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7272 uint64_t max_zones_per_buf, i; 7273 uint32_t zone_report_bufsize; 7274 struct spdk_nvme_ns *ns; 7275 struct spdk_nvme_qpair *qpair; 7276 int ret; 7277 7278 if (spdk_nvme_cpl_is_error(cpl)) { 7279 goto out_complete_io_nvme_cpl; 7280 } 7281 7282 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7283 ret = -ENXIO; 7284 goto out_complete_io_ret; 7285 } 7286 7287 ns = bio->io_path->nvme_ns->ns; 7288 qpair = bio->io_path->qpair->qpair; 7289 7290 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7291 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7292 sizeof(bio->zone_report_buf->descs[0]); 7293 7294 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7295 ret = -EINVAL; 7296 goto out_complete_io_ret; 7297 } 7298 7299 if (!bio->zone_report_buf->nr_zones) { 7300 ret = -EINVAL; 7301 goto out_complete_io_ret; 7302 } 7303 7304 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7305 ret = fill_zone_from_report(&info[bio->handled_zones], 7306 &bio->zone_report_buf->descs[i]); 7307 if (ret) { 7308 goto out_complete_io_ret; 7309 } 7310 bio->handled_zones++; 7311 } 7312 7313 if (bio->handled_zones < zones_to_copy) { 7314 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7315 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7316 
7317 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7318 ret = spdk_nvme_zns_report_zones(ns, qpair, 7319 bio->zone_report_buf, zone_report_bufsize, 7320 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7321 bdev_nvme_get_zone_info_done, bio); 7322 if (!ret) { 7323 return; 7324 } else { 7325 goto out_complete_io_ret; 7326 } 7327 } 7328 7329 out_complete_io_nvme_cpl: 7330 free(bio->zone_report_buf); 7331 bio->zone_report_buf = NULL; 7332 bdev_nvme_io_complete_nvme_status(bio, cpl); 7333 return; 7334 7335 out_complete_io_ret: 7336 free(bio->zone_report_buf); 7337 bio->zone_report_buf = NULL; 7338 bdev_nvme_io_complete(bio, ret); 7339 } 7340 7341 static void 7342 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7343 { 7344 struct nvme_bdev_io *bio = ref; 7345 7346 bdev_nvme_io_complete_nvme_status(bio, cpl); 7347 } 7348 7349 static void 7350 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7351 { 7352 struct nvme_bdev_io *bio = ctx; 7353 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7354 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7355 7356 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7357 7358 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7359 } 7360 7361 static void 7362 bdev_nvme_abort_complete(void *ctx) 7363 { 7364 struct nvme_bdev_io *bio = ctx; 7365 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7366 7367 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7368 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7369 } else { 7370 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7371 } 7372 } 7373 7374 static void 7375 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7376 { 7377 struct nvme_bdev_io *bio = ref; 7378 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7379 7380 bio->cpl = *cpl; 7381 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7382 } 7383 7384 static void 7385 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7386 { 7387 struct nvme_bdev_io *bio = ref; 7388 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7389 7390 bio->cpl = *cpl; 7391 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7392 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7393 } 7394 7395 static void 7396 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7397 { 7398 struct nvme_bdev_io *bio = ref; 7399 struct iovec *iov; 7400 7401 bio->iov_offset = sgl_offset; 7402 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7403 iov = &bio->iovs[bio->iovpos]; 7404 if (bio->iov_offset < iov->iov_len) { 7405 break; 7406 } 7407 7408 bio->iov_offset -= iov->iov_len; 7409 } 7410 } 7411 7412 static int 7413 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7414 { 7415 struct nvme_bdev_io *bio = ref; 7416 struct iovec *iov; 7417 7418 assert(bio->iovpos < bio->iovcnt); 7419 7420 iov = &bio->iovs[bio->iovpos]; 7421 7422 *address = iov->iov_base; 7423 *length = iov->iov_len; 7424 7425 if (bio->iov_offset) { 7426 assert(bio->iov_offset <= iov->iov_len); 7427 *address += bio->iov_offset; 7428 *length -= bio->iov_offset; 7429 } 7430 7431 bio->iov_offset += *length; 7432 if (bio->iov_offset == iov->iov_len) { 7433 bio->iovpos++; 7434 bio->iov_offset = 0; 7435 } 7436 7437 return 0; 7438 } 7439 7440 static void 7441 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7442 { 7443 struct nvme_bdev_io *bio = ref; 7444 struct iovec *iov; 7445 7446 bio->fused_iov_offset = sgl_offset; 
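/* Find the iovec that contains sgl_offset and keep the residual offset within that iovec. */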
7447 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7448 iov = &bio->fused_iovs[bio->fused_iovpos]; 7449 if (bio->fused_iov_offset < iov->iov_len) { 7450 break; 7451 } 7452 7453 bio->fused_iov_offset -= iov->iov_len; 7454 } 7455 } 7456 7457 static int 7458 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7459 { 7460 struct nvme_bdev_io *bio = ref; 7461 struct iovec *iov; 7462 7463 assert(bio->fused_iovpos < bio->fused_iovcnt); 7464 7465 iov = &bio->fused_iovs[bio->fused_iovpos]; 7466 7467 *address = iov->iov_base; 7468 *length = iov->iov_len; 7469 7470 if (bio->fused_iov_offset) { 7471 assert(bio->fused_iov_offset <= iov->iov_len); 7472 *address += bio->fused_iov_offset; 7473 *length -= bio->fused_iov_offset; 7474 } 7475 7476 bio->fused_iov_offset += *length; 7477 if (bio->fused_iov_offset == iov->iov_len) { 7478 bio->fused_iovpos++; 7479 bio->fused_iov_offset = 0; 7480 } 7481 7482 return 0; 7483 } 7484 7485 static int 7486 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7487 void *md, uint64_t lba_count, uint64_t lba) 7488 { 7489 int rc; 7490 7491 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7492 lba_count, lba); 7493 7494 bio->iovs = iov; 7495 bio->iovcnt = iovcnt; 7496 bio->iovpos = 0; 7497 bio->iov_offset = 0; 7498 7499 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7500 bio->io_path->qpair->qpair, 7501 lba, lba_count, 7502 bdev_nvme_no_pi_readv_done, bio, 0, 7503 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7504 md, 0, 0); 7505 7506 if (rc != 0 && rc != -ENOMEM) { 7507 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7508 } 7509 return rc; 7510 } 7511 7512 static int 7513 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7514 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7515 struct spdk_memory_domain *domain, void *domain_ctx, 7516 struct spdk_accel_sequence *seq) 7517 { 7518 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7519 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7520 int rc; 7521 7522 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7523 lba_count, lba); 7524 7525 bio->iovs = iov; 7526 bio->iovcnt = iovcnt; 7527 bio->iovpos = 0; 7528 bio->iov_offset = 0; 7529 7530 if (domain != NULL || seq != NULL) { 7531 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7532 bio->ext_opts.memory_domain = domain; 7533 bio->ext_opts.memory_domain_ctx = domain_ctx; 7534 bio->ext_opts.io_flags = flags; 7535 bio->ext_opts.metadata = md; 7536 bio->ext_opts.accel_sequence = seq; 7537 7538 if (iovcnt == 1) { 7539 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7540 bio, &bio->ext_opts); 7541 } else { 7542 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7543 bdev_nvme_readv_done, bio, 7544 bdev_nvme_queued_reset_sgl, 7545 bdev_nvme_queued_next_sge, 7546 &bio->ext_opts); 7547 } 7548 } else if (iovcnt == 1) { 7549 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7550 md, lba, lba_count, bdev_nvme_readv_done, 7551 bio, flags, 0, 0); 7552 } else { 7553 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7554 bdev_nvme_readv_done, bio, flags, 7555 bdev_nvme_queued_reset_sgl, 7556 bdev_nvme_queued_next_sge, md, 0, 0); 7557 } 7558 7559 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7560 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7561 } 7562 
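/* -ENOMEM is an expected, retryable condition and is therefore not logged. */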
return rc; 7563 } 7564 7565 static int 7566 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7567 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7568 struct spdk_memory_domain *domain, void *domain_ctx, 7569 struct spdk_accel_sequence *seq) 7570 { 7571 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7572 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7573 int rc; 7574 7575 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7576 lba_count, lba); 7577 7578 bio->iovs = iov; 7579 bio->iovcnt = iovcnt; 7580 bio->iovpos = 0; 7581 bio->iov_offset = 0; 7582 7583 if (domain != NULL || seq != NULL) { 7584 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7585 bio->ext_opts.memory_domain = domain; 7586 bio->ext_opts.memory_domain_ctx = domain_ctx; 7587 bio->ext_opts.io_flags = flags; 7588 bio->ext_opts.metadata = md; 7589 bio->ext_opts.accel_sequence = seq; 7590 7591 if (iovcnt == 1) { 7592 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7593 bio, &bio->ext_opts); 7594 } else { 7595 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7596 bdev_nvme_writev_done, bio, 7597 bdev_nvme_queued_reset_sgl, 7598 bdev_nvme_queued_next_sge, 7599 &bio->ext_opts); 7600 } 7601 } else if (iovcnt == 1) { 7602 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7603 md, lba, lba_count, bdev_nvme_writev_done, 7604 bio, flags, 0, 0); 7605 } else { 7606 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7607 bdev_nvme_writev_done, bio, flags, 7608 bdev_nvme_queued_reset_sgl, 7609 bdev_nvme_queued_next_sge, md, 0, 0); 7610 } 7611 7612 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7613 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7614 } 7615 return rc; 7616 } 7617 7618 static int 7619 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7620 void *md, uint64_t lba_count, uint64_t zslba, 7621 uint32_t flags) 7622 { 7623 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7624 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7625 int rc; 7626 7627 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7628 lba_count, zslba); 7629 7630 bio->iovs = iov; 7631 bio->iovcnt = iovcnt; 7632 bio->iovpos = 0; 7633 bio->iov_offset = 0; 7634 7635 if (iovcnt == 1) { 7636 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7637 lba_count, 7638 bdev_nvme_zone_appendv_done, bio, 7639 flags, 7640 0, 0); 7641 } else { 7642 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7643 bdev_nvme_zone_appendv_done, bio, flags, 7644 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7645 md, 0, 0); 7646 } 7647 7648 if (rc != 0 && rc != -ENOMEM) { 7649 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7650 } 7651 return rc; 7652 } 7653 7654 static int 7655 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7656 void *md, uint64_t lba_count, uint64_t lba, 7657 uint32_t flags) 7658 { 7659 int rc; 7660 7661 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7662 lba_count, lba); 7663 7664 bio->iovs = iov; 7665 bio->iovcnt = iovcnt; 7666 bio->iovpos = 0; 7667 bio->iov_offset = 0; 7668 7669 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7670 bio->io_path->qpair->qpair, 7671 lba, lba_count, 7672 bdev_nvme_comparev_done, bio, flags, 7673 bdev_nvme_queued_reset_sgl, 
bdev_nvme_queued_next_sge, 7674 md, 0, 0); 7675 7676 if (rc != 0 && rc != -ENOMEM) { 7677 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7678 } 7679 return rc; 7680 } 7681 7682 static int 7683 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7684 struct iovec *write_iov, int write_iovcnt, 7685 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7686 { 7687 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7688 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7689 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7690 int rc; 7691 7692 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7693 lba_count, lba); 7694 7695 bio->iovs = cmp_iov; 7696 bio->iovcnt = cmp_iovcnt; 7697 bio->iovpos = 0; 7698 bio->iov_offset = 0; 7699 bio->fused_iovs = write_iov; 7700 bio->fused_iovcnt = write_iovcnt; 7701 bio->fused_iovpos = 0; 7702 bio->fused_iov_offset = 0; 7703 7704 if (bdev_io->num_retries == 0) { 7705 bio->first_fused_submitted = false; 7706 bio->first_fused_completed = false; 7707 } 7708 7709 if (!bio->first_fused_submitted) { 7710 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7711 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7712 7713 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7714 bdev_nvme_comparev_and_writev_done, bio, flags, 7715 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7716 if (rc == 0) { 7717 bio->first_fused_submitted = true; 7718 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7719 } else { 7720 if (rc != -ENOMEM) { 7721 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7722 } 7723 return rc; 7724 } 7725 } 7726 7727 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7728 7729 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7730 bdev_nvme_comparev_and_writev_done, bio, flags, 7731 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7732 if (rc != 0 && rc != -ENOMEM) { 7733 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7734 rc = 0; 7735 } 7736 7737 return rc; 7738 } 7739 7740 static int 7741 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7742 { 7743 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7744 struct spdk_nvme_dsm_range *range; 7745 uint64_t offset, remaining; 7746 uint64_t num_ranges_u64; 7747 uint16_t num_ranges; 7748 int rc; 7749 7750 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7751 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7752 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7753 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7754 return -EINVAL; 7755 } 7756 num_ranges = (uint16_t)num_ranges_u64; 7757 7758 offset = offset_blocks; 7759 remaining = num_blocks; 7760 range = &dsm_ranges[0]; 7761 7762 /* Fill max-size ranges until the remaining blocks fit into one range */ 7763 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7764 range->attributes.raw = 0; 7765 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7766 range->starting_lba = offset; 7767 7768 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7769 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7770 range++; 7771 } 7772 7773 /* Final range describes the remaining blocks */ 7774 range->attributes.raw = 0; 7775 range->length = remaining; 7776 range->starting_lba = offset; 7777 7778 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7779 
bio->io_path->qpair->qpair, 7780 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7781 dsm_ranges, num_ranges, 7782 bdev_nvme_queued_done, bio); 7783 7784 return rc; 7785 } 7786 7787 static int 7788 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7789 { 7790 if (num_blocks > UINT16_MAX + 1) { 7791 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7792 return -EINVAL; 7793 } 7794 7795 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7796 bio->io_path->qpair->qpair, 7797 offset_blocks, num_blocks, 7798 bdev_nvme_queued_done, bio, 7799 0); 7800 } 7801 7802 static int 7803 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7804 struct spdk_bdev_zone_info *info) 7805 { 7806 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7807 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7808 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7809 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7810 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7811 7812 if (zone_id % zone_size != 0) { 7813 return -EINVAL; 7814 } 7815 7816 if (num_zones > total_zones || !num_zones) { 7817 return -EINVAL; 7818 } 7819 7820 assert(!bio->zone_report_buf); 7821 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7822 if (!bio->zone_report_buf) { 7823 return -ENOMEM; 7824 } 7825 7826 bio->handled_zones = 0; 7827 7828 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7829 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7830 bdev_nvme_get_zone_info_done, bio); 7831 } 7832 7833 static int 7834 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 7835 enum spdk_bdev_zone_action action) 7836 { 7837 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7838 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7839 7840 switch (action) { 7841 case SPDK_BDEV_ZONE_CLOSE: 7842 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 7843 bdev_nvme_zone_management_done, bio); 7844 case SPDK_BDEV_ZONE_FINISH: 7845 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 7846 bdev_nvme_zone_management_done, bio); 7847 case SPDK_BDEV_ZONE_OPEN: 7848 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 7849 bdev_nvme_zone_management_done, bio); 7850 case SPDK_BDEV_ZONE_RESET: 7851 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 7852 bdev_nvme_zone_management_done, bio); 7853 case SPDK_BDEV_ZONE_OFFLINE: 7854 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 7855 bdev_nvme_zone_management_done, bio); 7856 default: 7857 return -EINVAL; 7858 } 7859 } 7860 7861 static void 7862 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7863 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 7864 { 7865 struct nvme_io_path *io_path; 7866 struct nvme_ctrlr *nvme_ctrlr; 7867 uint32_t max_xfer_size; 7868 int rc = -ENXIO; 7869 7870 /* Choose the first ctrlr which is not failed. */ 7871 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7872 nvme_ctrlr = io_path->qpair->ctrlr; 7873 7874 /* We should skip any unavailable nvme_ctrlr rather than checking 7875 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
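	 * If every controller is unavailable, rc keeps its initial -ENXIO and the request is
	 * failed below.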
7876 */ 7877 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 7878 continue; 7879 } 7880 7881 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 7882 7883 if (nbytes > max_xfer_size) { 7884 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7885 rc = -EINVAL; 7886 goto err; 7887 } 7888 7889 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 7890 bdev_nvme_admin_passthru_done, bio); 7891 if (rc == 0) { 7892 return; 7893 } 7894 } 7895 7896 err: 7897 bdev_nvme_admin_complete(bio, rc); 7898 } 7899 7900 static int 7901 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 7902 void *buf, size_t nbytes) 7903 { 7904 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7905 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7906 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7907 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7908 7909 if (nbytes > max_xfer_size) { 7910 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7911 return -EINVAL; 7912 } 7913 7914 /* 7915 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7916 * so fill it out automatically. 7917 */ 7918 cmd->nsid = spdk_nvme_ns_get_id(ns); 7919 7920 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 7921 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 7922 } 7923 7924 static int 7925 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 7926 void *buf, size_t nbytes, void *md_buf, size_t md_len) 7927 { 7928 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7929 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7930 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 7931 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7932 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7933 7934 if (nbytes > max_xfer_size) { 7935 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7936 return -EINVAL; 7937 } 7938 7939 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 7940 SPDK_ERRLOG("invalid meta data buffer size\n"); 7941 return -EINVAL; 7942 } 7943 7944 /* 7945 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 7946 * so fill it out automatically. 
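	 * Any nsid supplied by the caller is overwritten.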
7947 */ 7948 cmd->nsid = spdk_nvme_ns_get_id(ns); 7949 7950 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 7951 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 7952 } 7953 7954 static int 7955 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 7956 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 7957 size_t nbytes, void *md_buf, size_t md_len) 7958 { 7959 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7960 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7961 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 7962 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 7963 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 7964 7965 bio->iovs = iov; 7966 bio->iovcnt = iovcnt; 7967 bio->iovpos = 0; 7968 bio->iov_offset = 0; 7969 7970 if (nbytes > max_xfer_size) { 7971 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 7972 return -EINVAL; 7973 } 7974 7975 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 7976 SPDK_ERRLOG("invalid meta data buffer size\n"); 7977 return -EINVAL; 7978 } 7979 7980 /* 7981 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 7982 * require a nsid, so fill it out automatically. 7983 */ 7984 cmd->nsid = spdk_nvme_ns_get_id(ns); 7985 7986 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 7987 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 7988 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 7989 } 7990 7991 static void 7992 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7993 struct nvme_bdev_io *bio_to_abort) 7994 { 7995 struct nvme_io_path *io_path; 7996 int rc = 0; 7997 7998 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 7999 if (rc == 0) { 8000 bdev_nvme_admin_complete(bio, 0); 8001 return; 8002 } 8003 8004 io_path = bio_to_abort->io_path; 8005 if (io_path != NULL) { 8006 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8007 io_path->qpair->qpair, 8008 bio_to_abort, 8009 bdev_nvme_abort_done, bio); 8010 } else { 8011 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8012 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8013 NULL, 8014 bio_to_abort, 8015 bdev_nvme_abort_done, bio); 8016 8017 if (rc != -ENOENT) { 8018 break; 8019 } 8020 } 8021 } 8022 8023 if (rc != 0) { 8024 /* If no command was found or there was any error, complete the abort 8025 * request with failure. 
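	 * On success, the abort is completed from bdev_nvme_abort_done() instead.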
8026 */ 8027 bdev_nvme_admin_complete(bio, rc); 8028 } 8029 } 8030 8031 static int 8032 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8033 uint64_t num_blocks) 8034 { 8035 struct spdk_nvme_scc_source_range range = { 8036 .slba = src_offset_blocks, 8037 .nlb = num_blocks - 1 8038 }; 8039 8040 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8041 bio->io_path->qpair->qpair, 8042 &range, 1, dst_offset_blocks, 8043 bdev_nvme_queued_done, bio); 8044 } 8045 8046 static void 8047 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8048 { 8049 const char *action; 8050 8051 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8052 action = "reset"; 8053 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8054 action = "abort"; 8055 } else { 8056 action = "none"; 8057 } 8058 8059 spdk_json_write_object_begin(w); 8060 8061 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8062 8063 spdk_json_write_named_object_begin(w, "params"); 8064 spdk_json_write_named_string(w, "action_on_timeout", action); 8065 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8066 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8067 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8068 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8069 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8070 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8071 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8072 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8073 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8074 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8075 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8076 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8077 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8078 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8079 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8080 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8081 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8082 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8083 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8084 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8085 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8086 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8087 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8088 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8089 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8090 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8091 spdk_json_write_object_end(w); 8092 8093 spdk_json_write_object_end(w); 8094 } 8095 8096 static void 8097 bdev_nvme_discovery_config_json(struct 
spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8098 { 8099 struct spdk_nvme_transport_id trid; 8100 8101 spdk_json_write_object_begin(w); 8102 8103 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8104 8105 spdk_json_write_named_object_begin(w, "params"); 8106 spdk_json_write_named_string(w, "name", ctx->name); 8107 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8108 8109 trid = ctx->trid; 8110 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8111 nvme_bdev_dump_trid_json(&trid, w); 8112 8113 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8114 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8115 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8116 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8117 ctx->bdev_opts.fast_io_fail_timeout_sec); 8118 spdk_json_write_object_end(w); 8119 8120 spdk_json_write_object_end(w); 8121 } 8122 8123 #ifdef SPDK_CONFIG_NVME_CUSE 8124 static void 8125 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8126 struct nvme_ctrlr *nvme_ctrlr) 8127 { 8128 size_t cuse_name_size = 128; 8129 char cuse_name[cuse_name_size]; 8130 8131 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8132 cuse_name, &cuse_name_size) != 0) { 8133 return; 8134 } 8135 8136 spdk_json_write_object_begin(w); 8137 8138 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8139 8140 spdk_json_write_named_object_begin(w, "params"); 8141 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8142 spdk_json_write_object_end(w); 8143 8144 spdk_json_write_object_end(w); 8145 } 8146 #endif 8147 8148 static void 8149 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8150 struct nvme_ctrlr *nvme_ctrlr) 8151 { 8152 struct spdk_nvme_transport_id *trid; 8153 const struct spdk_nvme_ctrlr_opts *opts; 8154 8155 if (nvme_ctrlr->opts.from_discovery_service) { 8156 /* Do not emit an RPC for this - it will be implicitly 8157 * covered by a separate bdev_nvme_start_discovery or 8158 * bdev_nvme_start_mdns_discovery RPC. 
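	 * (See bdev_nvme_discovery_config_json() and bdev_nvme_mdns_discovery_config_json().)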
8159 */ 8160 return; 8161 } 8162 8163 trid = &nvme_ctrlr->active_path_id->trid; 8164 8165 spdk_json_write_object_begin(w); 8166 8167 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8168 8169 spdk_json_write_named_object_begin(w, "params"); 8170 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8171 nvme_bdev_dump_trid_json(trid, w); 8172 spdk_json_write_named_bool(w, "prchk_reftag", 8173 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8174 spdk_json_write_named_bool(w, "prchk_guard", 8175 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8176 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8177 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8178 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8179 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8180 if (nvme_ctrlr->opts.psk_path[0] != '\0') { 8181 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk_path); 8182 } 8183 8184 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8185 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8186 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8187 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8188 if (opts->src_addr[0] != '\0') { 8189 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8190 } 8191 if (opts->src_svcid[0] != '\0') { 8192 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8193 } 8194 8195 spdk_json_write_object_end(w); 8196 8197 spdk_json_write_object_end(w); 8198 } 8199 8200 static void 8201 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8202 { 8203 spdk_json_write_object_begin(w); 8204 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8205 8206 spdk_json_write_named_object_begin(w, "params"); 8207 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8208 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8209 spdk_json_write_object_end(w); 8210 8211 spdk_json_write_object_end(w); 8212 } 8213 8214 static int 8215 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8216 { 8217 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8218 struct nvme_ctrlr *nvme_ctrlr; 8219 struct discovery_ctx *ctx; 8220 8221 bdev_nvme_opts_config_json(w); 8222 8223 pthread_mutex_lock(&g_bdev_nvme_mutex); 8224 8225 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8226 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8227 nvme_ctrlr_config_json(w, nvme_ctrlr); 8228 8229 #ifdef SPDK_CONFIG_NVME_CUSE 8230 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8231 #endif 8232 } 8233 } 8234 8235 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8236 if (!ctx->from_mdns_discovery_service) { 8237 bdev_nvme_discovery_config_json(w, ctx); 8238 } 8239 } 8240 8241 bdev_nvme_mdns_discovery_config_json(w); 8242 8243 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8244 * before enabling hotplug poller. 
8245 */ 8246 bdev_nvme_hotplug_config_json(w); 8247 8248 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8249 return 0; 8250 } 8251 8252 struct spdk_nvme_ctrlr * 8253 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8254 { 8255 struct nvme_bdev *nbdev; 8256 struct nvme_ns *nvme_ns; 8257 8258 if (!bdev || bdev->module != &nvme_if) { 8259 return NULL; 8260 } 8261 8262 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8263 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8264 assert(nvme_ns != NULL); 8265 8266 return nvme_ns->ctrlr->ctrlr; 8267 } 8268 8269 void 8270 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8271 { 8272 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8273 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8274 const struct spdk_nvme_ctrlr_data *cdata; 8275 const struct spdk_nvme_transport_id *trid; 8276 const char *adrfam_str; 8277 8278 spdk_json_write_object_begin(w); 8279 8280 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8281 8282 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8283 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8284 8285 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8286 spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL && 8287 io_path == io_path->nbdev_ch->current_io_path); 8288 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8289 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8290 8291 spdk_json_write_named_object_begin(w, "transport"); 8292 spdk_json_write_named_string(w, "trtype", trid->trstring); 8293 spdk_json_write_named_string(w, "traddr", trid->traddr); 8294 if (trid->trsvcid[0] != '\0') { 8295 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8296 } 8297 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8298 if (adrfam_str) { 8299 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8300 } 8301 spdk_json_write_object_end(w); 8302 8303 spdk_json_write_object_end(w); 8304 } 8305 8306 void 8307 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8308 { 8309 struct discovery_ctx *ctx; 8310 struct discovery_entry_ctx *entry_ctx; 8311 8312 spdk_json_write_array_begin(w); 8313 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8314 spdk_json_write_object_begin(w); 8315 spdk_json_write_named_string(w, "name", ctx->name); 8316 8317 spdk_json_write_named_object_begin(w, "trid"); 8318 nvme_bdev_dump_trid_json(&ctx->trid, w); 8319 spdk_json_write_object_end(w); 8320 8321 spdk_json_write_named_array_begin(w, "referrals"); 8322 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8323 spdk_json_write_object_begin(w); 8324 spdk_json_write_named_object_begin(w, "trid"); 8325 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8326 spdk_json_write_object_end(w); 8327 spdk_json_write_object_end(w); 8328 } 8329 spdk_json_write_array_end(w); 8330 8331 spdk_json_write_object_end(w); 8332 } 8333 spdk_json_write_array_end(w); 8334 } 8335 8336 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8337 8338 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8339 { 8340 struct spdk_trace_tpoint_opts opts[] = { 8341 { 8342 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8343 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 8344 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8345 }, 8346 { 8347 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8348 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 8349 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8350 } 8351 }; 8352 8353 8354 
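	/* Register the bdev_nvme I/O trace object and relate the NVMe transport-level
	 * submit/complete tracepoints to it so they can be correlated in trace dumps.
	 */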
	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}