/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time.
	 */
	uint64_t submit_tsc;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static
int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
		uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
		enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		struct iovec *iov, int iovcnt, size_t nbytes,
		void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
		struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

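	/* Look up the nvme_ctrlr whose Identify Controller data reports the given controller ID (CNTLID). */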
	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
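	/* Detach this nvme_ctrlr from its parent nbdev_ctrlr. If it was the last controller,
	 * also remove the nbdev_ctrlr from the global list and free it.
	 */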
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);

	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

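	/* Invalidate any cached I/O path so that the next submission re-evaluates the updated path list. */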
	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_qpair->ctrlr->ctrlr) !=
	    SPDK_NVME_QPAIR_FAILURE_NONE) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list.
 */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) &&
				!io_path->nvme_ns->ana_state_updating)) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting.
			 */
			continue;
		}

		if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head.
	 */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io_to_abort;

	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			__bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;

		if (any_io_path_may_become_available(nbdev_ch)) {
			bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
			return;
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter
			      *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p failed to connect. Aborting the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr, false);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr, false);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ?
	       SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS;
	struct spdk_bdev_io *bdev_io;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, status, NULL);
	}

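	/* All pending resets on this channel are completed; resume the channel iteration. */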
	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid to the active trid within a controller if one exists.
 *
 * The purpose of the boolean return value is to request the caller to disconnect
 * the current trid now to try connecting the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. A trid is considered failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() has just been called or the next trid has not failed
		 * or been used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds.
	 */
	return false;
}

static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
	OP_FAILOVER,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;

static _bdev_nvme_op_after_reset
bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes.
*/ 1942 return OP_COMPLETE_PENDING_DESTRUCT; 1943 } else if (nvme_ctrlr->pending_failover) { 1944 nvme_ctrlr->pending_failover = false; 1945 nvme_ctrlr->reset_start_tsc = 0; 1946 return OP_FAILOVER; 1947 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1948 nvme_ctrlr->reset_start_tsc = 0; 1949 return OP_NONE; 1950 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1951 return OP_DESTRUCT; 1952 } else { 1953 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1954 nvme_ctrlr->fast_io_fail_timedout = true; 1955 } 1956 return OP_DELAYED_RECONNECT; 1957 } 1958 } 1959 1960 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1961 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1962 1963 static int 1964 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1965 { 1966 struct nvme_ctrlr *nvme_ctrlr = ctx; 1967 1968 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1969 pthread_mutex_lock(&nvme_ctrlr->mutex); 1970 1971 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1972 1973 if (!nvme_ctrlr->reconnect_is_delayed) { 1974 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1975 return SPDK_POLLER_BUSY; 1976 } 1977 1978 nvme_ctrlr->reconnect_is_delayed = false; 1979 1980 if (nvme_ctrlr->destruct) { 1981 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1982 return SPDK_POLLER_BUSY; 1983 } 1984 1985 assert(nvme_ctrlr->resetting == false); 1986 nvme_ctrlr->resetting = true; 1987 1988 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1989 1990 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1991 1992 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 1993 return SPDK_POLLER_BUSY; 1994 } 1995 1996 static void 1997 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 1998 { 1999 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2000 2001 assert(nvme_ctrlr->reconnect_is_delayed == false); 2002 nvme_ctrlr->reconnect_is_delayed = true; 2003 2004 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2005 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2006 nvme_ctrlr, 2007 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2008 } 2009 2010 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2011 2012 static void 2013 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2014 { 2015 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2016 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2017 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2018 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2019 enum bdev_nvme_op_after_reset op_after_reset; 2020 2021 assert(nvme_ctrlr->thread == spdk_get_thread()); 2022 2023 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2024 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2025 2026 if (!success) { 2027 SPDK_ERRLOG("Resetting controller failed.\n"); 2028 } else { 2029 SPDK_NOTICELOG("Resetting controller successful.\n"); 2030 } 2031 2032 pthread_mutex_lock(&nvme_ctrlr->mutex); 2033 nvme_ctrlr->resetting = false; 2034 nvme_ctrlr->dont_retry = false; 2035 nvme_ctrlr->in_failover = false; 2036 2037 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2038 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2039 2040 if (ctrlr_op_cb_fn) { 2041 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2042 } 2043 2044 switch (op_after_reset) { 2045 case OP_COMPLETE_PENDING_DESTRUCT: 2046 nvme_ctrlr_unregister(nvme_ctrlr); 2047 break; 2048 case OP_DESTRUCT: 2049 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2050 remove_discovery_entry(nvme_ctrlr); 2051 break; 2052 case OP_DELAYED_RECONNECT: 2053 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2054 break; 2055 case OP_FAILOVER: 2056 bdev_nvme_failover_ctrlr(nvme_ctrlr, false); 2057 break; 2058 default: 2059 break; 2060 } 2061 } 2062 2063 static void 2064 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2065 { 2066 pthread_mutex_lock(&nvme_ctrlr->mutex); 2067 if (!success) { 2068 /* Connecting the active trid failed. Set the next alternate trid to the 2069 * active trid if it exists. 2070 */ 2071 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2072 /* The next alternate trid exists and is ready to try. Try it now. */ 2073 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2074 2075 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2076 return; 2077 } 2078 2079 /* We came here if there is no alternate trid or if the next trid exists but 2080 * is not ready to try. We will try the active trid after reconnect_delay_sec 2081 * seconds if it is non-zero or at the next reset call otherwise. 2082 */ 2083 } else { 2084 /* Connecting the active trid succeeded. Clear the last failed time because it 2085 * means the trid is failed if its last failed time is non-zero. 2086 */ 2087 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2088 } 2089 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2090 2091 /* Make sure we clear any pending resets before returning. */ 2092 spdk_for_each_channel(nvme_ctrlr, 2093 bdev_nvme_complete_pending_resets, 2094 success ? NULL : (void *)0x1, 2095 _bdev_nvme_reset_ctrlr_complete); 2096 } 2097 2098 static void 2099 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2100 { 2101 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2102 2103 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2104 } 2105 2106 static void 2107 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2108 { 2109 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2110 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2111 struct nvme_qpair *nvme_qpair; 2112 2113 nvme_qpair = ctrlr_ch->qpair; 2114 assert(nvme_qpair != NULL); 2115 2116 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2117 2118 if (nvme_qpair->qpair != NULL) { 2119 if (nvme_qpair->ctrlr->dont_retry) { 2120 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2121 } 2122 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2123 2124 /* The current full reset sequence will move to the next 2125 * ctrlr_channel after the qpair is actually disconnected. 2126 */ 2127 assert(ctrlr_ch->reset_iter == NULL); 2128 ctrlr_ch->reset_iter = i; 2129 } else { 2130 spdk_for_each_channel_continue(i, 0); 2131 } 2132 } 2133 2134 static void 2135 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2136 { 2137 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2138 2139 if (status == 0) { 2140 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2141 } else { 2142 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
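 * bdev_nvme_reset_create_qpairs_failed() then fails the reset sequence as a whole.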
*/ 2143 spdk_for_each_channel(nvme_ctrlr, 2144 bdev_nvme_reset_destroy_qpair, 2145 NULL, 2146 bdev_nvme_reset_create_qpairs_failed); 2147 } 2148 } 2149 2150 static int 2151 bdev_nvme_reset_check_qpair_connected(void *ctx) 2152 { 2153 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2154 2155 if (ctrlr_ch->reset_iter == NULL) { 2156 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2157 assert(ctrlr_ch->connect_poller == NULL); 2158 assert(ctrlr_ch->qpair->qpair == NULL); 2159 return SPDK_POLLER_BUSY; 2160 } 2161 2162 assert(ctrlr_ch->qpair->qpair != NULL); 2163 2164 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2165 return SPDK_POLLER_BUSY; 2166 } 2167 2168 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2169 2170 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2171 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2172 ctrlr_ch->reset_iter = NULL; 2173 2174 return SPDK_POLLER_BUSY; 2175 } 2176 2177 static void 2178 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2179 { 2180 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2181 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2182 int rc; 2183 2184 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2185 if (rc == 0) { 2186 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2187 ctrlr_ch, 0); 2188 2189 /* The current full reset sequence will move to the next 2190 * ctrlr_channel after the qpair is actually connected. 2191 */ 2192 assert(ctrlr_ch->reset_iter == NULL); 2193 ctrlr_ch->reset_iter = i; 2194 } else { 2195 spdk_for_each_channel_continue(i, rc); 2196 } 2197 } 2198 2199 static int 2200 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2201 { 2202 struct nvme_ctrlr *nvme_ctrlr = arg; 2203 int rc = -ETIMEDOUT; 2204 2205 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2206 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2207 if (rc == -EAGAIN) { 2208 return SPDK_POLLER_BUSY; 2209 } 2210 } 2211 2212 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2213 if (rc == 0) { 2214 /* Recreate all of the I/O queue pairs */ 2215 spdk_for_each_channel(nvme_ctrlr, 2216 bdev_nvme_reset_create_qpair, 2217 NULL, 2218 bdev_nvme_reset_create_qpairs_done); 2219 } else { 2220 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2221 } 2222 return SPDK_POLLER_BUSY; 2223 } 2224 2225 static void 2226 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2227 { 2228 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2229 2230 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2231 assert(nvme_ctrlr->reset_detach_poller == NULL); 2232 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2233 nvme_ctrlr, 0); 2234 } 2235 2236 static void 2237 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2238 { 2239 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2240 2241 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2242 assert(status == 0); 2243 2244 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2245 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2246 } else { 2247 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2248 } 2249 } 2250 2251 static void 2252 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2253 { 2254 spdk_for_each_channel(nvme_ctrlr, 2255 bdev_nvme_reset_destroy_qpair, 2256 NULL, 2257 bdev_nvme_reset_destroy_qpair_done); 
2258 } 2259 2260 static void 2261 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2262 { 2263 struct nvme_ctrlr *nvme_ctrlr = ctx; 2264 2265 assert(nvme_ctrlr->resetting == true); 2266 assert(nvme_ctrlr->thread == spdk_get_thread()); 2267 2268 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2269 2270 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2271 2272 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2273 } 2274 2275 static void 2276 _bdev_nvme_reset_ctrlr(void *ctx) 2277 { 2278 struct nvme_ctrlr *nvme_ctrlr = ctx; 2279 2280 assert(nvme_ctrlr->resetting == true); 2281 assert(nvme_ctrlr->thread == spdk_get_thread()); 2282 2283 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2284 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2285 } else { 2286 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2287 } 2288 } 2289 2290 static int 2291 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2292 { 2293 spdk_msg_fn msg_fn; 2294 2295 pthread_mutex_lock(&nvme_ctrlr->mutex); 2296 if (nvme_ctrlr->destruct) { 2297 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2298 return -ENXIO; 2299 } 2300 2301 if (nvme_ctrlr->resetting) { 2302 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2303 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2304 return -EBUSY; 2305 } 2306 2307 if (nvme_ctrlr->disabled) { 2308 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2309 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2310 return -EALREADY; 2311 } 2312 2313 nvme_ctrlr->resetting = true; 2314 nvme_ctrlr->dont_retry = true; 2315 2316 if (nvme_ctrlr->reconnect_is_delayed) { 2317 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2318 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2319 nvme_ctrlr->reconnect_is_delayed = false; 2320 } else { 2321 msg_fn = _bdev_nvme_reset_ctrlr; 2322 assert(nvme_ctrlr->reset_start_tsc == 0); 2323 } 2324 2325 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2326 2327 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2328 2329 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2330 return 0; 2331 } 2332 2333 static int 2334 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2335 { 2336 pthread_mutex_lock(&nvme_ctrlr->mutex); 2337 if (nvme_ctrlr->destruct) { 2338 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2339 return -ENXIO; 2340 } 2341 2342 if (nvme_ctrlr->resetting) { 2343 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2344 return -EBUSY; 2345 } 2346 2347 if (!nvme_ctrlr->disabled) { 2348 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2349 return -EALREADY; 2350 } 2351 2352 nvme_ctrlr->disabled = false; 2353 nvme_ctrlr->resetting = true; 2354 2355 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2356 2357 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2358 2359 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2360 return 0; 2361 } 2362 2363 static void 2364 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2365 { 2366 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2367 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2368 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2369 enum bdev_nvme_op_after_reset op_after_disable; 2370 2371 assert(nvme_ctrlr->thread == spdk_get_thread()); 2372 2373 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2374 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2375 2376 pthread_mutex_lock(&nvme_ctrlr->mutex); 2377 2378 nvme_ctrlr->resetting = false; 2379 nvme_ctrlr->dont_retry = false; 2380 2381 op_after_disable = 
bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2382 2383 nvme_ctrlr->disabled = true; 2384 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2385 2386 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2387 2388 if (ctrlr_op_cb_fn) { 2389 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2390 } 2391 2392 switch (op_after_disable) { 2393 case OP_COMPLETE_PENDING_DESTRUCT: 2394 nvme_ctrlr_unregister(nvme_ctrlr); 2395 break; 2396 default: 2397 break; 2398 } 2399 2400 } 2401 2402 static void 2403 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2404 { 2405 /* Make sure we clear any pending resets before returning. */ 2406 spdk_for_each_channel(nvme_ctrlr, 2407 bdev_nvme_complete_pending_resets, 2408 NULL, 2409 _bdev_nvme_disable_ctrlr_complete); 2410 } 2411 2412 static void 2413 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2414 { 2415 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2416 2417 assert(status == 0); 2418 2419 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2420 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2421 } else { 2422 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2423 } 2424 } 2425 2426 static void 2427 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2428 { 2429 spdk_for_each_channel(nvme_ctrlr, 2430 bdev_nvme_reset_destroy_qpair, 2431 NULL, 2432 bdev_nvme_disable_destroy_qpairs_done); 2433 } 2434 2435 static void 2436 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2437 { 2438 struct nvme_ctrlr *nvme_ctrlr = ctx; 2439 2440 assert(nvme_ctrlr->resetting == true); 2441 assert(nvme_ctrlr->thread == spdk_get_thread()); 2442 2443 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2444 2445 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2446 } 2447 2448 static void 2449 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2450 { 2451 struct nvme_ctrlr *nvme_ctrlr = ctx; 2452 2453 assert(nvme_ctrlr->resetting == true); 2454 assert(nvme_ctrlr->thread == spdk_get_thread()); 2455 2456 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2457 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2458 } else { 2459 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2460 } 2461 } 2462 2463 static int 2464 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2465 { 2466 spdk_msg_fn msg_fn; 2467 2468 pthread_mutex_lock(&nvme_ctrlr->mutex); 2469 if (nvme_ctrlr->destruct) { 2470 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2471 return -ENXIO; 2472 } 2473 2474 if (nvme_ctrlr->resetting) { 2475 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2476 return -EBUSY; 2477 } 2478 2479 if (nvme_ctrlr->disabled) { 2480 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2481 return -EALREADY; 2482 } 2483 2484 nvme_ctrlr->resetting = true; 2485 nvme_ctrlr->dont_retry = true; 2486 2487 if (nvme_ctrlr->reconnect_is_delayed) { 2488 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2489 nvme_ctrlr->reconnect_is_delayed = false; 2490 } else { 2491 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2492 } 2493 2494 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2495 2496 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2497 2498 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2499 return 0; 2500 } 2501 2502 static int 2503 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2504 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2505 { 2506 int rc; 2507 2508 switch (op) { 2509 case NVME_CTRLR_OP_RESET: 2510 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2511 break; 2512 case 
NVME_CTRLR_OP_ENABLE: 2513 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2514 break; 2515 case NVME_CTRLR_OP_DISABLE: 2516 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2517 break; 2518 default: 2519 rc = -EINVAL; 2520 break; 2521 } 2522 2523 if (rc == 0) { 2524 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2525 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2526 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2527 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2528 } 2529 return rc; 2530 } 2531 2532 struct nvme_ctrlr_op_rpc_ctx { 2533 struct nvme_ctrlr *nvme_ctrlr; 2534 struct spdk_thread *orig_thread; 2535 enum nvme_ctrlr_op op; 2536 int rc; 2537 bdev_nvme_ctrlr_op_cb cb_fn; 2538 void *cb_arg; 2539 }; 2540 2541 static void 2542 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2543 { 2544 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2545 2546 assert(ctx != NULL); 2547 assert(ctx->cb_fn != NULL); 2548 2549 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2550 2551 free(ctx); 2552 } 2553 2554 static void 2555 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2556 { 2557 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2558 2559 ctx->rc = rc; 2560 2561 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2562 } 2563 2564 void 2565 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2566 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2567 { 2568 struct nvme_ctrlr_op_rpc_ctx *ctx; 2569 int rc; 2570 2571 assert(cb_fn != NULL); 2572 2573 ctx = calloc(1, sizeof(*ctx)); 2574 if (ctx == NULL) { 2575 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2576 cb_fn(cb_arg, -ENOMEM); 2577 return; 2578 } 2579 2580 ctx->orig_thread = spdk_get_thread(); 2581 ctx->cb_fn = cb_fn; 2582 ctx->cb_arg = cb_arg; 2583 2584 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2585 if (rc == 0) { 2586 return; 2587 } else if (rc == -EALREADY) { 2588 rc = 0; 2589 } 2590 2591 nvme_ctrlr_op_rpc_complete(ctx, rc); 2592 } 2593 2594 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2595 2596 static void 2597 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2598 { 2599 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2600 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2601 int rc; 2602 2603 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2604 ctx->nvme_ctrlr = NULL; 2605 2606 if (ctx->rc != 0) { 2607 goto complete; 2608 } 2609 2610 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2611 if (next_nvme_ctrlr == NULL) { 2612 goto complete; 2613 } 2614 2615 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2616 if (rc == 0) { 2617 ctx->nvme_ctrlr = next_nvme_ctrlr; 2618 return; 2619 } else if (rc == -EALREADY) { 2620 ctx->nvme_ctrlr = next_nvme_ctrlr; 2621 rc = 0; 2622 } 2623 2624 ctx->rc = rc; 2625 2626 complete: 2627 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2628 free(ctx); 2629 } 2630 2631 static void 2632 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2633 { 2634 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2635 2636 ctx->rc = rc; 2637 2638 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2639 } 2640 2641 void 2642 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2643 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2644 { 2645 struct nvme_ctrlr_op_rpc_ctx *ctx; 2646 struct nvme_ctrlr *nvme_ctrlr; 2647 int rc; 2648 2649 assert(cb_fn != NULL); 2650 2651 ctx = calloc(1, sizeof(*ctx)); 2652 if (ctx == NULL) { 2653 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2654 cb_fn(cb_arg, -ENOMEM); 2655 return; 2656 } 2657 2658 
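/* Remember the RPC caller's thread. Controller operations complete on each controller's
 * own thread, and nvme_bdev_ctrlr_op_rpc_continue() relays every result back to this
 * thread via spdk_thread_send_msg().
 */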
ctx->orig_thread = spdk_get_thread(); 2659 ctx->op = op; 2660 ctx->cb_fn = cb_fn; 2661 ctx->cb_arg = cb_arg; 2662 2663 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2664 assert(nvme_ctrlr != NULL); 2665 2666 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2667 if (rc == 0) { 2668 ctx->nvme_ctrlr = nvme_ctrlr; 2669 return; 2670 } else if (rc == -EALREADY) { 2671 ctx->nvme_ctrlr = nvme_ctrlr; 2672 rc = 0; 2673 } 2674 2675 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2676 } 2677 2678 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2679 2680 static void 2681 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2682 { 2683 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2684 enum spdk_bdev_io_status io_status; 2685 2686 if (bio->cpl.cdw0 == 0) { 2687 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2688 } else { 2689 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2690 } 2691 2692 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2693 } 2694 2695 static void 2696 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2697 { 2698 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2699 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2700 2701 bdev_nvme_abort_retry_ios(nbdev_ch); 2702 2703 spdk_for_each_channel_continue(i, 0); 2704 } 2705 2706 static void 2707 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2708 { 2709 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2710 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2711 2712 /* Abort all queued I/Os for retry. */ 2713 spdk_for_each_channel(nbdev, 2714 bdev_nvme_abort_bdev_channel, 2715 bio, 2716 _bdev_nvme_reset_io_complete); 2717 } 2718 2719 static void 2720 _bdev_nvme_reset_io_continue(void *ctx) 2721 { 2722 struct nvme_bdev_io *bio = ctx; 2723 struct nvme_io_path *prev_io_path, *next_io_path; 2724 int rc; 2725 2726 prev_io_path = bio->io_path; 2727 bio->io_path = NULL; 2728 2729 if (bio->cpl.cdw0 != 0) { 2730 goto complete; 2731 } 2732 2733 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2734 if (next_io_path == NULL) { 2735 goto complete; 2736 } 2737 2738 rc = _bdev_nvme_reset_io(next_io_path, bio); 2739 if (rc == 0) { 2740 return; 2741 } 2742 2743 bio->cpl.cdw0 = 1; 2744 2745 complete: 2746 bdev_nvme_reset_io_complete(bio); 2747 } 2748 2749 static void 2750 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2751 { 2752 struct nvme_bdev_io *bio = cb_arg; 2753 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2754 2755 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2756 2757 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2758 } 2759 2760 static int 2761 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2762 { 2763 struct nvme_ctrlr_channel *ctrlr_ch; 2764 struct spdk_bdev_io *bdev_io; 2765 int rc; 2766 2767 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2768 bdev_nvme_reset_io_continue, bio); 2769 if (rc == 0) { 2770 assert(bio->io_path == NULL); 2771 bio->io_path = io_path; 2772 } else if (rc == -EBUSY) { 2773 ctrlr_ch = io_path->qpair->ctrlr_ch; 2774 assert(ctrlr_ch != NULL); 2775 /* 2776 * Reset call is queued only if it is from the app framework. This is on purpose so that 2777 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2778 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
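 * Queued resets are drained by bdev_nvme_complete_pending_resets() once the in-progress reset finishes.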
2779 */ 2780 bdev_io = spdk_bdev_io_from_ctx(bio); 2781 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2782 rc = 0; 2783 } 2784 2785 return rc; 2786 } 2787 2788 static void 2789 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2790 { 2791 struct nvme_io_path *io_path; 2792 int rc; 2793 2794 bio->cpl.cdw0 = 0; 2795 2796 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2797 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2798 assert(io_path != NULL); 2799 2800 rc = _bdev_nvme_reset_io(io_path, bio); 2801 if (rc != 0) { 2802 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2803 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2804 } 2805 } 2806 2807 static int 2808 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2809 { 2810 if (nvme_ctrlr->destruct) { 2811 /* Don't bother resetting if the controller is in the process of being destructed. */ 2812 return -ENXIO; 2813 } 2814 2815 if (nvme_ctrlr->resetting) { 2816 if (!nvme_ctrlr->in_failover) { 2817 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2818 2819 /* Defer failover until reset completes. */ 2820 nvme_ctrlr->pending_failover = true; 2821 return -EINPROGRESS; 2822 } else { 2823 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2824 return -EBUSY; 2825 } 2826 } 2827 2828 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2829 2830 if (nvme_ctrlr->reconnect_is_delayed) { 2831 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2832 2833 /* We rely on the next reconnect for the failover. */ 2834 return -EALREADY; 2835 } 2836 2837 if (nvme_ctrlr->disabled) { 2838 SPDK_NOTICELOG("Controller is disabled.\n"); 2839 2840 /* We rely on the enablement for the failover. 
*/ 2841 return -EALREADY; 2842 } 2843 2844 nvme_ctrlr->resetting = true; 2845 nvme_ctrlr->in_failover = true; 2846 2847 assert(nvme_ctrlr->reset_start_tsc == 0); 2848 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2849 2850 return 0; 2851 } 2852 2853 static int 2854 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2855 { 2856 int rc; 2857 2858 pthread_mutex_lock(&nvme_ctrlr->mutex); 2859 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, remove); 2860 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2861 2862 if (rc == 0) { 2863 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2864 } else if (rc == -EALREADY) { 2865 rc = 0; 2866 } 2867 2868 return rc; 2869 } 2870 2871 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2872 uint64_t num_blocks); 2873 2874 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2875 uint64_t num_blocks); 2876 2877 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2878 uint64_t src_offset_blocks, 2879 uint64_t num_blocks); 2880 2881 static void 2882 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2883 bool success) 2884 { 2885 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2886 struct spdk_bdev *bdev = bdev_io->bdev; 2887 int ret; 2888 2889 if (!success) { 2890 ret = -EINVAL; 2891 goto exit; 2892 } 2893 2894 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2895 ret = -ENXIO; 2896 goto exit; 2897 } 2898 2899 ret = bdev_nvme_readv(bio, 2900 bdev_io->u.bdev.iovs, 2901 bdev_io->u.bdev.iovcnt, 2902 bdev_io->u.bdev.md_buf, 2903 bdev_io->u.bdev.num_blocks, 2904 bdev_io->u.bdev.offset_blocks, 2905 bdev->dif_check_flags, 2906 bdev_io->u.bdev.memory_domain, 2907 bdev_io->u.bdev.memory_domain_ctx, 2908 bdev_io->u.bdev.accel_sequence); 2909 2910 exit: 2911 if (spdk_unlikely(ret != 0)) { 2912 bdev_nvme_io_complete(bio, ret); 2913 } 2914 } 2915 2916 static inline void 2917 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2918 { 2919 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2920 struct spdk_bdev *bdev = bdev_io->bdev; 2921 struct nvme_bdev_io *nbdev_io_to_abort; 2922 int rc = 0; 2923 2924 switch (bdev_io->type) { 2925 case SPDK_BDEV_IO_TYPE_READ: 2926 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2927 rc = bdev_nvme_readv(nbdev_io, 2928 bdev_io->u.bdev.iovs, 2929 bdev_io->u.bdev.iovcnt, 2930 bdev_io->u.bdev.md_buf, 2931 bdev_io->u.bdev.num_blocks, 2932 bdev_io->u.bdev.offset_blocks, 2933 bdev->dif_check_flags, 2934 bdev_io->u.bdev.memory_domain, 2935 bdev_io->u.bdev.memory_domain_ctx, 2936 bdev_io->u.bdev.accel_sequence); 2937 } else { 2938 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2939 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2940 rc = 0; 2941 } 2942 break; 2943 case SPDK_BDEV_IO_TYPE_WRITE: 2944 rc = bdev_nvme_writev(nbdev_io, 2945 bdev_io->u.bdev.iovs, 2946 bdev_io->u.bdev.iovcnt, 2947 bdev_io->u.bdev.md_buf, 2948 bdev_io->u.bdev.num_blocks, 2949 bdev_io->u.bdev.offset_blocks, 2950 bdev->dif_check_flags, 2951 bdev_io->u.bdev.memory_domain, 2952 bdev_io->u.bdev.memory_domain_ctx, 2953 bdev_io->u.bdev.accel_sequence); 2954 break; 2955 case SPDK_BDEV_IO_TYPE_COMPARE: 2956 rc = bdev_nvme_comparev(nbdev_io, 2957 bdev_io->u.bdev.iovs, 2958 bdev_io->u.bdev.iovcnt, 2959 bdev_io->u.bdev.md_buf, 2960 bdev_io->u.bdev.num_blocks, 2961 bdev_io->u.bdev.offset_blocks, 2962 
bdev->dif_check_flags); 2963 break; 2964 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2965 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2966 bdev_io->u.bdev.iovs, 2967 bdev_io->u.bdev.iovcnt, 2968 bdev_io->u.bdev.fused_iovs, 2969 bdev_io->u.bdev.fused_iovcnt, 2970 bdev_io->u.bdev.md_buf, 2971 bdev_io->u.bdev.num_blocks, 2972 bdev_io->u.bdev.offset_blocks, 2973 bdev->dif_check_flags); 2974 break; 2975 case SPDK_BDEV_IO_TYPE_UNMAP: 2976 rc = bdev_nvme_unmap(nbdev_io, 2977 bdev_io->u.bdev.offset_blocks, 2978 bdev_io->u.bdev.num_blocks); 2979 break; 2980 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2981 rc = bdev_nvme_write_zeroes(nbdev_io, 2982 bdev_io->u.bdev.offset_blocks, 2983 bdev_io->u.bdev.num_blocks); 2984 break; 2985 case SPDK_BDEV_IO_TYPE_RESET: 2986 nbdev_io->io_path = NULL; 2987 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 2988 return; 2989 2990 case SPDK_BDEV_IO_TYPE_FLUSH: 2991 bdev_nvme_io_complete(nbdev_io, 0); 2992 return; 2993 2994 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 2995 rc = bdev_nvme_zone_appendv(nbdev_io, 2996 bdev_io->u.bdev.iovs, 2997 bdev_io->u.bdev.iovcnt, 2998 bdev_io->u.bdev.md_buf, 2999 bdev_io->u.bdev.num_blocks, 3000 bdev_io->u.bdev.offset_blocks, 3001 bdev->dif_check_flags); 3002 break; 3003 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3004 rc = bdev_nvme_get_zone_info(nbdev_io, 3005 bdev_io->u.zone_mgmt.zone_id, 3006 bdev_io->u.zone_mgmt.num_zones, 3007 bdev_io->u.zone_mgmt.buf); 3008 break; 3009 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3010 rc = bdev_nvme_zone_management(nbdev_io, 3011 bdev_io->u.zone_mgmt.zone_id, 3012 bdev_io->u.zone_mgmt.zone_action); 3013 break; 3014 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3015 nbdev_io->io_path = NULL; 3016 bdev_nvme_admin_passthru(nbdev_ch, 3017 nbdev_io, 3018 &bdev_io->u.nvme_passthru.cmd, 3019 bdev_io->u.nvme_passthru.buf, 3020 bdev_io->u.nvme_passthru.nbytes); 3021 return; 3022 3023 case SPDK_BDEV_IO_TYPE_NVME_IO: 3024 rc = bdev_nvme_io_passthru(nbdev_io, 3025 &bdev_io->u.nvme_passthru.cmd, 3026 bdev_io->u.nvme_passthru.buf, 3027 bdev_io->u.nvme_passthru.nbytes); 3028 break; 3029 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3030 rc = bdev_nvme_io_passthru_md(nbdev_io, 3031 &bdev_io->u.nvme_passthru.cmd, 3032 bdev_io->u.nvme_passthru.buf, 3033 bdev_io->u.nvme_passthru.nbytes, 3034 bdev_io->u.nvme_passthru.md_buf, 3035 bdev_io->u.nvme_passthru.md_len); 3036 break; 3037 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3038 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3039 &bdev_io->u.nvme_passthru.cmd, 3040 bdev_io->u.nvme_passthru.iovs, 3041 bdev_io->u.nvme_passthru.iovcnt, 3042 bdev_io->u.nvme_passthru.nbytes, 3043 bdev_io->u.nvme_passthru.md_buf, 3044 bdev_io->u.nvme_passthru.md_len); 3045 break; 3046 case SPDK_BDEV_IO_TYPE_ABORT: 3047 nbdev_io->io_path = NULL; 3048 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3049 bdev_nvme_abort(nbdev_ch, 3050 nbdev_io, 3051 nbdev_io_to_abort); 3052 return; 3053 3054 case SPDK_BDEV_IO_TYPE_COPY: 3055 rc = bdev_nvme_copy(nbdev_io, 3056 bdev_io->u.bdev.offset_blocks, 3057 bdev_io->u.bdev.copy.src_offset_blocks, 3058 bdev_io->u.bdev.num_blocks); 3059 break; 3060 default: 3061 rc = -EINVAL; 3062 break; 3063 } 3064 3065 if (spdk_unlikely(rc != 0)) { 3066 bdev_nvme_io_complete(nbdev_io, rc); 3067 } 3068 } 3069 3070 static void 3071 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3072 { 3073 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3074 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3075 3076 if 
(spdk_likely(nbdev_io->submit_tsc == 0)) { 3077 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3078 } else { 3079 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3080 * We need to update submit_tsc here. 3081 */ 3082 nbdev_io->submit_tsc = spdk_get_ticks(); 3083 } 3084 3085 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3086 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3087 if (spdk_unlikely(!nbdev_io->io_path)) { 3088 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3089 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3090 return; 3091 } 3092 3093 /* Admin commands do not use the optimal I/O path. 3094 * Simply fall through even if it is not found. 3095 */ 3096 } 3097 3098 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3099 } 3100 3101 static bool 3102 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3103 { 3104 struct nvme_bdev *nbdev = ctx; 3105 struct nvme_ns *nvme_ns; 3106 struct spdk_nvme_ns *ns; 3107 struct spdk_nvme_ctrlr *ctrlr; 3108 const struct spdk_nvme_ctrlr_data *cdata; 3109 3110 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3111 assert(nvme_ns != NULL); 3112 ns = nvme_ns->ns; 3113 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3114 3115 switch (io_type) { 3116 case SPDK_BDEV_IO_TYPE_READ: 3117 case SPDK_BDEV_IO_TYPE_WRITE: 3118 case SPDK_BDEV_IO_TYPE_RESET: 3119 case SPDK_BDEV_IO_TYPE_FLUSH: 3120 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3121 case SPDK_BDEV_IO_TYPE_NVME_IO: 3122 case SPDK_BDEV_IO_TYPE_ABORT: 3123 return true; 3124 3125 case SPDK_BDEV_IO_TYPE_COMPARE: 3126 return spdk_nvme_ns_supports_compare(ns); 3127 3128 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3129 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3130 3131 case SPDK_BDEV_IO_TYPE_UNMAP: 3132 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3133 return cdata->oncs.dsm; 3134 3135 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3136 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3137 return cdata->oncs.write_zeroes; 3138 3139 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3140 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3141 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3142 return true; 3143 } 3144 return false; 3145 3146 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3147 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3148 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3149 3150 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3151 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3152 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3153 3154 case SPDK_BDEV_IO_TYPE_COPY: 3155 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3156 return cdata->oncs.copy; 3157 3158 default: 3159 return false; 3160 } 3161 } 3162 3163 static int 3164 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3165 { 3166 struct nvme_qpair *nvme_qpair; 3167 struct spdk_io_channel *pg_ch; 3168 int rc; 3169 3170 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3171 if (!nvme_qpair) { 3172 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3173 return -1; 3174 } 3175 3176 TAILQ_INIT(&nvme_qpair->io_path_list); 3177 3178 nvme_qpair->ctrlr = nvme_ctrlr; 3179 nvme_qpair->ctrlr_ch = ctrlr_ch; 3180 3181 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3182 if (!pg_ch) { 3183 free(nvme_qpair); 3184 return -1; 3185 } 3186 3187 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3188 3189 #ifdef SPDK_CONFIG_VTUNE 3190 nvme_qpair->group->collect_spin_stat = true; 3191 #else 3192 nvme_qpair->group->collect_spin_stat = false; 3193 #endif 3194 3195 if (!nvme_ctrlr->disabled) { 3196 /* If 
a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3197 * be created when it's enabled. 3198 */ 3199 rc = bdev_nvme_create_qpair(nvme_qpair); 3200 if (rc != 0) { 3201 /* nvme_ctrlr can't create IO qpair if connection is down. 3202 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3203 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3204 * submitted IO will be queued until IO qpair is successfully created. 3205 * 3206 * Hence, if both are satisfied, ignore the failure. 3207 */ 3208 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3209 spdk_put_io_channel(pg_ch); 3210 free(nvme_qpair); 3211 return rc; 3212 } 3213 } 3214 } 3215 3216 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3217 3218 ctrlr_ch->qpair = nvme_qpair; 3219 3220 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3221 nvme_qpair->ctrlr->ref++; 3222 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3223 3224 return 0; 3225 } 3226 3227 static int 3228 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3229 { 3230 struct nvme_ctrlr *nvme_ctrlr = io_device; 3231 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3232 3233 TAILQ_INIT(&ctrlr_ch->pending_resets); 3234 3235 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3236 } 3237 3238 static void 3239 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3240 { 3241 struct nvme_io_path *io_path, *next; 3242 3243 assert(nvme_qpair->group != NULL); 3244 3245 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3246 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3247 nvme_io_path_free(io_path); 3248 } 3249 3250 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3251 3252 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3253 3254 nvme_ctrlr_release(nvme_qpair->ctrlr); 3255 3256 free(nvme_qpair); 3257 } 3258 3259 static void 3260 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3261 { 3262 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3263 struct nvme_qpair *nvme_qpair; 3264 3265 nvme_qpair = ctrlr_ch->qpair; 3266 assert(nvme_qpair != NULL); 3267 3268 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3269 3270 if (nvme_qpair->qpair != NULL) { 3271 if (ctrlr_ch->reset_iter == NULL) { 3272 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3273 } else { 3274 /* Skip current ctrlr_channel in a full reset sequence because 3275 * it is being deleted now. The qpair is already being disconnected. 3276 * We do not have to restart disconnecting it. 3277 */ 3278 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3279 } 3280 3281 /* We cannot release a reference to the poll group now. 3282 * The qpair may be disconnected asynchronously later. 3283 * We need to poll it until it is actually disconnected. 3284 * Just detach the qpair from the deleting ctrlr_channel. 
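 * (Presumably the poll group reaps the disconnected qpair later and frees this
 * nvme_qpair via nvme_qpair_delete().)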
3285 */ 3286 nvme_qpair->ctrlr_ch = NULL; 3287 } else { 3288 assert(ctrlr_ch->reset_iter == NULL); 3289 3290 nvme_qpair_delete(nvme_qpair); 3291 } 3292 } 3293 3294 static inline struct spdk_io_channel * 3295 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3296 { 3297 if (spdk_unlikely(!group->accel_channel)) { 3298 group->accel_channel = spdk_accel_get_io_channel(); 3299 if (!group->accel_channel) { 3300 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3301 group); 3302 return NULL; 3303 } 3304 } 3305 3306 return group->accel_channel; 3307 } 3308 3309 static void 3310 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3311 uint32_t iov_cnt, uint32_t seed, 3312 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3313 { 3314 struct spdk_io_channel *accel_ch; 3315 struct nvme_poll_group *group = ctx; 3316 int rc; 3317 3318 assert(cb_fn != NULL); 3319 3320 accel_ch = bdev_nvme_get_accel_channel(group); 3321 if (spdk_unlikely(accel_ch == NULL)) { 3322 cb_fn(cb_arg, -ENOMEM); 3323 return; 3324 } 3325 3326 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3327 if (rc) { 3328 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3329 if (rc == -ENOMEM || rc == -EINVAL) { 3330 cb_fn(cb_arg, rc); 3331 } 3332 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3333 } 3334 } 3335 3336 static void 3337 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3338 { 3339 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3340 } 3341 3342 static void 3343 bdev_nvme_abort_sequence(void *seq) 3344 { 3345 spdk_accel_sequence_abort(seq); 3346 } 3347 3348 static void 3349 bdev_nvme_reverse_sequence(void *seq) 3350 { 3351 spdk_accel_sequence_reverse(seq); 3352 } 3353 3354 static int 3355 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3356 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3357 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3358 { 3359 struct spdk_io_channel *ch; 3360 struct nvme_poll_group *group = ctx; 3361 3362 ch = bdev_nvme_get_accel_channel(group); 3363 if (spdk_unlikely(ch == NULL)) { 3364 return -ENOMEM; 3365 } 3366 3367 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3368 domain, domain_ctx, seed, cb_fn, cb_arg); 3369 } 3370 3371 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3372 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3373 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3374 .append_crc32c = bdev_nvme_append_crc32c, 3375 .finish_sequence = bdev_nvme_finish_sequence, 3376 .reverse_sequence = bdev_nvme_reverse_sequence, 3377 .abort_sequence = bdev_nvme_abort_sequence, 3378 }; 3379 3380 static int 3381 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3382 { 3383 struct nvme_poll_group *group = ctx_buf; 3384 3385 TAILQ_INIT(&group->qpair_list); 3386 3387 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3388 if (group->group == NULL) { 3389 return -1; 3390 } 3391 3392 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3393 3394 if (group->poller == NULL) { 3395 spdk_nvme_poll_group_destroy(group->group); 3396 return -1; 3397 } 3398 3399 return 0; 3400 } 3401 3402 static void 3403 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3404 { 3405 struct 
nvme_poll_group *group = ctx_buf; 3406 3407 assert(TAILQ_EMPTY(&group->qpair_list)); 3408 3409 if (group->accel_channel) { 3410 spdk_put_io_channel(group->accel_channel); 3411 } 3412 3413 spdk_poller_unregister(&group->poller); 3414 if (spdk_nvme_poll_group_destroy(group->group)) { 3415 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3416 assert(false); 3417 } 3418 } 3419 3420 static struct spdk_io_channel * 3421 bdev_nvme_get_io_channel(void *ctx) 3422 { 3423 struct nvme_bdev *nvme_bdev = ctx; 3424 3425 return spdk_get_io_channel(nvme_bdev); 3426 } 3427 3428 static void * 3429 bdev_nvme_get_module_ctx(void *ctx) 3430 { 3431 struct nvme_bdev *nvme_bdev = ctx; 3432 struct nvme_ns *nvme_ns; 3433 3434 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3435 return NULL; 3436 } 3437 3438 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3439 if (!nvme_ns) { 3440 return NULL; 3441 } 3442 3443 return nvme_ns->ns; 3444 } 3445 3446 static const char * 3447 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3448 { 3449 switch (ana_state) { 3450 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3451 return "optimized"; 3452 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3453 return "non_optimized"; 3454 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3455 return "inaccessible"; 3456 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3457 return "persistent_loss"; 3458 case SPDK_NVME_ANA_CHANGE_STATE: 3459 return "change"; 3460 default: 3461 return NULL; 3462 } 3463 } 3464 3465 static int 3466 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3467 { 3468 struct spdk_memory_domain **_domains = NULL; 3469 struct nvme_bdev *nbdev = ctx; 3470 struct nvme_ns *nvme_ns; 3471 int i = 0, _array_size = array_size; 3472 int rc = 0; 3473 3474 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3475 if (domains && array_size >= i) { 3476 _domains = &domains[i]; 3477 } else { 3478 _domains = NULL; 3479 } 3480 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3481 if (rc > 0) { 3482 i += rc; 3483 if (_array_size >= rc) { 3484 _array_size -= rc; 3485 } else { 3486 _array_size = 0; 3487 } 3488 } else if (rc < 0) { 3489 return rc; 3490 } 3491 } 3492 3493 return i; 3494 } 3495 3496 static const char * 3497 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3498 { 3499 if (nvme_ctrlr->destruct) { 3500 return "deleting"; 3501 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3502 return "failed"; 3503 } else if (nvme_ctrlr->resetting) { 3504 return "resetting"; 3505 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3506 return "reconnect_is_delayed"; 3507 } else if (nvme_ctrlr->disabled) { 3508 return "disabled"; 3509 } else { 3510 return "enabled"; 3511 } 3512 } 3513 3514 void 3515 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3516 { 3517 struct spdk_nvme_transport_id *trid; 3518 const struct spdk_nvme_ctrlr_opts *opts; 3519 const struct spdk_nvme_ctrlr_data *cdata; 3520 struct nvme_path_id *path_id; 3521 3522 spdk_json_write_object_begin(w); 3523 3524 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3525 3526 #ifdef SPDK_CONFIG_NVME_CUSE 3527 size_t cuse_name_size = 128; 3528 char cuse_name[cuse_name_size]; 3529 3530 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3531 if (rc == 0) { 3532 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3533 } 3534 #endif 3535 trid = &nvme_ctrlr->active_path_id->trid; 3536 
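/* Report the active path's transport ID first; any remaining path_ids follow below
 * as "alternate_trids".
 */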
spdk_json_write_named_object_begin(w, "trid"); 3537 nvme_bdev_dump_trid_json(trid, w); 3538 spdk_json_write_object_end(w); 3539 3540 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3541 if (path_id != NULL) { 3542 spdk_json_write_named_array_begin(w, "alternate_trids"); 3543 do { 3544 trid = &path_id->trid; 3545 spdk_json_write_object_begin(w); 3546 nvme_bdev_dump_trid_json(trid, w); 3547 spdk_json_write_object_end(w); 3548 3549 path_id = TAILQ_NEXT(path_id, link); 3550 } while (path_id != NULL); 3551 spdk_json_write_array_end(w); 3552 } 3553 3554 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3555 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3556 3557 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3558 spdk_json_write_named_object_begin(w, "host"); 3559 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3560 spdk_json_write_named_string(w, "addr", opts->src_addr); 3561 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3562 spdk_json_write_object_end(w); 3563 3564 spdk_json_write_object_end(w); 3565 } 3566 3567 static void 3568 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3569 struct nvme_ns *nvme_ns) 3570 { 3571 struct spdk_nvme_ns *ns; 3572 struct spdk_nvme_ctrlr *ctrlr; 3573 const struct spdk_nvme_ctrlr_data *cdata; 3574 const struct spdk_nvme_transport_id *trid; 3575 union spdk_nvme_vs_register vs; 3576 const struct spdk_nvme_ns_data *nsdata; 3577 char buf[128]; 3578 3579 ns = nvme_ns->ns; 3580 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3581 3582 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3583 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3584 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3585 3586 spdk_json_write_object_begin(w); 3587 3588 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3589 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3590 } 3591 3592 spdk_json_write_named_object_begin(w, "trid"); 3593 3594 nvme_bdev_dump_trid_json(trid, w); 3595 3596 spdk_json_write_object_end(w); 3597 3598 #ifdef SPDK_CONFIG_NVME_CUSE 3599 size_t cuse_name_size = 128; 3600 char cuse_name[cuse_name_size]; 3601 3602 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3603 cuse_name, &cuse_name_size); 3604 if (rc == 0) { 3605 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3606 } 3607 #endif 3608 3609 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3610 3611 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3612 3613 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3614 3615 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3616 spdk_str_trim(buf); 3617 spdk_json_write_named_string(w, "model_number", buf); 3618 3619 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3620 spdk_str_trim(buf); 3621 spdk_json_write_named_string(w, "serial_number", buf); 3622 3623 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3624 spdk_str_trim(buf); 3625 spdk_json_write_named_string(w, "firmware_revision", buf); 3626 3627 if (cdata->subnqn[0] != '\0') { 3628 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3629 } 3630 3631 spdk_json_write_named_object_begin(w, "oacs"); 3632 3633 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3634 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3635 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3636 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3637 3638 spdk_json_write_object_end(w); 3639 3640 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
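/* ana_reporting also controls whether "ana_state" is emitted in "ns_data" below. */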
3641 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3642 3643 spdk_json_write_object_end(w); 3644 3645 spdk_json_write_named_object_begin(w, "vs"); 3646 3647 spdk_json_write_name(w, "nvme_version"); 3648 if (vs.bits.ter) { 3649 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3650 } else { 3651 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3652 } 3653 3654 spdk_json_write_object_end(w); 3655 3656 nsdata = spdk_nvme_ns_get_data(ns); 3657 3658 spdk_json_write_named_object_begin(w, "ns_data"); 3659 3660 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3661 3662 if (cdata->cmic.ana_reporting) { 3663 spdk_json_write_named_string(w, "ana_state", 3664 _nvme_ana_state_str(nvme_ns->ana_state)); 3665 } 3666 3667 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3668 3669 spdk_json_write_object_end(w); 3670 3671 if (cdata->oacs.security) { 3672 spdk_json_write_named_object_begin(w, "security"); 3673 3674 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3675 3676 spdk_json_write_object_end(w); 3677 } 3678 3679 spdk_json_write_object_end(w); 3680 } 3681 3682 static const char * 3683 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3684 { 3685 switch (nbdev->mp_policy) { 3686 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3687 return "active_passive"; 3688 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3689 return "active_active"; 3690 default: 3691 assert(false); 3692 return "invalid"; 3693 } 3694 } 3695 3696 static int 3697 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3698 { 3699 struct nvme_bdev *nvme_bdev = ctx; 3700 struct nvme_ns *nvme_ns; 3701 3702 pthread_mutex_lock(&nvme_bdev->mutex); 3703 spdk_json_write_named_array_begin(w, "nvme"); 3704 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3705 nvme_namespace_info_json(w, nvme_ns); 3706 } 3707 spdk_json_write_array_end(w); 3708 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3709 pthread_mutex_unlock(&nvme_bdev->mutex); 3710 3711 return 0; 3712 } 3713 3714 static void 3715 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3716 { 3717 /* No config per bdev needed */ 3718 } 3719 3720 static uint64_t 3721 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3722 { 3723 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3724 struct nvme_io_path *io_path; 3725 struct nvme_poll_group *group; 3726 uint64_t spin_time = 0; 3727 3728 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3729 group = io_path->qpair->group; 3730 3731 if (!group || !group->collect_spin_stat) { 3732 continue; 3733 } 3734 3735 if (group->end_ticks != 0) { 3736 group->spin_ticks += (group->end_ticks - group->start_ticks); 3737 group->end_ticks = 0; 3738 } 3739 3740 spin_time += group->spin_ticks; 3741 group->start_ticks = 0; 3742 group->spin_ticks = 0; 3743 } 3744 3745 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3746 } 3747 3748 static void 3749 bdev_nvme_reset_device_stat(void *ctx) 3750 { 3751 struct nvme_bdev *nbdev = ctx; 3752 3753 if (nbdev->err_stat != NULL) { 3754 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3755 } 3756 } 3757 3758 /* JSON string should be lowercases and underscore delimited string. 
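 * e.g. a status string such as "DATA TRANSFER ERROR" would be emitted as "data_transfer_error".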
*/ 3759 static void 3760 bdev_nvme_format_nvme_status(char *dst, const char *src) 3761 { 3762 char tmp[256]; 3763 3764 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3765 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3766 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3767 spdk_strlwr(dst); 3768 } 3769 3770 static void 3771 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3772 { 3773 struct nvme_bdev *nbdev = ctx; 3774 struct spdk_nvme_status status = {}; 3775 uint16_t sct, sc; 3776 char status_json[256]; 3777 const char *status_str; 3778 3779 if (nbdev->err_stat == NULL) { 3780 return; 3781 } 3782 3783 spdk_json_write_named_object_begin(w, "nvme_error"); 3784 3785 spdk_json_write_named_object_begin(w, "status_type"); 3786 for (sct = 0; sct < 8; sct++) { 3787 if (nbdev->err_stat->status_type[sct] == 0) { 3788 continue; 3789 } 3790 status.sct = sct; 3791 3792 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3793 assert(status_str != NULL); 3794 bdev_nvme_format_nvme_status(status_json, status_str); 3795 3796 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3797 } 3798 spdk_json_write_object_end(w); 3799 3800 spdk_json_write_named_object_begin(w, "status_code"); 3801 for (sct = 0; sct < 4; sct++) { 3802 status.sct = sct; 3803 for (sc = 0; sc < 256; sc++) { 3804 if (nbdev->err_stat->status[sct][sc] == 0) { 3805 continue; 3806 } 3807 status.sc = sc; 3808 3809 status_str = spdk_nvme_cpl_get_status_string(&status); 3810 assert(status_str != NULL); 3811 bdev_nvme_format_nvme_status(status_json, status_str); 3812 3813 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3814 } 3815 } 3816 spdk_json_write_object_end(w); 3817 3818 spdk_json_write_object_end(w); 3819 } 3820 3821 static bool 3822 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3823 { 3824 struct nvme_bdev *nbdev = ctx; 3825 struct spdk_nvme_ctrlr *ctrlr; 3826 3827 if (!g_opts.allow_accel_sequence) { 3828 return false; 3829 } 3830 3831 switch (type) { 3832 case SPDK_BDEV_IO_TYPE_WRITE: 3833 case SPDK_BDEV_IO_TYPE_READ: 3834 break; 3835 default: 3836 return false; 3837 } 3838 3839 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3840 assert(ctrlr != NULL); 3841 3842 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3843 } 3844 3845 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3846 .destruct = bdev_nvme_destruct, 3847 .submit_request = bdev_nvme_submit_request, 3848 .io_type_supported = bdev_nvme_io_type_supported, 3849 .get_io_channel = bdev_nvme_get_io_channel, 3850 .dump_info_json = bdev_nvme_dump_info_json, 3851 .write_config_json = bdev_nvme_write_config_json, 3852 .get_spin_time = bdev_nvme_get_spin_time, 3853 .get_module_ctx = bdev_nvme_get_module_ctx, 3854 .get_memory_domains = bdev_nvme_get_memory_domains, 3855 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3856 .reset_device_stat = bdev_nvme_reset_device_stat, 3857 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3858 }; 3859 3860 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3861 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3862 3863 static int 3864 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3865 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3866 { 3867 struct spdk_nvme_ana_group_descriptor *copied_desc; 3868 uint8_t *orig_desc; 3869 uint32_t i, desc_size, copy_len; 3870 int rc = 0; 3871 3872 if (nvme_ctrlr->ana_log_page == NULL) { 3873 return 
-EINVAL; 3874 } 3875 3876 copied_desc = nvme_ctrlr->copied_ana_desc; 3877 3878 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3879 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3880 3881 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3882 memcpy(copied_desc, orig_desc, copy_len); 3883 3884 rc = cb_fn(copied_desc, cb_arg); 3885 if (rc != 0) { 3886 break; 3887 } 3888 3889 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3890 copied_desc->num_of_nsid * sizeof(uint32_t); 3891 orig_desc += desc_size; 3892 copy_len -= desc_size; 3893 } 3894 3895 return rc; 3896 } 3897 3898 static int 3899 nvme_ns_ana_transition_timedout(void *ctx) 3900 { 3901 struct nvme_ns *nvme_ns = ctx; 3902 3903 spdk_poller_unregister(&nvme_ns->anatt_timer); 3904 nvme_ns->ana_transition_timedout = true; 3905 3906 return SPDK_POLLER_BUSY; 3907 } 3908 3909 static void 3910 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3911 const struct spdk_nvme_ana_group_descriptor *desc) 3912 { 3913 const struct spdk_nvme_ctrlr_data *cdata; 3914 3915 nvme_ns->ana_group_id = desc->ana_group_id; 3916 nvme_ns->ana_state = desc->ana_state; 3917 nvme_ns->ana_state_updating = false; 3918 3919 switch (nvme_ns->ana_state) { 3920 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3921 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3922 nvme_ns->ana_transition_timedout = false; 3923 spdk_poller_unregister(&nvme_ns->anatt_timer); 3924 break; 3925 3926 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3927 case SPDK_NVME_ANA_CHANGE_STATE: 3928 if (nvme_ns->anatt_timer != NULL) { 3929 break; 3930 } 3931 3932 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3933 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3934 nvme_ns, 3935 cdata->anatt * SPDK_SEC_TO_USEC); 3936 break; 3937 default: 3938 break; 3939 } 3940 } 3941 3942 static int 3943 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3944 { 3945 struct nvme_ns *nvme_ns = cb_arg; 3946 uint32_t i; 3947 3948 for (i = 0; i < desc->num_of_nsid; i++) { 3949 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3950 continue; 3951 } 3952 3953 _nvme_ns_set_ana_state(nvme_ns, desc); 3954 return 1; 3955 } 3956 3957 return 0; 3958 } 3959 3960 static struct spdk_uuid 3961 nvme_generate_uuid(const char *sn, uint32_t nsid) 3962 { 3963 struct spdk_uuid new_uuid, namespace_uuid; 3964 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3965 /* This namespace UUID was generated using uuid_generate() method. 
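 * It is used only as the fixed namespace input to spdk_uuid_generate_sha1(), so the
 * resulting UUID is a deterministic function of the controller serial number and the
 * namespace ID passed in merged_str.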
*/ 3966 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3967 int size; 3968 3969 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3970 3971 spdk_uuid_set_null(&new_uuid); 3972 spdk_uuid_set_null(&namespace_uuid); 3973 3974 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3975 assert(size > 0 && (unsigned long)size < sizeof(merged_str)); 3976 3977 spdk_uuid_parse(&namespace_uuid, namespace_str); 3978 3979 spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3980 3981 return new_uuid; 3982 } 3983 3984 static int 3985 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 3986 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 3987 uint32_t prchk_flags, void *ctx) 3988 { 3989 const struct spdk_uuid *uuid; 3990 const uint8_t *nguid; 3991 const struct spdk_nvme_ctrlr_data *cdata; 3992 const struct spdk_nvme_ns_data *nsdata; 3993 const struct spdk_nvme_ctrlr_opts *opts; 3994 enum spdk_nvme_csi csi; 3995 uint32_t atomic_bs, phys_bs, bs; 3996 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 3997 3998 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3999 csi = spdk_nvme_ns_get_csi(ns); 4000 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4001 4002 switch (csi) { 4003 case SPDK_NVME_CSI_NVM: 4004 disk->product_name = "NVMe disk"; 4005 break; 4006 case SPDK_NVME_CSI_ZNS: 4007 disk->product_name = "NVMe ZNS disk"; 4008 disk->zoned = true; 4009 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4010 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4011 spdk_nvme_ns_get_extended_sector_size(ns); 4012 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4013 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4014 break; 4015 default: 4016 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4017 return -ENOTSUP; 4018 } 4019 4020 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4021 if (!disk->name) { 4022 return -ENOMEM; 4023 } 4024 4025 disk->write_cache = 0; 4026 if (cdata->vwc.present) { 4027 /* Enable if the Volatile Write Cache exists */ 4028 disk->write_cache = 1; 4029 } 4030 if (cdata->oncs.write_zeroes) { 4031 disk->max_write_zeroes = UINT16_MAX + 1; 4032 } 4033 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4034 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4035 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4036 /* NVMe driver will split one request into multiple requests 4037 * based on MDTS and stripe boundary, the bdev layer will use 4038 * max_segment_size and max_num_segments to split one big IO 4039 * into multiple requests, then small request can't run out 4040 * of NVMe internal requests data structure. 
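 * Reserving only half of io_queue_requests for bdev-layer segments is intended to leave
 * the remaining request objects available for the splitting done inside the NVMe driver.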
4041 */ 4042 if (opts && opts->io_queue_requests) { 4043 disk->max_num_segments = opts->io_queue_requests / 2; 4044 } 4045 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4046 4047 nguid = spdk_nvme_ns_get_nguid(ns); 4048 if (!nguid) { 4049 uuid = spdk_nvme_ns_get_uuid(ns); 4050 if (uuid) { 4051 disk->uuid = *uuid; 4052 } else if (g_opts.generate_uuids) { 4053 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4054 disk->uuid = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns)); 4055 } 4056 } else { 4057 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4058 } 4059 4060 nsdata = spdk_nvme_ns_get_data(ns); 4061 bs = spdk_nvme_ns_get_sector_size(ns); 4062 atomic_bs = bs; 4063 phys_bs = bs; 4064 if (nsdata->nabo == 0) { 4065 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4066 atomic_bs = bs * (1 + nsdata->nawupf); 4067 } else { 4068 atomic_bs = bs * (1 + cdata->awupf); 4069 } 4070 } 4071 if (nsdata->nsfeat.optperf) { 4072 phys_bs = bs * (1 + nsdata->npwg); 4073 } 4074 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4075 4076 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4077 if (disk->md_len != 0) { 4078 disk->md_interleave = nsdata->flbas.extended; 4079 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4080 if (disk->dif_type != SPDK_DIF_DISABLE) { 4081 disk->dif_is_head_of_md = nsdata->dps.md_start; 4082 disk->dif_check_flags = prchk_flags; 4083 } 4084 } 4085 4086 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4087 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4088 disk->acwu = 0; 4089 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4090 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4091 } else { 4092 disk->acwu = cdata->acwu + 1; /* 0-based */ 4093 } 4094 4095 if (cdata->oncs.copy) { 4096 /* For now bdev interface allows only single segment copy */ 4097 disk->max_copy = nsdata->mssrl; 4098 } 4099 4100 disk->ctxt = ctx; 4101 disk->fn_table = &nvmelib_fn_table; 4102 disk->module = &nvme_if; 4103 4104 return 0; 4105 } 4106 4107 static struct nvme_bdev * 4108 nvme_bdev_alloc(void) 4109 { 4110 struct nvme_bdev *bdev; 4111 int rc; 4112 4113 bdev = calloc(1, sizeof(*bdev)); 4114 if (!bdev) { 4115 SPDK_ERRLOG("bdev calloc() failed\n"); 4116 return NULL; 4117 } 4118 4119 if (g_opts.nvme_error_stat) { 4120 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4121 if (!bdev->err_stat) { 4122 SPDK_ERRLOG("err_stat calloc() failed\n"); 4123 free(bdev); 4124 return NULL; 4125 } 4126 } 4127 4128 rc = pthread_mutex_init(&bdev->mutex, NULL); 4129 if (rc != 0) { 4130 free(bdev->err_stat); 4131 free(bdev); 4132 return NULL; 4133 } 4134 4135 bdev->ref = 1; 4136 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4137 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4138 bdev->rr_min_io = UINT32_MAX; 4139 TAILQ_INIT(&bdev->nvme_ns_list); 4140 4141 return bdev; 4142 } 4143 4144 static int 4145 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4146 { 4147 struct nvme_bdev *bdev; 4148 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4149 int rc; 4150 4151 bdev = nvme_bdev_alloc(); 4152 if (bdev == NULL) { 4153 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4154 return -ENOMEM; 4155 } 4156 4157 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4158 4159 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4160 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4161 if (rc != 0) { 4162 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4163 nvme_bdev_free(bdev); 4164 return rc; 4165 } 4166 
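	/* Register the nvme_bdev as an io_device so that a per-thread nvme_bdev_channel
	 * is created on demand the first time the bdev's I/O channel is requested.
	 */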
4167 spdk_io_device_register(bdev, 4168 bdev_nvme_create_bdev_channel_cb, 4169 bdev_nvme_destroy_bdev_channel_cb, 4170 sizeof(struct nvme_bdev_channel), 4171 bdev->disk.name); 4172 4173 nvme_ns->bdev = bdev; 4174 bdev->nsid = nvme_ns->id; 4175 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4176 4177 bdev->nbdev_ctrlr = nbdev_ctrlr; 4178 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4179 4180 rc = spdk_bdev_register(&bdev->disk); 4181 if (rc != 0) { 4182 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4183 spdk_io_device_unregister(bdev, NULL); 4184 nvme_ns->bdev = NULL; 4185 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4186 nvme_bdev_free(bdev); 4187 return rc; 4188 } 4189 4190 return 0; 4191 } 4192 4193 static bool 4194 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4195 { 4196 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4197 const struct spdk_uuid *uuid1, *uuid2; 4198 4199 nsdata1 = spdk_nvme_ns_get_data(ns1); 4200 nsdata2 = spdk_nvme_ns_get_data(ns2); 4201 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4202 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4203 4204 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4205 nsdata1->eui64 == nsdata2->eui64 && 4206 ((uuid1 == NULL && uuid2 == NULL) || 4207 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4208 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4209 } 4210 4211 static bool 4212 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4213 struct spdk_nvme_ctrlr_opts *opts) 4214 { 4215 struct nvme_probe_skip_entry *entry; 4216 4217 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4218 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4219 return false; 4220 } 4221 } 4222 4223 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4224 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4225 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4226 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4227 opts->disable_read_ana_log_page = true; 4228 4229 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4230 4231 return true; 4232 } 4233 4234 static void 4235 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4236 { 4237 struct nvme_ctrlr *nvme_ctrlr = ctx; 4238 4239 if (spdk_nvme_cpl_is_error(cpl)) { 4240 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4241 cpl->status.sct); 4242 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4243 } else if (cpl->cdw0 & 0x1) { 4244 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4245 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4246 } 4247 } 4248 4249 static void 4250 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4251 struct spdk_nvme_qpair *qpair, uint16_t cid) 4252 { 4253 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4254 union spdk_nvme_csts_register csts; 4255 int rc; 4256 4257 assert(nvme_ctrlr->ctrlr == ctrlr); 4258 4259 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4260 4261 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4262 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4263 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4264 * completion recursively. 
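 * In that case the configured action_on_timeout below is applied without consulting CSTS.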
4265 */ 4266 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4267 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4268 if (csts.bits.cfs) { 4269 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4270 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4271 return; 4272 } 4273 } 4274 4275 switch (g_opts.action_on_timeout) { 4276 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4277 if (qpair) { 4278 /* Don't send abort to ctrlr when ctrlr is not available. */ 4279 pthread_mutex_lock(&nvme_ctrlr->mutex); 4280 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4281 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4282 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4283 return; 4284 } 4285 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4286 4287 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4288 nvme_abort_cpl, nvme_ctrlr); 4289 if (rc == 0) { 4290 return; 4291 } 4292 4293 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4294 } 4295 4296 /* FALLTHROUGH */ 4297 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4298 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4299 break; 4300 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4301 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4302 break; 4303 default: 4304 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4305 break; 4306 } 4307 } 4308 4309 static struct nvme_ns * 4310 nvme_ns_alloc(void) 4311 { 4312 struct nvme_ns *nvme_ns; 4313 4314 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4315 if (nvme_ns == NULL) { 4316 return NULL; 4317 } 4318 4319 if (g_opts.io_path_stat) { 4320 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4321 if (nvme_ns->stat == NULL) { 4322 free(nvme_ns); 4323 return NULL; 4324 } 4325 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4326 } 4327 4328 return nvme_ns; 4329 } 4330 4331 static void 4332 nvme_ns_free(struct nvme_ns *nvme_ns) 4333 { 4334 free(nvme_ns->stat); 4335 free(nvme_ns); 4336 } 4337 4338 static void 4339 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4340 { 4341 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4342 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4343 4344 if (rc == 0) { 4345 nvme_ns->probe_ctx = NULL; 4346 pthread_mutex_lock(&nvme_ctrlr->mutex); 4347 nvme_ctrlr->ref++; 4348 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4349 } else { 4350 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4351 nvme_ns_free(nvme_ns); 4352 } 4353 4354 if (ctx) { 4355 ctx->populates_in_progress--; 4356 if (ctx->populates_in_progress == 0) { 4357 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4358 } 4359 } 4360 } 4361 4362 static void 4363 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4364 { 4365 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4366 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4367 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4368 int rc; 4369 4370 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4371 if (rc != 0) { 4372 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4373 } 4374 4375 spdk_for_each_channel_continue(i, rc); 4376 } 4377 4378 static void 4379 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4380 { 4381 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4382 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4383 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4384 struct nvme_io_path *io_path; 4385 4386 io_path = 
_bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4387 if (io_path != NULL) { 4388 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4389 } 4390 4391 spdk_for_each_channel_continue(i, 0); 4392 } 4393 4394 static void 4395 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4396 { 4397 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4398 4399 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4400 } 4401 4402 static void 4403 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4404 { 4405 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4406 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4407 4408 if (status == 0) { 4409 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4410 } else { 4411 /* Delete the added io_paths and fail populating the namespace. */ 4412 spdk_for_each_channel(bdev, 4413 bdev_nvme_delete_io_path, 4414 nvme_ns, 4415 bdev_nvme_add_io_path_failed); 4416 } 4417 } 4418 4419 static int 4420 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4421 { 4422 struct nvme_ns *tmp_ns; 4423 const struct spdk_nvme_ns_data *nsdata; 4424 4425 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4426 if (!nsdata->nmic.can_share) { 4427 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4428 return -EINVAL; 4429 } 4430 4431 pthread_mutex_lock(&bdev->mutex); 4432 4433 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4434 assert(tmp_ns != NULL); 4435 4436 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4437 pthread_mutex_unlock(&bdev->mutex); 4438 SPDK_ERRLOG("Namespaces are not identical.\n"); 4439 return -EINVAL; 4440 } 4441 4442 bdev->ref++; 4443 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4444 nvme_ns->bdev = bdev; 4445 4446 pthread_mutex_unlock(&bdev->mutex); 4447 4448 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
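 * bdev_nvme_add_io_path() runs on each existing channel's thread, and
 * bdev_nvme_add_io_path_done() receives the aggregated status, rolling the added
 * paths back if any channel failed.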
*/ 4449 spdk_for_each_channel(bdev, 4450 bdev_nvme_add_io_path, 4451 nvme_ns, 4452 bdev_nvme_add_io_path_done); 4453 4454 return 0; 4455 } 4456 4457 static void 4458 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4459 { 4460 struct spdk_nvme_ns *ns; 4461 struct nvme_bdev *bdev; 4462 int rc = 0; 4463 4464 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4465 if (!ns) { 4466 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4467 rc = -EINVAL; 4468 goto done; 4469 } 4470 4471 nvme_ns->ns = ns; 4472 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4473 4474 if (nvme_ctrlr->ana_log_page != NULL) { 4475 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4476 } 4477 4478 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4479 if (bdev == NULL) { 4480 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4481 } else { 4482 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4483 if (rc == 0) { 4484 return; 4485 } 4486 } 4487 done: 4488 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4489 } 4490 4491 static void 4492 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4493 { 4494 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4495 4496 assert(nvme_ctrlr != NULL); 4497 4498 pthread_mutex_lock(&nvme_ctrlr->mutex); 4499 4500 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4501 4502 if (nvme_ns->bdev != NULL) { 4503 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4504 return; 4505 } 4506 4507 nvme_ns_free(nvme_ns); 4508 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4509 4510 nvme_ctrlr_release(nvme_ctrlr); 4511 } 4512 4513 static void 4514 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4515 { 4516 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4517 4518 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4519 } 4520 4521 static void 4522 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4523 { 4524 struct nvme_bdev *bdev; 4525 4526 spdk_poller_unregister(&nvme_ns->anatt_timer); 4527 4528 bdev = nvme_ns->bdev; 4529 if (bdev != NULL) { 4530 pthread_mutex_lock(&bdev->mutex); 4531 4532 assert(bdev->ref > 0); 4533 bdev->ref--; 4534 if (bdev->ref == 0) { 4535 pthread_mutex_unlock(&bdev->mutex); 4536 4537 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4538 } else { 4539 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4540 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4541 * and clear nvme_ns->bdev here. 4542 */ 4543 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4544 nvme_ns->bdev = NULL; 4545 4546 pthread_mutex_unlock(&bdev->mutex); 4547 4548 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4549 * we call depopulate_namespace_done() to avoid use-after-free. 4550 */ 4551 spdk_for_each_channel(bdev, 4552 bdev_nvme_delete_io_path, 4553 nvme_ns, 4554 bdev_nvme_delete_io_path_done); 4555 return; 4556 } 4557 } 4558 4559 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4560 } 4561 4562 static void 4563 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4564 struct nvme_async_probe_ctx *ctx) 4565 { 4566 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4567 struct nvme_ns *nvme_ns, *next; 4568 struct spdk_nvme_ns *ns; 4569 struct nvme_bdev *bdev; 4570 uint32_t nsid; 4571 int rc; 4572 uint64_t num_sectors; 4573 4574 if (ctx) { 4575 /* Initialize this count to 1 to handle the populate functions 4576 * calling nvme_ctrlr_populate_namespace_done() immediately. 
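 * Without this guard, the callback could run while the loops below are still in the
 * middle of populating namespaces.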
		 */
		ctx->populates_in_progress = 1;
	}

	/* First loop over our existing namespaces and see if they have been
	 * removed. */
	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);

		if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
			/* NS is still there but attributes may have changed */
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
			bdev = nvme_ns->bdev;
			assert(bdev != NULL);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
					       nvme_ns->id,
					       bdev->disk.name,
					       bdev->disk.blockcnt,
					       num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
						    bdev->disk.name, rc);
				}
			}
		} else {
			/* Namespace was removed */
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}

		nvme_ns = next;
	}

	/* Loop through all of the namespaces at the nvme level and see if any of them are new */
	nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
	while (nsid != 0) {
		nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);

		if (nvme_ns == NULL) {
			/* Found a new one */
			nvme_ns = nvme_ns_alloc();
			if (nvme_ns == NULL) {
				SPDK_ERRLOG("Failed to allocate namespace\n");
				/* This just fails to attach the namespace. It may work on a future
				 * attempt. Advance to the next active NSID so that this loop cannot
				 * spin on the same namespace forever.
				 */
				nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
				continue;
			}

			nvme_ns->id = nsid;
			nvme_ns->ctrlr = nvme_ctrlr;

			nvme_ns->bdev = NULL;

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ns->probe_ctx = ctx;

			RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);

			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
		}

		nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
4650 */ 4651 ctx->populates_in_progress--; 4652 if (ctx->populates_in_progress == 0) { 4653 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4654 } 4655 } 4656 4657 } 4658 4659 static void 4660 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4661 { 4662 struct nvme_ns *nvme_ns, *tmp; 4663 4664 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4665 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4666 } 4667 } 4668 4669 static uint32_t 4670 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4671 { 4672 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4673 const struct spdk_nvme_ctrlr_data *cdata; 4674 uint32_t nsid, ns_count = 0; 4675 4676 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4677 4678 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4679 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4680 ns_count++; 4681 } 4682 4683 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4684 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4685 sizeof(uint32_t); 4686 } 4687 4688 static int 4689 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4690 void *cb_arg) 4691 { 4692 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4693 struct nvme_ns *nvme_ns; 4694 uint32_t i, nsid; 4695 4696 for (i = 0; i < desc->num_of_nsid; i++) { 4697 nsid = desc->nsid[i]; 4698 if (nsid == 0) { 4699 continue; 4700 } 4701 4702 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4703 4704 assert(nvme_ns != NULL); 4705 if (nvme_ns == NULL) { 4706 /* Target told us that an inactive namespace had an ANA change */ 4707 continue; 4708 } 4709 4710 _nvme_ns_set_ana_state(nvme_ns, desc); 4711 } 4712 4713 return 0; 4714 } 4715 4716 static void 4717 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4718 { 4719 struct nvme_ns *nvme_ns; 4720 4721 spdk_free(nvme_ctrlr->ana_log_page); 4722 nvme_ctrlr->ana_log_page = NULL; 4723 4724 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4725 nvme_ns != NULL; 4726 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4727 nvme_ns->ana_state_updating = false; 4728 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4729 } 4730 } 4731 4732 static void 4733 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4734 { 4735 struct nvme_ctrlr *nvme_ctrlr = ctx; 4736 4737 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4738 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4739 nvme_ctrlr); 4740 } else { 4741 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4742 } 4743 4744 pthread_mutex_lock(&nvme_ctrlr->mutex); 4745 4746 assert(nvme_ctrlr->ana_log_page_updating == true); 4747 nvme_ctrlr->ana_log_page_updating = false; 4748 4749 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4750 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4751 4752 nvme_ctrlr_unregister(nvme_ctrlr); 4753 } else { 4754 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4755 4756 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4757 } 4758 } 4759 4760 static int 4761 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4762 { 4763 uint32_t ana_log_page_size; 4764 int rc; 4765 4766 if (nvme_ctrlr->ana_log_page == NULL) { 4767 return -EINVAL; 4768 } 4769 4770 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4771 4772 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4773 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4774 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4775 
return -EINVAL; 4776 } 4777 4778 pthread_mutex_lock(&nvme_ctrlr->mutex); 4779 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4780 nvme_ctrlr->ana_log_page_updating) { 4781 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4782 return -EBUSY; 4783 } 4784 4785 nvme_ctrlr->ana_log_page_updating = true; 4786 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4787 4788 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4789 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4790 SPDK_NVME_GLOBAL_NS_TAG, 4791 nvme_ctrlr->ana_log_page, 4792 ana_log_page_size, 0, 4793 nvme_ctrlr_read_ana_log_page_done, 4794 nvme_ctrlr); 4795 if (rc != 0) { 4796 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4797 } 4798 4799 return rc; 4800 } 4801 4802 static void 4803 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4804 { 4805 } 4806 4807 struct bdev_nvme_set_preferred_path_ctx { 4808 struct spdk_bdev_desc *desc; 4809 struct nvme_ns *nvme_ns; 4810 bdev_nvme_set_preferred_path_cb cb_fn; 4811 void *cb_arg; 4812 }; 4813 4814 static void 4815 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4816 { 4817 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4818 4819 assert(ctx != NULL); 4820 assert(ctx->desc != NULL); 4821 assert(ctx->cb_fn != NULL); 4822 4823 spdk_bdev_close(ctx->desc); 4824 4825 ctx->cb_fn(ctx->cb_arg, status); 4826 4827 free(ctx); 4828 } 4829 4830 static void 4831 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4832 { 4833 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4834 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4835 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4836 struct nvme_io_path *io_path, *prev; 4837 4838 prev = NULL; 4839 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4840 if (io_path->nvme_ns == ctx->nvme_ns) { 4841 break; 4842 } 4843 prev = io_path; 4844 } 4845 4846 if (io_path != NULL) { 4847 if (prev != NULL) { 4848 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4849 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4850 } 4851 4852 /* We can set io_path to nbdev_ch->current_io_path directly here. 4853 * However, it needs to be conditional. To simplify the code, 4854 * just clear nbdev_ch->current_io_path and let find_io_path() 4855 * fill it. 4856 * 4857 * Automatic failback may be disabled. Hence even if the io_path is 4858 * already at the head, clear nbdev_ch->current_io_path. 4859 */ 4860 bdev_nvme_clear_current_io_path(nbdev_ch); 4861 } 4862 4863 spdk_for_each_channel_continue(i, 0); 4864 } 4865 4866 static struct nvme_ns * 4867 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4868 { 4869 struct nvme_ns *nvme_ns, *prev; 4870 const struct spdk_nvme_ctrlr_data *cdata; 4871 4872 prev = NULL; 4873 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4874 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4875 4876 if (cdata->cntlid == cntlid) { 4877 break; 4878 } 4879 prev = nvme_ns; 4880 } 4881 4882 if (nvme_ns != NULL && prev != NULL) { 4883 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4884 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4885 } 4886 4887 return nvme_ns; 4888 } 4889 4890 /* This function supports only multipath mode. There is only a single I/O path 4891 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4892 * head of the I/O path list for each NVMe bdev channel. 
4893 * 4894 * NVMe bdev channel may be acquired after completing this function. move the 4895 * matched namespace to the head of the namespace list for the NVMe bdev too. 4896 */ 4897 void 4898 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4899 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4900 { 4901 struct bdev_nvme_set_preferred_path_ctx *ctx; 4902 struct spdk_bdev *bdev; 4903 struct nvme_bdev *nbdev; 4904 int rc = 0; 4905 4906 assert(cb_fn != NULL); 4907 4908 ctx = calloc(1, sizeof(*ctx)); 4909 if (ctx == NULL) { 4910 SPDK_ERRLOG("Failed to alloc context.\n"); 4911 rc = -ENOMEM; 4912 goto err_alloc; 4913 } 4914 4915 ctx->cb_fn = cb_fn; 4916 ctx->cb_arg = cb_arg; 4917 4918 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4919 if (rc != 0) { 4920 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4921 goto err_open; 4922 } 4923 4924 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4925 4926 if (bdev->module != &nvme_if) { 4927 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4928 rc = -ENODEV; 4929 goto err_bdev; 4930 } 4931 4932 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4933 4934 pthread_mutex_lock(&nbdev->mutex); 4935 4936 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4937 if (ctx->nvme_ns == NULL) { 4938 pthread_mutex_unlock(&nbdev->mutex); 4939 4940 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4941 rc = -ENODEV; 4942 goto err_bdev; 4943 } 4944 4945 pthread_mutex_unlock(&nbdev->mutex); 4946 4947 spdk_for_each_channel(nbdev, 4948 _bdev_nvme_set_preferred_path, 4949 ctx, 4950 bdev_nvme_set_preferred_path_done); 4951 return; 4952 4953 err_bdev: 4954 spdk_bdev_close(ctx->desc); 4955 err_open: 4956 free(ctx); 4957 err_alloc: 4958 cb_fn(cb_arg, rc); 4959 } 4960 4961 struct bdev_nvme_set_multipath_policy_ctx { 4962 struct spdk_bdev_desc *desc; 4963 bdev_nvme_set_multipath_policy_cb cb_fn; 4964 void *cb_arg; 4965 }; 4966 4967 static void 4968 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 4969 { 4970 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4971 4972 assert(ctx != NULL); 4973 assert(ctx->desc != NULL); 4974 assert(ctx->cb_fn != NULL); 4975 4976 spdk_bdev_close(ctx->desc); 4977 4978 ctx->cb_fn(ctx->cb_arg, status); 4979 4980 free(ctx); 4981 } 4982 4983 static void 4984 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 4985 { 4986 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4987 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4988 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 4989 4990 nbdev_ch->mp_policy = nbdev->mp_policy; 4991 nbdev_ch->mp_selector = nbdev->mp_selector; 4992 nbdev_ch->rr_min_io = nbdev->rr_min_io; 4993 bdev_nvme_clear_current_io_path(nbdev_ch); 4994 4995 spdk_for_each_channel_continue(i, 0); 4996 } 4997 4998 void 4999 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5000 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5001 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5002 { 5003 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5004 struct spdk_bdev *bdev; 5005 struct nvme_bdev *nbdev; 5006 int rc; 5007 5008 assert(cb_fn != NULL); 5009 5010 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5011 if (rr_min_io == UINT32_MAX) { 5012 rr_min_io = 1; 5013 } else if (rr_min_io == 0) { 5014 rc = -EINVAL; 
5015 goto exit; 5016 } 5017 } else if (rr_min_io != UINT32_MAX) { 5018 rc = -EINVAL; 5019 goto exit; 5020 } 5021 5022 ctx = calloc(1, sizeof(*ctx)); 5023 if (ctx == NULL) { 5024 SPDK_ERRLOG("Failed to alloc context.\n"); 5025 rc = -ENOMEM; 5026 goto exit; 5027 } 5028 5029 ctx->cb_fn = cb_fn; 5030 ctx->cb_arg = cb_arg; 5031 5032 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5033 if (rc != 0) { 5034 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5035 rc = -ENODEV; 5036 goto err_open; 5037 } 5038 5039 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5040 if (bdev->module != &nvme_if) { 5041 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5042 rc = -ENODEV; 5043 goto err_module; 5044 } 5045 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5046 5047 pthread_mutex_lock(&nbdev->mutex); 5048 nbdev->mp_policy = policy; 5049 nbdev->mp_selector = selector; 5050 nbdev->rr_min_io = rr_min_io; 5051 pthread_mutex_unlock(&nbdev->mutex); 5052 5053 spdk_for_each_channel(nbdev, 5054 _bdev_nvme_set_multipath_policy, 5055 ctx, 5056 bdev_nvme_set_multipath_policy_done); 5057 return; 5058 5059 err_module: 5060 spdk_bdev_close(ctx->desc); 5061 err_open: 5062 free(ctx); 5063 exit: 5064 cb_fn(cb_arg, rc); 5065 } 5066 5067 static void 5068 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5069 { 5070 struct nvme_ctrlr *nvme_ctrlr = arg; 5071 union spdk_nvme_async_event_completion event; 5072 5073 if (spdk_nvme_cpl_is_error(cpl)) { 5074 SPDK_WARNLOG("AER request execute failed\n"); 5075 return; 5076 } 5077 5078 event.raw = cpl->cdw0; 5079 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5080 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5081 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5082 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5083 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5084 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5085 } 5086 } 5087 5088 static void 5089 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5090 { 5091 if (ctx->cb_fn) { 5092 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5093 } 5094 5095 ctx->namespaces_populated = true; 5096 if (ctx->probe_done) { 5097 /* The probe was already completed, so we need to free the context 5098 * here. This can happen for cases like OCSSD, where we need to 5099 * send additional commands to the SSD after attach. 
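 * Otherwise bdev_nvme_async_poll() frees the context once the probe completes.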
5100 */ 5101 free(ctx); 5102 } 5103 } 5104 5105 static void 5106 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5107 struct nvme_async_probe_ctx *ctx) 5108 { 5109 spdk_io_device_register(nvme_ctrlr, 5110 bdev_nvme_create_ctrlr_channel_cb, 5111 bdev_nvme_destroy_ctrlr_channel_cb, 5112 sizeof(struct nvme_ctrlr_channel), 5113 nvme_ctrlr->nbdev_ctrlr->name); 5114 5115 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5116 } 5117 5118 static void 5119 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5120 { 5121 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5122 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5123 5124 nvme_ctrlr->probe_ctx = NULL; 5125 5126 if (spdk_nvme_cpl_is_error(cpl)) { 5127 nvme_ctrlr_delete(nvme_ctrlr); 5128 5129 if (ctx != NULL) { 5130 ctx->reported_bdevs = 0; 5131 populate_namespaces_cb(ctx, -1); 5132 } 5133 return; 5134 } 5135 5136 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5137 } 5138 5139 static int 5140 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5141 struct nvme_async_probe_ctx *ctx) 5142 { 5143 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5144 const struct spdk_nvme_ctrlr_data *cdata; 5145 uint32_t ana_log_page_size; 5146 5147 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5148 5149 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5150 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5151 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5152 sizeof(uint32_t); 5153 5154 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5155 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5156 if (nvme_ctrlr->ana_log_page == NULL) { 5157 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5158 return -ENXIO; 5159 } 5160 5161 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5162 * Hence copy each descriptor to a temporary area when parsing it. 5163 * 5164 * Allocate a buffer whose size is as large as ANA log page buffer because 5165 * we do not know the size of a descriptor until actually reading it. 5166 */ 5167 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5168 if (nvme_ctrlr->copied_ana_desc == NULL) { 5169 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5170 return -ENOMEM; 5171 } 5172 5173 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5174 5175 nvme_ctrlr->probe_ctx = ctx; 5176 5177 /* Then, set the read size only to include the current active namespaces. */ 5178 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5179 5180 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5181 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5182 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5183 return -EINVAL; 5184 } 5185 5186 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5187 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5188 SPDK_NVME_GLOBAL_NS_TAG, 5189 nvme_ctrlr->ana_log_page, 5190 ana_log_page_size, 0, 5191 nvme_ctrlr_init_ana_log_page_done, 5192 nvme_ctrlr); 5193 } 5194 5195 /* hostnqn and subnqn were already verified before attaching a controller. 5196 * Hence check only the multipath capability and cntlid here. 
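 * Every controller grouped under one nvme_bdev_ctrlr must set CMIC.multi_ctrlr and use
 * a CNTLID that is not already present in the group.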
 */
static bool
bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *tmp;
	const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);

	if (!cdata->cmic.multi_ctrlr) {
		SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
		return false;
	}

	TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
		tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);

		if (!tmp_cdata->cmic.multi_ctrlr) {
			SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
			return false;
		}
		if (cdata->cntlid == tmp_cdata->cntlid) {
			SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
			return false;
		}
	}

	return true;
}

static int
nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	int rc = 0;

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
			rc = -EINVAL;
			goto exit;
		}
	} else {
		nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
		if (nbdev_ctrlr == NULL) {
			SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
			rc = -ENOMEM;
			goto exit;
		}
		nbdev_ctrlr->name = strdup(name);
		if (nbdev_ctrlr->name == NULL) {
			SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
			free(nbdev_ctrlr);
			rc = -ENOMEM;
			goto exit;
		}
		TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
		TAILQ_INIT(&nbdev_ctrlr->bdevs);
		TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
	}
	nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
	TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
exit:
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return rc;
}

static int
nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
		  const char *name,
		  const struct spdk_nvme_transport_id *trid,
		  struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_path_id *path_id;
	const struct spdk_nvme_ctrlr_data *cdata;
	int rc;

	nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
	if (nvme_ctrlr == NULL) {
		SPDK_ERRLOG("Failed to allocate device struct\n");
		return -ENOMEM;
	}

	rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
	if (rc != 0) {
		free(nvme_ctrlr);
		return rc;
	}

	TAILQ_INIT(&nvme_ctrlr->trids);

	RB_INIT(&nvme_ctrlr->namespaces);

	path_id = calloc(1, sizeof(*path_id));
	if (path_id == NULL) {
		SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
		rc = -ENOMEM;
		goto err;
	}

	path_id->trid = *trid;
	if (ctx != NULL) {
		memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
		memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
	}
	nvme_ctrlr->active_path_id = path_id;
	TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);

	nvme_ctrlr->thread = spdk_get_thread();
	nvme_ctrlr->ctrlr = ctrlr;
	nvme_ctrlr->ref = 1;

	if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
		SPDK_ERRLOG("OCSSDs are not supported\n");
		rc = -ENOTSUP;
		goto err;
	}

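	/* Take the bdev-level controller options from the probe context when the caller
	 * supplied one (e.g. attach through bdev_nvme_create()); otherwise fall back to
	 * the defaults derived from the global g_opts.
	 */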
if (ctx != NULL) { 5319 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5320 } else { 5321 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5322 } 5323 5324 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5325 g_opts.nvme_adminq_poll_period_us); 5326 5327 if (g_opts.timeout_us > 0) { 5328 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5329 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5330 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5331 g_opts.timeout_us : g_opts.timeout_admin_us; 5332 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5333 adm_timeout_us, timeout_cb, nvme_ctrlr); 5334 } 5335 5336 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5337 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5338 5339 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5340 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5341 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5342 } 5343 5344 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5345 if (rc != 0) { 5346 goto err; 5347 } 5348 5349 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5350 5351 if (cdata->cmic.ana_reporting) { 5352 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5353 if (rc == 0) { 5354 return 0; 5355 } 5356 } else { 5357 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5358 return 0; 5359 } 5360 5361 err: 5362 nvme_ctrlr_delete(nvme_ctrlr); 5363 return rc; 5364 } 5365 5366 void 5367 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5368 { 5369 opts->prchk_flags = 0; 5370 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5371 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5372 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5373 } 5374 5375 static void 5376 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5377 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5378 { 5379 char *name; 5380 5381 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5382 if (!name) { 5383 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5384 return; 5385 } 5386 5387 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5388 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5389 } else { 5390 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5391 } 5392 5393 free(name); 5394 } 5395 5396 static void 5397 _nvme_ctrlr_destruct(void *ctx) 5398 { 5399 struct nvme_ctrlr *nvme_ctrlr = ctx; 5400 5401 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5402 nvme_ctrlr_release(nvme_ctrlr); 5403 } 5404 5405 static int 5406 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5407 { 5408 struct nvme_probe_skip_entry *entry; 5409 5410 /* The controller's destruction was already started */ 5411 if (nvme_ctrlr->destruct) { 5412 return -EALREADY; 5413 } 5414 5415 if (!hotplug && 5416 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5417 entry = calloc(1, sizeof(*entry)); 5418 if (!entry) { 5419 return -ENOMEM; 5420 } 5421 entry->trid = nvme_ctrlr->active_path_id->trid; 5422 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5423 } 5424 5425 nvme_ctrlr->destruct = true; 5426 return 0; 5427 } 5428 5429 static int 5430 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5431 { 5432 int rc; 5433 5434 pthread_mutex_lock(&nvme_ctrlr->mutex); 5435 rc = 
bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5436 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5437 5438 if (rc == 0) { 5439 _nvme_ctrlr_destruct(nvme_ctrlr); 5440 } else if (rc == -EALREADY) { 5441 rc = 0; 5442 } 5443 5444 return rc; 5445 } 5446 5447 static void 5448 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5449 { 5450 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5451 5452 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5453 } 5454 5455 static int 5456 bdev_nvme_hotplug_probe(void *arg) 5457 { 5458 if (g_hotplug_probe_ctx == NULL) { 5459 spdk_poller_unregister(&g_hotplug_probe_poller); 5460 return SPDK_POLLER_IDLE; 5461 } 5462 5463 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5464 g_hotplug_probe_ctx = NULL; 5465 spdk_poller_unregister(&g_hotplug_probe_poller); 5466 } 5467 5468 return SPDK_POLLER_BUSY; 5469 } 5470 5471 static int 5472 bdev_nvme_hotplug(void *arg) 5473 { 5474 struct spdk_nvme_transport_id trid_pcie; 5475 5476 if (g_hotplug_probe_ctx) { 5477 return SPDK_POLLER_BUSY; 5478 } 5479 5480 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5481 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5482 5483 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5484 hotplug_probe_cb, attach_cb, NULL); 5485 5486 if (g_hotplug_probe_ctx) { 5487 assert(g_hotplug_probe_poller == NULL); 5488 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5489 } 5490 5491 return SPDK_POLLER_BUSY; 5492 } 5493 5494 void 5495 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5496 { 5497 *opts = g_opts; 5498 } 5499 5500 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5501 uint32_t reconnect_delay_sec, 5502 uint32_t fast_io_fail_timeout_sec); 5503 5504 static int 5505 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5506 { 5507 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5508 /* Can't set timeout_admin_us without also setting timeout_us */ 5509 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5510 return -EINVAL; 5511 } 5512 5513 if (opts->bdev_retry_count < -1) { 5514 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5515 return -EINVAL; 5516 } 5517 5518 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5519 opts->reconnect_delay_sec, 5520 opts->fast_io_fail_timeout_sec)) { 5521 return -EINVAL; 5522 } 5523 5524 return 0; 5525 } 5526 5527 int 5528 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5529 { 5530 int ret; 5531 5532 ret = bdev_nvme_validate_opts(opts); 5533 if (ret) { 5534 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5535 return ret; 5536 } 5537 5538 if (g_bdev_nvme_init_thread != NULL) { 5539 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5540 return -EPERM; 5541 } 5542 } 5543 5544 if (opts->rdma_srq_size != 0 || 5545 opts->rdma_max_cq_size != 0) { 5546 struct spdk_nvme_transport_opts drv_opts; 5547 5548 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5549 if (opts->rdma_srq_size != 0) { 5550 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5551 } 5552 if (opts->rdma_max_cq_size != 0) { 5553 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5554 } 5555 5556 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5557 if (ret) { 5558 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5559 return ret; 5560 } 5561 } 5562 5563 g_opts = *opts; 5564 5565 return 0; 5566 } 5567 5568 struct set_nvme_hotplug_ctx { 5569 uint64_t 
period_us; 5570 bool enabled; 5571 spdk_msg_fn fn; 5572 void *fn_ctx; 5573 }; 5574 5575 static void 5576 set_nvme_hotplug_period_cb(void *_ctx) 5577 { 5578 struct set_nvme_hotplug_ctx *ctx = _ctx; 5579 5580 spdk_poller_unregister(&g_hotplug_poller); 5581 if (ctx->enabled) { 5582 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5583 } 5584 5585 g_nvme_hotplug_poll_period_us = ctx->period_us; 5586 g_nvme_hotplug_enabled = ctx->enabled; 5587 if (ctx->fn) { 5588 ctx->fn(ctx->fn_ctx); 5589 } 5590 5591 free(ctx); 5592 } 5593 5594 int 5595 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5596 { 5597 struct set_nvme_hotplug_ctx *ctx; 5598 5599 if (enabled == true && !spdk_process_is_primary()) { 5600 return -EPERM; 5601 } 5602 5603 ctx = calloc(1, sizeof(*ctx)); 5604 if (ctx == NULL) { 5605 return -ENOMEM; 5606 } 5607 5608 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5609 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5610 ctx->enabled = enabled; 5611 ctx->fn = cb; 5612 ctx->fn_ctx = cb_ctx; 5613 5614 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5615 return 0; 5616 } 5617 5618 static void 5619 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5620 struct nvme_async_probe_ctx *ctx) 5621 { 5622 struct nvme_ns *nvme_ns; 5623 struct nvme_bdev *nvme_bdev; 5624 size_t j; 5625 5626 assert(nvme_ctrlr != NULL); 5627 5628 if (ctx->names == NULL) { 5629 ctx->reported_bdevs = 0; 5630 populate_namespaces_cb(ctx, 0); 5631 return; 5632 } 5633 5634 /* 5635 * Report the new bdevs that were created in this call. 5636 * There can be more than one bdev per NVMe controller. 5637 */ 5638 j = 0; 5639 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5640 while (nvme_ns != NULL) { 5641 nvme_bdev = nvme_ns->bdev; 5642 if (j < ctx->max_bdevs) { 5643 ctx->names[j] = nvme_bdev->disk.name; 5644 j++; 5645 } else { 5646 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5647 ctx->max_bdevs); 5648 ctx->reported_bdevs = 0; 5649 populate_namespaces_cb(ctx, -ERANGE); 5650 return; 5651 } 5652 5653 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5654 } 5655 5656 ctx->reported_bdevs = j; 5657 populate_namespaces_cb(ctx, 0); 5658 } 5659 5660 static int 5661 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5662 struct spdk_nvme_ctrlr *new_ctrlr, 5663 struct spdk_nvme_transport_id *trid) 5664 { 5665 struct nvme_path_id *tmp_trid; 5666 5667 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5668 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5669 return -ENOTSUP; 5670 } 5671 5672 /* Currently we only support failover to the same transport type. */ 5673 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5674 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5675 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5676 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5677 return -EINVAL; 5678 } 5679 5680 5681 /* Currently we only support failover to the same NQN. 
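 * A path that targets a different subsystem would expose a different set of namespaces
 * and so could not serve as an alternate path to this controller.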
*/ 5682 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5683 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5684 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5685 return -EINVAL; 5686 } 5687 5688 /* Skip all the other checks if we've already registered this path. */ 5689 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5690 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5691 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5692 trid->subnqn); 5693 return -EEXIST; 5694 } 5695 } 5696 5697 return 0; 5698 } 5699 5700 static int 5701 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5702 struct spdk_nvme_ctrlr *new_ctrlr) 5703 { 5704 struct nvme_ns *nvme_ns; 5705 struct spdk_nvme_ns *new_ns; 5706 5707 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5708 while (nvme_ns != NULL) { 5709 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5710 assert(new_ns != NULL); 5711 5712 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5713 return -EINVAL; 5714 } 5715 5716 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5717 } 5718 5719 return 0; 5720 } 5721 5722 static int 5723 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5724 struct spdk_nvme_transport_id *trid) 5725 { 5726 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5727 5728 new_trid = calloc(1, sizeof(*new_trid)); 5729 if (new_trid == NULL) { 5730 return -ENOMEM; 5731 } 5732 new_trid->trid = *trid; 5733 5734 active_id = nvme_ctrlr->active_path_id; 5735 assert(active_id != NULL); 5736 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5737 5738 /* Skip the active trid not to replace it until it is failed. */ 5739 tmp_trid = TAILQ_NEXT(active_id, link); 5740 if (tmp_trid == NULL) { 5741 goto add_tail; 5742 } 5743 5744 /* It means the trid is faled if its last failed time is non-zero. 5745 * Insert the new alternate trid before any failed trid. 5746 */ 5747 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5748 if (tmp_trid->last_failed_tsc != 0) { 5749 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5750 return 0; 5751 } 5752 } 5753 5754 add_tail: 5755 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5756 return 0; 5757 } 5758 5759 /* This is the case that a secondary path is added to an existing 5760 * nvme_ctrlr for failover. After checking if it can access the same 5761 * namespaces as the primary path, it is disconnected until failover occurs. 
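 * Only the transport ID is recorded here; the temporary controller handle is detached
 * because a fresh connection is established if and when failover to this path happens.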
5762 */ 5763 static int 5764 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5765 struct spdk_nvme_ctrlr *new_ctrlr, 5766 struct spdk_nvme_transport_id *trid) 5767 { 5768 int rc; 5769 5770 assert(nvme_ctrlr != NULL); 5771 5772 pthread_mutex_lock(&nvme_ctrlr->mutex); 5773 5774 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5775 if (rc != 0) { 5776 goto exit; 5777 } 5778 5779 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5780 if (rc != 0) { 5781 goto exit; 5782 } 5783 5784 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5785 5786 exit: 5787 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5788 5789 spdk_nvme_detach(new_ctrlr); 5790 5791 return rc; 5792 } 5793 5794 static void 5795 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5796 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5797 { 5798 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5799 struct nvme_async_probe_ctx *ctx; 5800 int rc; 5801 5802 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5803 ctx->ctrlr_attached = true; 5804 5805 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5806 if (rc != 0) { 5807 ctx->reported_bdevs = 0; 5808 populate_namespaces_cb(ctx, rc); 5809 } 5810 } 5811 5812 static void 5813 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5814 struct spdk_nvme_ctrlr *ctrlr, 5815 const struct spdk_nvme_ctrlr_opts *opts) 5816 { 5817 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5818 struct nvme_ctrlr *nvme_ctrlr; 5819 struct nvme_async_probe_ctx *ctx; 5820 int rc; 5821 5822 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5823 ctx->ctrlr_attached = true; 5824 5825 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5826 if (nvme_ctrlr) { 5827 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5828 } else { 5829 rc = -ENODEV; 5830 } 5831 5832 ctx->reported_bdevs = 0; 5833 populate_namespaces_cb(ctx, rc); 5834 } 5835 5836 static int 5837 bdev_nvme_async_poll(void *arg) 5838 { 5839 struct nvme_async_probe_ctx *ctx = arg; 5840 int rc; 5841 5842 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5843 if (spdk_unlikely(rc != -EAGAIN)) { 5844 ctx->probe_done = true; 5845 spdk_poller_unregister(&ctx->poller); 5846 if (!ctx->ctrlr_attached) { 5847 /* The probe is done, but no controller was attached. 5848 * That means we had a failure, so report -EIO back to 5849 * the caller (usually the RPC). populate_namespaces_cb() 5850 * will take care of freeing the nvme_async_probe_ctx. 5851 */ 5852 ctx->reported_bdevs = 0; 5853 populate_namespaces_cb(ctx, -EIO); 5854 } else if (ctx->namespaces_populated) { 5855 /* The namespaces for the attached controller were all 5856 * populated and the response was already sent to the 5857 * caller (usually the RPC). So free the context here. 
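 * Otherwise populate_namespaces_cb() frees it once namespace population finishes.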
			 */
			free(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 const char *base_name,
		 const char **names,
		 uint32_t count,
		 spdk_bdev_create_nvme_fn cb_fn,
		 void *cb_ctx,
		 struct spdk_nvme_ctrlr_opts *drv_opts,
		 struct nvme_ctrlr_opts *bdev_opts,
		 bool multipath)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
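	 * For example, the same subsystem reached at the same traddr/trsvcid but
	 * through two different host interfaces (i.e. a different host TRID)
	 * could then be attached as an additional path instead of being rejected
	 * here outright (an illustration of the TODO, not current behavior).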
5924 */ 5925 if (nvme_ctrlr_get(trid) != NULL) { 5926 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 5927 return -EEXIST; 5928 } 5929 5930 if (bdev_opts != NULL && 5931 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 5932 bdev_opts->reconnect_delay_sec, 5933 bdev_opts->fast_io_fail_timeout_sec)) { 5934 return -EINVAL; 5935 } 5936 5937 ctx = calloc(1, sizeof(*ctx)); 5938 if (!ctx) { 5939 return -ENOMEM; 5940 } 5941 ctx->base_name = base_name; 5942 ctx->names = names; 5943 ctx->max_bdevs = count; 5944 ctx->cb_fn = cb_fn; 5945 ctx->cb_ctx = cb_ctx; 5946 ctx->trid = *trid; 5947 5948 if (bdev_opts) { 5949 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 5950 } else { 5951 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 5952 } 5953 5954 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5955 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 5956 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 5957 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 5958 free(entry); 5959 break; 5960 } 5961 } 5962 } 5963 5964 if (drv_opts) { 5965 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 5966 } else { 5967 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 5968 } 5969 5970 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 5971 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 5972 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 5973 ctx->drv_opts.disable_read_ana_log_page = true; 5974 ctx->drv_opts.transport_tos = g_opts.transport_tos; 5975 5976 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 5977 attach_cb = connect_attach_cb; 5978 } else { 5979 attach_cb = connect_set_failover_cb; 5980 } 5981 5982 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 5983 if (ctx->probe_ctx == NULL) { 5984 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 5985 free(ctx); 5986 return -ENODEV; 5987 } 5988 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 5989 5990 return 0; 5991 } 5992 5993 struct bdev_nvme_delete_ctx { 5994 char *name; 5995 struct nvme_path_id path_id; 5996 bdev_nvme_delete_done_fn delete_done; 5997 void *delete_done_ctx; 5998 uint64_t timeout_ticks; 5999 struct spdk_poller *poller; 6000 }; 6001 6002 static void 6003 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6004 { 6005 if (ctx != NULL) { 6006 free(ctx->name); 6007 free(ctx); 6008 } 6009 } 6010 6011 static bool 6012 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6013 { 6014 if (path_id->trid.trtype != 0) { 6015 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6016 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6017 return false; 6018 } 6019 } else { 6020 if (path_id->trid.trtype != p->trid.trtype) { 6021 return false; 6022 } 6023 } 6024 } 6025 6026 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6027 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6028 return false; 6029 } 6030 } 6031 6032 if (path_id->trid.adrfam != 0) { 6033 if (path_id->trid.adrfam != p->trid.adrfam) { 6034 return false; 6035 } 6036 } 6037 6038 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6039 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6040 return false; 6041 } 6042 } 6043 6044 if 
(!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6045 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6046 return false; 6047 } 6048 } 6049 6050 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6051 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6052 return false; 6053 } 6054 } 6055 6056 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6057 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6058 return false; 6059 } 6060 } 6061 6062 return true; 6063 } 6064 6065 static bool 6066 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6067 { 6068 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6069 struct nvme_ctrlr *ctrlr; 6070 struct nvme_path_id *p; 6071 6072 pthread_mutex_lock(&g_bdev_nvme_mutex); 6073 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6074 if (!nbdev_ctrlr) { 6075 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6076 return false; 6077 } 6078 6079 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6080 pthread_mutex_lock(&ctrlr->mutex); 6081 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6082 if (nvme_path_id_compare(p, path_id)) { 6083 pthread_mutex_unlock(&ctrlr->mutex); 6084 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6085 return true; 6086 } 6087 } 6088 pthread_mutex_unlock(&ctrlr->mutex); 6089 } 6090 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6091 6092 return false; 6093 } 6094 6095 static int 6096 bdev_nvme_delete_complete_poll(void *arg) 6097 { 6098 struct bdev_nvme_delete_ctx *ctx = arg; 6099 int rc = 0; 6100 6101 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6102 if (ctx->timeout_ticks > spdk_get_ticks()) { 6103 return SPDK_POLLER_BUSY; 6104 } 6105 6106 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6107 rc = -ETIMEDOUT; 6108 } 6109 6110 spdk_poller_unregister(&ctx->poller); 6111 6112 ctx->delete_done(ctx->delete_done_ctx, rc); 6113 free_bdev_nvme_delete_ctx(ctx); 6114 6115 return SPDK_POLLER_BUSY; 6116 } 6117 6118 static int 6119 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6120 { 6121 struct nvme_path_id *p, *t; 6122 spdk_msg_fn msg_fn; 6123 int rc = -ENXIO; 6124 6125 pthread_mutex_lock(&nvme_ctrlr->mutex); 6126 6127 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6128 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6129 break; 6130 } 6131 6132 if (!nvme_path_id_compare(p, path_id)) { 6133 continue; 6134 } 6135 6136 /* We are not using the specified path. */ 6137 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6138 free(p); 6139 rc = 0; 6140 } 6141 6142 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6143 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6144 return rc; 6145 } 6146 6147 /* If we made it here, then this path is a match! Now we need to remove it. */ 6148 6149 /* This is the active path in use right now. The active path is always the first in the list. */ 6150 assert(p == nvme_ctrlr->active_path_id); 6151 6152 if (!TAILQ_NEXT(p, link)) { 6153 /* The current path is the only path. */ 6154 msg_fn = _nvme_ctrlr_destruct; 6155 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6156 } else { 6157 /* There is an alternative path. 
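		 * Fail over to it instead of destructing the whole controller.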
*/ 6158 msg_fn = _bdev_nvme_reset_ctrlr; 6159 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6160 } 6161 6162 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6163 6164 if (rc == 0) { 6165 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6166 } else if (rc == -EALREADY) { 6167 rc = 0; 6168 } 6169 6170 return rc; 6171 } 6172 6173 int 6174 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6175 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6176 { 6177 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6178 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6179 struct bdev_nvme_delete_ctx *ctx = NULL; 6180 int rc = -ENXIO, _rc; 6181 6182 if (name == NULL || path_id == NULL) { 6183 rc = -EINVAL; 6184 goto exit; 6185 } 6186 6187 pthread_mutex_lock(&g_bdev_nvme_mutex); 6188 6189 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6190 if (nbdev_ctrlr == NULL) { 6191 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6192 6193 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6194 rc = -ENODEV; 6195 goto exit; 6196 } 6197 6198 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6199 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6200 if (_rc < 0 && _rc != -ENXIO) { 6201 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6202 rc = _rc; 6203 goto exit; 6204 } else if (_rc == 0) { 6205 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6206 * was deleted successfully. To remember the successful deletion, 6207 * overwrite rc only if _rc is zero. 6208 */ 6209 rc = 0; 6210 } 6211 } 6212 6213 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6214 6215 if (rc != 0 || delete_done == NULL) { 6216 goto exit; 6217 } 6218 6219 ctx = calloc(1, sizeof(*ctx)); 6220 if (ctx == NULL) { 6221 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6222 rc = -ENOMEM; 6223 goto exit; 6224 } 6225 6226 ctx->name = strdup(name); 6227 if (ctx->name == NULL) { 6228 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6229 rc = -ENOMEM; 6230 goto exit; 6231 } 6232 6233 ctx->delete_done = delete_done; 6234 ctx->delete_done_ctx = delete_done_ctx; 6235 ctx->path_id = *path_id; 6236 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6237 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6238 if (ctx->poller == NULL) { 6239 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6240 rc = -ENOMEM; 6241 goto exit; 6242 } 6243 6244 exit: 6245 if (rc != 0) { 6246 free_bdev_nvme_delete_ctx(ctx); 6247 } 6248 6249 return rc; 6250 } 6251 6252 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6253 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6254 6255 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6256 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6257 6258 struct discovery_entry_ctx { 6259 char name[128]; 6260 struct spdk_nvme_transport_id trid; 6261 struct spdk_nvme_ctrlr_opts drv_opts; 6262 struct spdk_nvmf_discovery_log_page_entry entry; 6263 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6264 struct discovery_ctx *ctx; 6265 }; 6266 6267 struct discovery_ctx { 6268 char *name; 6269 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6270 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6271 void *cb_ctx; 6272 struct spdk_nvme_probe_ctx *probe_ctx; 6273 struct spdk_nvme_detach_ctx *detach_ctx; 6274 struct spdk_nvme_ctrlr *ctrlr; 6275 struct spdk_nvme_transport_id trid; 6276 struct discovery_entry_ctx *entry_ctx_in_use; 6277 struct spdk_poller *poller; 6278 struct spdk_nvme_ctrlr_opts drv_opts; 6279 struct nvme_ctrlr_opts bdev_opts; 6280 struct spdk_nvmf_discovery_log_page *log_page; 6281 TAILQ_ENTRY(discovery_ctx) tailq; 6282 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6283 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6284 int rc; 6285 bool wait_for_attach; 6286 uint64_t timeout_ticks; 6287 /* Denotes that the discovery service is being started. We're waiting 6288 * for the initial connection to the discovery controller to be 6289 * established and attach discovered NVM ctrlrs. 6290 */ 6291 bool initializing; 6292 /* Denotes if a discovery is currently in progress for this context. 6293 * That includes connecting to newly discovered subsystems. Used to 6294 * ensure we do not start a new discovery until an existing one is 6295 * complete. 6296 */ 6297 bool in_progress; 6298 6299 /* Denotes if another discovery is needed after the one in progress 6300 * completes. Set when we receive an AER completion while a discovery 6301 * is already in progress. 6302 */ 6303 bool pending; 6304 6305 /* Signal to the discovery context poller that it should stop the 6306 * discovery service, including detaching from the current discovery 6307 * controller. 6308 */ 6309 bool stop; 6310 6311 struct spdk_thread *calling_thread; 6312 uint32_t index; 6313 uint32_t attach_in_progress; 6314 char *hostnqn; 6315 6316 /* Denotes if the discovery service was started by the mdns discovery. 
6317 */ 6318 bool from_mdns_discovery_service; 6319 }; 6320 6321 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6322 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6323 6324 static void get_discovery_log_page(struct discovery_ctx *ctx); 6325 6326 static void 6327 free_discovery_ctx(struct discovery_ctx *ctx) 6328 { 6329 free(ctx->log_page); 6330 free(ctx->hostnqn); 6331 free(ctx->name); 6332 free(ctx); 6333 } 6334 6335 static void 6336 discovery_complete(struct discovery_ctx *ctx) 6337 { 6338 ctx->initializing = false; 6339 ctx->in_progress = false; 6340 if (ctx->pending) { 6341 ctx->pending = false; 6342 get_discovery_log_page(ctx); 6343 } 6344 } 6345 6346 static void 6347 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6348 struct spdk_nvmf_discovery_log_page_entry *entry) 6349 { 6350 char *space; 6351 6352 trid->trtype = entry->trtype; 6353 trid->adrfam = entry->adrfam; 6354 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6355 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6356 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6357 * before call to this function trid->subnqn is zeroed out, we need 6358 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6359 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6360 */ 6361 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6362 6363 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6364 * But the log page entries typically pad them with spaces, not zeroes. 6365 * So add a NULL terminator to each of these fields at the appropriate 6366 * location. 6367 */ 6368 space = strchr(trid->traddr, ' '); 6369 if (space) { 6370 *space = 0; 6371 } 6372 space = strchr(trid->trsvcid, ' '); 6373 if (space) { 6374 *space = 0; 6375 } 6376 space = strchr(trid->subnqn, ' '); 6377 if (space) { 6378 *space = 0; 6379 } 6380 } 6381 6382 static void 6383 _stop_discovery(void *_ctx) 6384 { 6385 struct discovery_ctx *ctx = _ctx; 6386 6387 if (ctx->attach_in_progress > 0) { 6388 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6389 return; 6390 } 6391 6392 ctx->stop = true; 6393 6394 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6395 struct discovery_entry_ctx *entry_ctx; 6396 struct nvme_path_id path = {}; 6397 6398 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6399 path.trid = entry_ctx->trid; 6400 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6401 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6402 free(entry_ctx); 6403 } 6404 6405 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6406 struct discovery_entry_ctx *entry_ctx; 6407 6408 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6409 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6410 free(entry_ctx); 6411 } 6412 6413 free(ctx->entry_ctx_in_use); 6414 ctx->entry_ctx_in_use = NULL; 6415 } 6416 6417 static void 6418 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6419 { 6420 ctx->stop_cb_fn = cb_fn; 6421 ctx->cb_ctx = cb_ctx; 6422 6423 if (ctx->attach_in_progress > 0) { 6424 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6425 ctx->attach_in_progress); 6426 } 6427 6428 _stop_discovery(ctx); 6429 } 6430 6431 static void 6432 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6433 { 6434 struct discovery_ctx *d_ctx; 6435 struct nvme_path_id *path_id; 6436 struct spdk_nvme_transport_id 
trid = {}; 6437 struct discovery_entry_ctx *entry_ctx, *tmp; 6438 6439 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6440 6441 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6442 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6443 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6444 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6445 continue; 6446 } 6447 6448 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6449 free(entry_ctx); 6450 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6451 trid.subnqn, trid.traddr, trid.trsvcid); 6452 6453 /* Fail discovery ctrlr to force reattach attempt */ 6454 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6455 } 6456 } 6457 } 6458 6459 static void 6460 discovery_remove_controllers(struct discovery_ctx *ctx) 6461 { 6462 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6463 struct discovery_entry_ctx *entry_ctx, *tmp; 6464 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6465 struct spdk_nvme_transport_id old_trid = {}; 6466 uint64_t numrec, i; 6467 bool found; 6468 6469 numrec = from_le64(&log_page->numrec); 6470 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6471 found = false; 6472 old_entry = &entry_ctx->entry; 6473 build_trid_from_log_page_entry(&old_trid, old_entry); 6474 for (i = 0; i < numrec; i++) { 6475 new_entry = &log_page->entries[i]; 6476 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6477 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6478 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6479 found = true; 6480 break; 6481 } 6482 } 6483 if (!found) { 6484 struct nvme_path_id path = {}; 6485 6486 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6487 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6488 6489 path.trid = entry_ctx->trid; 6490 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6491 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6492 free(entry_ctx); 6493 } 6494 } 6495 free(log_page); 6496 ctx->log_page = NULL; 6497 discovery_complete(ctx); 6498 } 6499 6500 static void 6501 complete_discovery_start(struct discovery_ctx *ctx, int status) 6502 { 6503 ctx->timeout_ticks = 0; 6504 ctx->rc = status; 6505 if (ctx->start_cb_fn) { 6506 ctx->start_cb_fn(ctx->cb_ctx, status); 6507 ctx->start_cb_fn = NULL; 6508 ctx->cb_ctx = NULL; 6509 } 6510 } 6511 6512 static void 6513 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6514 { 6515 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6516 struct discovery_ctx *ctx = entry_ctx->ctx; 6517 6518 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6519 ctx->attach_in_progress--; 6520 if (ctx->attach_in_progress == 0) { 6521 complete_discovery_start(ctx, ctx->rc); 6522 if (ctx->initializing && ctx->rc != 0) { 6523 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6524 stop_discovery(ctx, NULL, ctx->cb_ctx); 6525 } else { 6526 discovery_remove_controllers(ctx); 6527 } 6528 } 6529 } 6530 6531 static struct discovery_entry_ctx * 6532 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6533 { 6534 struct discovery_entry_ctx *new_ctx; 6535 6536 new_ctx = calloc(1, sizeof(*new_ctx)); 6537 if (new_ctx == NULL) { 6538 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6539 return NULL; 6540 } 6541 6542 new_ctx->ctx = ctx; 6543 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6544 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
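	/* Give entries created from this discovery context the same hostnqn as
	 * the discovery connection itself (copied just below), so NVM controllers
	 * attached from its log page entries use the same host identity.
	 */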
6545 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6546 return new_ctx; 6547 } 6548 6549 static void 6550 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6551 struct spdk_nvmf_discovery_log_page *log_page) 6552 { 6553 struct discovery_ctx *ctx = cb_arg; 6554 struct discovery_entry_ctx *entry_ctx, *tmp; 6555 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6556 uint64_t numrec, i; 6557 bool found; 6558 6559 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6560 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6561 return; 6562 } 6563 6564 ctx->log_page = log_page; 6565 assert(ctx->attach_in_progress == 0); 6566 numrec = from_le64(&log_page->numrec); 6567 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6568 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6569 free(entry_ctx); 6570 } 6571 for (i = 0; i < numrec; i++) { 6572 found = false; 6573 new_entry = &log_page->entries[i]; 6574 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6575 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6576 struct discovery_entry_ctx *new_ctx; 6577 struct spdk_nvme_transport_id trid = {}; 6578 6579 build_trid_from_log_page_entry(&trid, new_entry); 6580 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6581 if (new_ctx == NULL) { 6582 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6583 break; 6584 } 6585 6586 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6587 continue; 6588 } 6589 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6590 old_entry = &entry_ctx->entry; 6591 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6592 found = true; 6593 break; 6594 } 6595 } 6596 if (!found) { 6597 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6598 struct discovery_ctx *d_ctx; 6599 6600 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6601 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6602 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6603 sizeof(new_entry->subnqn))) { 6604 break; 6605 } 6606 } 6607 if (subnqn_ctx) { 6608 break; 6609 } 6610 } 6611 6612 new_ctx = calloc(1, sizeof(*new_ctx)); 6613 if (new_ctx == NULL) { 6614 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6615 break; 6616 } 6617 6618 new_ctx->ctx = ctx; 6619 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6620 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6621 if (subnqn_ctx) { 6622 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6623 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6624 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6625 new_ctx->name); 6626 } else { 6627 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6628 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6629 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6630 new_ctx->name); 6631 } 6632 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6633 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6634 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6635 discovery_attach_controller_done, new_ctx, 6636 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6637 if (rc == 0) { 6638 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6639 ctx->attach_in_progress++; 6640 } else { 6641 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6642 } 6643 } 6644 } 6645 6646 if (ctx->attach_in_progress == 0) { 6647 discovery_remove_controllers(ctx); 6648 } 6649 } 6650 6651 static void 6652 get_discovery_log_page(struct discovery_ctx *ctx) 6653 { 6654 int rc; 6655 6656 assert(ctx->in_progress == false); 6657 ctx->in_progress = true; 6658 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6659 if (rc != 0) { 6660 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6661 } 6662 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6663 } 6664 6665 static void 6666 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6667 { 6668 struct discovery_ctx *ctx = arg; 6669 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6670 6671 if (spdk_nvme_cpl_is_error(cpl)) { 6672 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6673 return; 6674 } 6675 6676 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6677 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6678 return; 6679 } 6680 6681 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6682 if (ctx->in_progress) { 6683 ctx->pending = true; 6684 return; 6685 } 6686 6687 get_discovery_log_page(ctx); 6688 } 6689 6690 static void 6691 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6692 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6693 { 6694 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6695 struct discovery_ctx *ctx; 6696 6697 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6698 6699 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6700 ctx->probe_ctx = NULL; 6701 ctx->ctrlr = ctrlr; 6702 6703 if (ctx->rc != 0) { 6704 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6705 ctx->rc); 6706 return; 6707 } 6708 6709 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6710 } 6711 6712 static int 6713 discovery_poller(void *arg) 6714 { 6715 struct discovery_ctx *ctx = arg; 6716 struct spdk_nvme_transport_id *trid; 6717 int rc; 6718 6719 if (ctx->detach_ctx) { 6720 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6721 if (rc != -EAGAIN) { 6722 ctx->detach_ctx = NULL; 6723 ctx->ctrlr = NULL; 6724 } 6725 } else if (ctx->stop) { 6726 if (ctx->ctrlr != NULL) { 6727 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6728 if (rc == 0) { 6729 return SPDK_POLLER_BUSY; 6730 } 6731 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6732 } 6733 spdk_poller_unregister(&ctx->poller); 6734 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6735 assert(ctx->start_cb_fn == NULL); 6736 if (ctx->stop_cb_fn != NULL) { 6737 ctx->stop_cb_fn(ctx->cb_ctx); 6738 } 6739 free_discovery_ctx(ctx); 6740 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6741 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6742 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6743 assert(ctx->initializing); 6744 spdk_poller_unregister(&ctx->poller); 6745 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6746 complete_discovery_start(ctx, -ETIMEDOUT); 6747 stop_discovery(ctx, NULL, NULL); 6748 free_discovery_ctx(ctx); 6749 return SPDK_POLLER_BUSY; 6750 } 6751 6752 assert(ctx->entry_ctx_in_use == NULL); 6753 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6754 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6755 trid = &ctx->entry_ctx_in_use->trid; 6756 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6757 if 
(ctx->probe_ctx) { 6758 spdk_poller_unregister(&ctx->poller); 6759 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6760 } else { 6761 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6762 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6763 ctx->entry_ctx_in_use = NULL; 6764 } 6765 } else if (ctx->probe_ctx) { 6766 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6767 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6768 complete_discovery_start(ctx, -ETIMEDOUT); 6769 return SPDK_POLLER_BUSY; 6770 } 6771 6772 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6773 if (rc != -EAGAIN) { 6774 if (ctx->rc != 0) { 6775 assert(ctx->initializing); 6776 stop_discovery(ctx, NULL, ctx->cb_ctx); 6777 } else { 6778 assert(rc == 0); 6779 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6780 ctx->rc = rc; 6781 get_discovery_log_page(ctx); 6782 } 6783 } 6784 } else { 6785 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6786 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6787 complete_discovery_start(ctx, -ETIMEDOUT); 6788 /* We need to wait until all NVM ctrlrs are attached before we stop the 6789 * discovery service to make sure we don't detach a ctrlr that is still 6790 * being attached. 6791 */ 6792 if (ctx->attach_in_progress == 0) { 6793 stop_discovery(ctx, NULL, ctx->cb_ctx); 6794 return SPDK_POLLER_BUSY; 6795 } 6796 } 6797 6798 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6799 if (rc < 0) { 6800 spdk_poller_unregister(&ctx->poller); 6801 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6802 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6803 ctx->entry_ctx_in_use = NULL; 6804 6805 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6806 if (rc != 0) { 6807 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6808 ctx->ctrlr = NULL; 6809 } 6810 } 6811 } 6812 6813 return SPDK_POLLER_BUSY; 6814 } 6815 6816 static void 6817 start_discovery_poller(void *arg) 6818 { 6819 struct discovery_ctx *ctx = arg; 6820 6821 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6822 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6823 } 6824 6825 int 6826 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6827 const char *base_name, 6828 struct spdk_nvme_ctrlr_opts *drv_opts, 6829 struct nvme_ctrlr_opts *bdev_opts, 6830 uint64_t attach_timeout, 6831 bool from_mdns, 6832 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6833 { 6834 struct discovery_ctx *ctx; 6835 struct discovery_entry_ctx *discovery_entry_ctx; 6836 6837 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6838 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6839 if (strcmp(ctx->name, base_name) == 0) { 6840 return -EEXIST; 6841 } 6842 6843 if (ctx->entry_ctx_in_use != NULL) { 6844 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6845 return -EEXIST; 6846 } 6847 } 6848 6849 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6850 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 6851 return -EEXIST; 6852 } 6853 } 6854 } 6855 6856 ctx = calloc(1, sizeof(*ctx)); 6857 if (ctx == NULL) { 6858 return -ENOMEM; 6859 } 6860 6861 ctx->name = strdup(base_name); 6862 if (ctx->name == NULL) { 6863 free_discovery_ctx(ctx); 6864 return -ENOMEM; 6865 } 6866 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
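	/* The discovery context keeps private copies of both option structs
	 * (drv_opts here, bdev_opts below); it may outlive the caller's storage
	 * for them.
	 */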
6867 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6868 ctx->from_mdns_discovery_service = from_mdns; 6869 ctx->bdev_opts.from_discovery_service = true; 6870 ctx->calling_thread = spdk_get_thread(); 6871 ctx->start_cb_fn = cb_fn; 6872 ctx->cb_ctx = cb_ctx; 6873 ctx->initializing = true; 6874 if (ctx->start_cb_fn) { 6875 /* We can use this when dumping json to denote if this RPC parameter 6876 * was specified or not. 6877 */ 6878 ctx->wait_for_attach = true; 6879 } 6880 if (attach_timeout != 0) { 6881 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 6882 spdk_get_ticks_hz() / 1000ull; 6883 } 6884 TAILQ_INIT(&ctx->nvm_entry_ctxs); 6885 TAILQ_INIT(&ctx->discovery_entry_ctxs); 6886 memcpy(&ctx->trid, trid, sizeof(*trid)); 6887 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 6888 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 6889 if (ctx->hostnqn == NULL) { 6890 free_discovery_ctx(ctx); 6891 return -ENOMEM; 6892 } 6893 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 6894 if (discovery_entry_ctx == NULL) { 6895 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6896 free_discovery_ctx(ctx); 6897 return -ENOMEM; 6898 } 6899 6900 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 6901 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 6902 return 0; 6903 } 6904 6905 int 6906 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6907 { 6908 struct discovery_ctx *ctx; 6909 6910 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6911 if (strcmp(name, ctx->name) == 0) { 6912 if (ctx->stop) { 6913 return -EALREADY; 6914 } 6915 /* If we're still starting the discovery service and ->rc is non-zero, we're 6916 * going to stop it as soon as we can 6917 */ 6918 if (ctx->initializing && ctx->rc != 0) { 6919 return -EALREADY; 6920 } 6921 stop_discovery(ctx, cb_fn, cb_ctx); 6922 return 0; 6923 } 6924 } 6925 6926 return -ENOENT; 6927 } 6928 6929 static int 6930 bdev_nvme_library_init(void) 6931 { 6932 g_bdev_nvme_init_thread = spdk_get_thread(); 6933 6934 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 6935 bdev_nvme_destroy_poll_group_cb, 6936 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 6937 6938 return 0; 6939 } 6940 6941 static void 6942 bdev_nvme_fini_destruct_ctrlrs(void) 6943 { 6944 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6945 struct nvme_ctrlr *nvme_ctrlr; 6946 6947 pthread_mutex_lock(&g_bdev_nvme_mutex); 6948 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 6949 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6950 pthread_mutex_lock(&nvme_ctrlr->mutex); 6951 if (nvme_ctrlr->destruct) { 6952 /* This controller's destruction was already started 6953 * before the application started shutting down 6954 */ 6955 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6956 continue; 6957 } 6958 nvme_ctrlr->destruct = true; 6959 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6960 6961 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 6962 nvme_ctrlr); 6963 } 6964 } 6965 6966 g_bdev_nvme_module_finish = true; 6967 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6968 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6969 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 6970 spdk_bdev_module_fini_done(); 6971 return; 6972 } 6973 6974 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6975 } 6976 6977 static void 6978 check_discovery_fini(void *arg) 6979 { 6980 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 6981 bdev_nvme_fini_destruct_ctrlrs(); 
6982 } 6983 } 6984 6985 static void 6986 bdev_nvme_library_fini(void) 6987 { 6988 struct nvme_probe_skip_entry *entry, *entry_tmp; 6989 struct discovery_ctx *ctx; 6990 6991 spdk_poller_unregister(&g_hotplug_poller); 6992 free(g_hotplug_probe_ctx); 6993 g_hotplug_probe_ctx = NULL; 6994 6995 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 6996 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6997 free(entry); 6998 } 6999 7000 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7001 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7002 bdev_nvme_fini_destruct_ctrlrs(); 7003 } else { 7004 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7005 stop_discovery(ctx, check_discovery_fini, NULL); 7006 } 7007 } 7008 } 7009 7010 static void 7011 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7012 { 7013 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7014 struct spdk_bdev *bdev = bdev_io->bdev; 7015 struct spdk_dif_ctx dif_ctx; 7016 struct spdk_dif_error err_blk = {}; 7017 int rc; 7018 struct spdk_dif_ctx_init_ext_opts dif_opts; 7019 7020 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7021 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7022 rc = spdk_dif_ctx_init(&dif_ctx, 7023 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7024 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 7025 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7026 if (rc != 0) { 7027 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7028 return; 7029 } 7030 7031 if (bdev->md_interleave) { 7032 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7033 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7034 } else { 7035 struct iovec md_iov = { 7036 .iov_base = bdev_io->u.bdev.md_buf, 7037 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7038 }; 7039 7040 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7041 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7042 } 7043 7044 if (rc != 0) { 7045 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7046 err_blk.err_type, err_blk.err_offset); 7047 } else { 7048 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7049 } 7050 } 7051 7052 static void 7053 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7054 { 7055 struct nvme_bdev_io *bio = ref; 7056 7057 if (spdk_nvme_cpl_is_success(cpl)) { 7058 /* Run PI verification for read data buffer. */ 7059 bdev_nvme_verify_pi_error(bio); 7060 } 7061 7062 /* Return original completion status */ 7063 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7064 } 7065 7066 static void 7067 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7068 { 7069 struct nvme_bdev_io *bio = ref; 7070 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7071 int ret; 7072 7073 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7074 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7075 cpl->status.sct, cpl->status.sc); 7076 7077 /* Save completion status to use after verifying PI error. */ 7078 bio->cpl = *cpl; 7079 7080 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7081 /* Read without PI checking to verify PI error. 
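			 * Its completion callback, bdev_nvme_no_pi_readv_done(),
			 * runs the DIF/DIX verification on the buffer and then
			 * completes the I/O with the original status saved in
			 * bio->cpl.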
*/ 7082 ret = bdev_nvme_no_pi_readv(bio, 7083 bdev_io->u.bdev.iovs, 7084 bdev_io->u.bdev.iovcnt, 7085 bdev_io->u.bdev.md_buf, 7086 bdev_io->u.bdev.num_blocks, 7087 bdev_io->u.bdev.offset_blocks); 7088 if (ret == 0) { 7089 return; 7090 } 7091 } 7092 } 7093 7094 bdev_nvme_io_complete_nvme_status(bio, cpl); 7095 } 7096 7097 static void 7098 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7099 { 7100 struct nvme_bdev_io *bio = ref; 7101 7102 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7103 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7104 cpl->status.sct, cpl->status.sc); 7105 /* Run PI verification for write data buffer if PI error is detected. */ 7106 bdev_nvme_verify_pi_error(bio); 7107 } 7108 7109 bdev_nvme_io_complete_nvme_status(bio, cpl); 7110 } 7111 7112 static void 7113 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7114 { 7115 struct nvme_bdev_io *bio = ref; 7116 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7117 7118 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7119 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7120 */ 7121 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7122 7123 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7124 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7125 cpl->status.sct, cpl->status.sc); 7126 /* Run PI verification for zone append data buffer if PI error is detected. */ 7127 bdev_nvme_verify_pi_error(bio); 7128 } 7129 7130 bdev_nvme_io_complete_nvme_status(bio, cpl); 7131 } 7132 7133 static void 7134 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7135 { 7136 struct nvme_bdev_io *bio = ref; 7137 7138 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7139 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7140 cpl->status.sct, cpl->status.sc); 7141 /* Run PI verification for compare data buffer if PI error is detected. */ 7142 bdev_nvme_verify_pi_error(bio); 7143 } 7144 7145 bdev_nvme_io_complete_nvme_status(bio, cpl); 7146 } 7147 7148 static void 7149 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7150 { 7151 struct nvme_bdev_io *bio = ref; 7152 7153 /* Compare operation completion */ 7154 if (!bio->first_fused_completed) { 7155 /* Save compare result for write callback */ 7156 bio->cpl = *cpl; 7157 bio->first_fused_completed = true; 7158 return; 7159 } 7160 7161 /* Write operation completion */ 7162 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7163 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7164 * complete the IO with the compare operation's status. 
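		 * A write that succeeds even though its fused compare failed is
		 * unexpected, hence the error log below.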
7165 */ 7166 if (!spdk_nvme_cpl_is_error(cpl)) { 7167 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7168 } 7169 7170 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7171 } else { 7172 bdev_nvme_io_complete_nvme_status(bio, cpl); 7173 } 7174 } 7175 7176 static void 7177 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7178 { 7179 struct nvme_bdev_io *bio = ref; 7180 7181 bdev_nvme_io_complete_nvme_status(bio, cpl); 7182 } 7183 7184 static int 7185 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7186 { 7187 switch (desc->zt) { 7188 case SPDK_NVME_ZONE_TYPE_SEQWR: 7189 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7190 break; 7191 default: 7192 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7193 return -EIO; 7194 } 7195 7196 switch (desc->zs) { 7197 case SPDK_NVME_ZONE_STATE_EMPTY: 7198 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7199 break; 7200 case SPDK_NVME_ZONE_STATE_IOPEN: 7201 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7202 break; 7203 case SPDK_NVME_ZONE_STATE_EOPEN: 7204 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7205 break; 7206 case SPDK_NVME_ZONE_STATE_CLOSED: 7207 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7208 break; 7209 case SPDK_NVME_ZONE_STATE_RONLY: 7210 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7211 break; 7212 case SPDK_NVME_ZONE_STATE_FULL: 7213 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7214 break; 7215 case SPDK_NVME_ZONE_STATE_OFFLINE: 7216 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7217 break; 7218 default: 7219 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7220 return -EIO; 7221 } 7222 7223 info->zone_id = desc->zslba; 7224 info->write_pointer = desc->wp; 7225 info->capacity = desc->zcap; 7226 7227 return 0; 7228 } 7229 7230 static void 7231 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7232 { 7233 struct nvme_bdev_io *bio = ref; 7234 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7235 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7236 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7237 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7238 uint64_t max_zones_per_buf, i; 7239 uint32_t zone_report_bufsize; 7240 struct spdk_nvme_ns *ns; 7241 struct spdk_nvme_qpair *qpair; 7242 int ret; 7243 7244 if (spdk_nvme_cpl_is_error(cpl)) { 7245 goto out_complete_io_nvme_cpl; 7246 } 7247 7248 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7249 ret = -ENXIO; 7250 goto out_complete_io_ret; 7251 } 7252 7253 ns = bio->io_path->nvme_ns->ns; 7254 qpair = bio->io_path->qpair->qpair; 7255 7256 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7257 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7258 sizeof(bio->zone_report_buf->descs[0]); 7259 7260 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7261 ret = -EINVAL; 7262 goto out_complete_io_ret; 7263 } 7264 7265 if (!bio->zone_report_buf->nr_zones) { 7266 ret = -EINVAL; 7267 goto out_complete_io_ret; 7268 } 7269 7270 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7271 ret = fill_zone_from_report(&info[bio->handled_zones], 7272 &bio->zone_report_buf->descs[i]); 7273 if (ret) { 7274 goto out_complete_io_ret; 7275 } 7276 bio->handled_zones++; 7277 } 7278 7279 if (bio->handled_zones < zones_to_copy) { 7280 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7281 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7282 
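		/* More zones were requested than fit into a single report; reuse the
		 * buffer and issue another Report Zones starting at the next
		 * unhandled zone.
		 */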
7283 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7284 ret = spdk_nvme_zns_report_zones(ns, qpair, 7285 bio->zone_report_buf, zone_report_bufsize, 7286 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7287 bdev_nvme_get_zone_info_done, bio); 7288 if (!ret) { 7289 return; 7290 } else { 7291 goto out_complete_io_ret; 7292 } 7293 } 7294 7295 out_complete_io_nvme_cpl: 7296 free(bio->zone_report_buf); 7297 bio->zone_report_buf = NULL; 7298 bdev_nvme_io_complete_nvme_status(bio, cpl); 7299 return; 7300 7301 out_complete_io_ret: 7302 free(bio->zone_report_buf); 7303 bio->zone_report_buf = NULL; 7304 bdev_nvme_io_complete(bio, ret); 7305 } 7306 7307 static void 7308 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7309 { 7310 struct nvme_bdev_io *bio = ref; 7311 7312 bdev_nvme_io_complete_nvme_status(bio, cpl); 7313 } 7314 7315 static void 7316 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7317 { 7318 struct nvme_bdev_io *bio = ctx; 7319 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7320 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7321 7322 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7323 7324 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7325 } 7326 7327 static void 7328 bdev_nvme_abort_complete(void *ctx) 7329 { 7330 struct nvme_bdev_io *bio = ctx; 7331 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7332 7333 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7334 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7335 } else { 7336 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7337 } 7338 } 7339 7340 static void 7341 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7342 { 7343 struct nvme_bdev_io *bio = ref; 7344 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7345 7346 bio->cpl = *cpl; 7347 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7348 } 7349 7350 static void 7351 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7352 { 7353 struct nvme_bdev_io *bio = ref; 7354 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7355 7356 bio->cpl = *cpl; 7357 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7358 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7359 } 7360 7361 static void 7362 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7363 { 7364 struct nvme_bdev_io *bio = ref; 7365 struct iovec *iov; 7366 7367 bio->iov_offset = sgl_offset; 7368 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7369 iov = &bio->iovs[bio->iovpos]; 7370 if (bio->iov_offset < iov->iov_len) { 7371 break; 7372 } 7373 7374 bio->iov_offset -= iov->iov_len; 7375 } 7376 } 7377 7378 static int 7379 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7380 { 7381 struct nvme_bdev_io *bio = ref; 7382 struct iovec *iov; 7383 7384 assert(bio->iovpos < bio->iovcnt); 7385 7386 iov = &bio->iovs[bio->iovpos]; 7387 7388 *address = iov->iov_base; 7389 *length = iov->iov_len; 7390 7391 if (bio->iov_offset) { 7392 assert(bio->iov_offset <= iov->iov_len); 7393 *address += bio->iov_offset; 7394 *length -= bio->iov_offset; 7395 } 7396 7397 bio->iov_offset += *length; 7398 if (bio->iov_offset == iov->iov_len) { 7399 bio->iovpos++; 7400 bio->iov_offset = 0; 7401 } 7402 7403 return 0; 7404 } 7405 7406 static void 7407 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7408 { 7409 struct nvme_bdev_io *bio = ref; 7410 struct iovec *iov; 7411 7412 bio->fused_iov_offset = sgl_offset; 
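	/* Walk the fused (write) iovecs until the requested offset falls inside
	 * the current element, mirroring bdev_nvme_queued_reset_sgl() above.
	 */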
7413 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7414 iov = &bio->fused_iovs[bio->fused_iovpos]; 7415 if (bio->fused_iov_offset < iov->iov_len) { 7416 break; 7417 } 7418 7419 bio->fused_iov_offset -= iov->iov_len; 7420 } 7421 } 7422 7423 static int 7424 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7425 { 7426 struct nvme_bdev_io *bio = ref; 7427 struct iovec *iov; 7428 7429 assert(bio->fused_iovpos < bio->fused_iovcnt); 7430 7431 iov = &bio->fused_iovs[bio->fused_iovpos]; 7432 7433 *address = iov->iov_base; 7434 *length = iov->iov_len; 7435 7436 if (bio->fused_iov_offset) { 7437 assert(bio->fused_iov_offset <= iov->iov_len); 7438 *address += bio->fused_iov_offset; 7439 *length -= bio->fused_iov_offset; 7440 } 7441 7442 bio->fused_iov_offset += *length; 7443 if (bio->fused_iov_offset == iov->iov_len) { 7444 bio->fused_iovpos++; 7445 bio->fused_iov_offset = 0; 7446 } 7447 7448 return 0; 7449 } 7450 7451 static int 7452 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7453 void *md, uint64_t lba_count, uint64_t lba) 7454 { 7455 int rc; 7456 7457 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7458 lba_count, lba); 7459 7460 bio->iovs = iov; 7461 bio->iovcnt = iovcnt; 7462 bio->iovpos = 0; 7463 bio->iov_offset = 0; 7464 7465 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7466 bio->io_path->qpair->qpair, 7467 lba, lba_count, 7468 bdev_nvme_no_pi_readv_done, bio, 0, 7469 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7470 md, 0, 0); 7471 7472 if (rc != 0 && rc != -ENOMEM) { 7473 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7474 } 7475 return rc; 7476 } 7477 7478 static int 7479 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7480 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7481 struct spdk_memory_domain *domain, void *domain_ctx, 7482 struct spdk_accel_sequence *seq) 7483 { 7484 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7485 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7486 int rc; 7487 7488 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7489 lba_count, lba); 7490 7491 bio->iovs = iov; 7492 bio->iovcnt = iovcnt; 7493 bio->iovpos = 0; 7494 bio->iov_offset = 0; 7495 7496 if (domain != NULL || seq != NULL) { 7497 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7498 bio->ext_opts.memory_domain = domain; 7499 bio->ext_opts.memory_domain_ctx = domain_ctx; 7500 bio->ext_opts.io_flags = flags; 7501 bio->ext_opts.metadata = md; 7502 bio->ext_opts.accel_sequence = seq; 7503 7504 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7505 bdev_nvme_readv_done, bio, 7506 bdev_nvme_queued_reset_sgl, 7507 bdev_nvme_queued_next_sge, 7508 &bio->ext_opts); 7509 } else if (iovcnt == 1) { 7510 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7511 md, lba, lba_count, bdev_nvme_readv_done, 7512 bio, flags, 0, 0); 7513 } else { 7514 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7515 bdev_nvme_readv_done, bio, flags, 7516 bdev_nvme_queued_reset_sgl, 7517 bdev_nvme_queued_next_sge, md, 0, 0); 7518 } 7519 7520 if (rc != 0 && rc != -ENOMEM) { 7521 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7522 } 7523 return rc; 7524 } 7525 7526 static int 7527 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7528 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7529 
struct spdk_memory_domain *domain, void *domain_ctx, 7530 struct spdk_accel_sequence *seq) 7531 { 7532 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7533 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7534 int rc; 7535 7536 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7537 lba_count, lba); 7538 7539 bio->iovs = iov; 7540 bio->iovcnt = iovcnt; 7541 bio->iovpos = 0; 7542 bio->iov_offset = 0; 7543 7544 if (domain != NULL || seq != NULL) { 7545 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7546 bio->ext_opts.memory_domain = domain; 7547 bio->ext_opts.memory_domain_ctx = domain_ctx; 7548 bio->ext_opts.io_flags = flags; 7549 bio->ext_opts.metadata = md; 7550 bio->ext_opts.accel_sequence = seq; 7551 7552 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7553 bdev_nvme_writev_done, bio, 7554 bdev_nvme_queued_reset_sgl, 7555 bdev_nvme_queued_next_sge, 7556 &bio->ext_opts); 7557 } else if (iovcnt == 1) { 7558 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7559 md, lba, lba_count, bdev_nvme_writev_done, 7560 bio, flags, 0, 0); 7561 } else { 7562 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7563 bdev_nvme_writev_done, bio, flags, 7564 bdev_nvme_queued_reset_sgl, 7565 bdev_nvme_queued_next_sge, md, 0, 0); 7566 } 7567 7568 if (rc != 0 && rc != -ENOMEM) { 7569 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7570 } 7571 return rc; 7572 } 7573 7574 static int 7575 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7576 void *md, uint64_t lba_count, uint64_t zslba, 7577 uint32_t flags) 7578 { 7579 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7580 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7581 int rc; 7582 7583 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7584 lba_count, zslba); 7585 7586 bio->iovs = iov; 7587 bio->iovcnt = iovcnt; 7588 bio->iovpos = 0; 7589 bio->iov_offset = 0; 7590 7591 if (iovcnt == 1) { 7592 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7593 lba_count, 7594 bdev_nvme_zone_appendv_done, bio, 7595 flags, 7596 0, 0); 7597 } else { 7598 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7599 bdev_nvme_zone_appendv_done, bio, flags, 7600 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7601 md, 0, 0); 7602 } 7603 7604 if (rc != 0 && rc != -ENOMEM) { 7605 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7606 } 7607 return rc; 7608 } 7609 7610 static int 7611 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7612 void *md, uint64_t lba_count, uint64_t lba, 7613 uint32_t flags) 7614 { 7615 int rc; 7616 7617 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7618 lba_count, lba); 7619 7620 bio->iovs = iov; 7621 bio->iovcnt = iovcnt; 7622 bio->iovpos = 0; 7623 bio->iov_offset = 0; 7624 7625 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7626 bio->io_path->qpair->qpair, 7627 lba, lba_count, 7628 bdev_nvme_comparev_done, bio, flags, 7629 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7630 md, 0, 0); 7631 7632 if (rc != 0 && rc != -ENOMEM) { 7633 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7634 } 7635 return rc; 7636 } 7637 7638 static int 7639 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7640 struct iovec *write_iov, int write_iovcnt, 7641 void *md, uint64_t lba_count, uint64_t lba, uint32_t 
flags) 7642 { 7643 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7644 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7645 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7646 int rc; 7647 7648 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7649 lba_count, lba); 7650 7651 bio->iovs = cmp_iov; 7652 bio->iovcnt = cmp_iovcnt; 7653 bio->iovpos = 0; 7654 bio->iov_offset = 0; 7655 bio->fused_iovs = write_iov; 7656 bio->fused_iovcnt = write_iovcnt; 7657 bio->fused_iovpos = 0; 7658 bio->fused_iov_offset = 0; 7659 7660 if (bdev_io->num_retries == 0) { 7661 bio->first_fused_submitted = false; 7662 bio->first_fused_completed = false; 7663 } 7664 7665 if (!bio->first_fused_submitted) { 7666 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7667 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7668 7669 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7670 bdev_nvme_comparev_and_writev_done, bio, flags, 7671 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7672 if (rc == 0) { 7673 bio->first_fused_submitted = true; 7674 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7675 } else { 7676 if (rc != -ENOMEM) { 7677 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7678 } 7679 return rc; 7680 } 7681 } 7682 7683 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7684 7685 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7686 bdev_nvme_comparev_and_writev_done, bio, flags, 7687 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7688 if (rc != 0 && rc != -ENOMEM) { 7689 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7690 rc = 0; 7691 } 7692 7693 return rc; 7694 } 7695 7696 static int 7697 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7698 { 7699 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7700 struct spdk_nvme_dsm_range *range; 7701 uint64_t offset, remaining; 7702 uint64_t num_ranges_u64; 7703 uint16_t num_ranges; 7704 int rc; 7705 7706 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7707 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7708 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7709 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7710 return -EINVAL; 7711 } 7712 num_ranges = (uint16_t)num_ranges_u64; 7713 7714 offset = offset_blocks; 7715 remaining = num_blocks; 7716 range = &dsm_ranges[0]; 7717 7718 /* Fill max-size ranges until the remaining blocks fit into one range */ 7719 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7720 range->attributes.raw = 0; 7721 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7722 range->starting_lba = offset; 7723 7724 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7725 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7726 range++; 7727 } 7728 7729 /* Final range describes the remaining blocks */ 7730 range->attributes.raw = 0; 7731 range->length = remaining; 7732 range->starting_lba = offset; 7733 7734 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7735 bio->io_path->qpair->qpair, 7736 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7737 dsm_ranges, num_ranges, 7738 bdev_nvme_queued_done, bio); 7739 7740 return rc; 7741 } 7742 7743 static int 7744 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7745 { 7746 if (num_blocks > UINT16_MAX + 1) { 7747 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 
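		/* The Write Zeroes NLB field is 16 bits and 0's-based, so a single
		 * command can cover at most 65536 blocks.
		 */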
static int
bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
{
	if (num_blocks > UINT16_MAX + 1) {
		SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
		return -EINVAL;
	}

	return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
					     bio->io_path->qpair->qpair,
					     offset_blocks, num_blocks,
					     bdev_nvme_queued_done, bio,
					     0);
}

static int
bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
			struct spdk_bdev_zone_info *info)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
	uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
	uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);

	if (zone_id % zone_size != 0) {
		return -EINVAL;
	}

	if (num_zones > total_zones || !num_zones) {
		return -EINVAL;
	}

	assert(!bio->zone_report_buf);
	bio->zone_report_buf = calloc(1, zone_report_bufsize);
	if (!bio->zone_report_buf) {
		return -ENOMEM;
	}

	bio->handled_zones = 0;

	return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
					  zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
					  bdev_nvme_get_zone_info_done, bio);
}

static int
bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
			  enum spdk_bdev_zone_action action)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;

	switch (action) {
	case SPDK_BDEV_ZONE_CLOSE:
		return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_FINISH:
		return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
						 bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OPEN:
		return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
					       bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_RESET:
		return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
						bdev_nvme_zone_management_done, bio);
	case SPDK_BDEV_ZONE_OFFLINE:
		return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
						  bdev_nvme_zone_management_done, bio);
	default:
		return -EINVAL;
	}
}
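/*
 * Admin passthrough is not bound to a single I/O path: the command is issued
 * on the admin queue of the first available (non-failed) controller in the
 * channel's I/O path list. If no controller is available the bdev I/O is
 * completed with -ENXIO, and a request larger than the controller's MDTS is
 * completed with -EINVAL.
 */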
static void
bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
			 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
{
	struct nvme_io_path *io_path;
	struct nvme_ctrlr *nvme_ctrlr;
	uint32_t max_xfer_size;
	int rc = -ENXIO;

	/* Choose the first ctrlr which is not failed. */
	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		nvme_ctrlr = io_path->qpair->ctrlr;

		/* We should skip any unavailable nvme_ctrlr rather than checking
		 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
		 */
		if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
			continue;
		}

		max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);

		if (nbytes > max_xfer_size) {
			SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
			rc = -EINVAL;
			goto err;
		}

		rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
						   bdev_nvme_admin_passthru_done, bio);
		if (rc == 0) {
			return;
		}
	}

err:
	bdev_nvme_admin_complete(bio, rc);
}

static int
bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
		      void *buf, size_t nbytes)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
					  (uint32_t)nbytes, bdev_nvme_queued_done, bio);
}
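/*
 * I/O passthrough with a separate metadata buffer: nbytes must fit within the
 * namespace MDTS, and md_len must equal the number of logical blocks implied
 * by nbytes (derived here from the extended sector size) multiplied by the
 * namespace metadata size; otherwise the request is rejected with -EINVAL.
 */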
static int
bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
			 void *buf, size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
	 * so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
			(uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
}

static int
bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
			  struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
			  size_t nbytes, void *md_buf, size_t md_len)
{
	struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
	struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
	size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
	uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
	struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (nbytes > max_xfer_size) {
		SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
		return -EINVAL;
	}

	if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
		SPDK_ERRLOG("invalid meta data buffer size\n");
		return -EINVAL;
	}

	/*
	 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
	 * require a nsid, so fill it out automatically.
	 */
	cmd->nsid = spdk_nvme_ns_get_id(ns);

	return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
		       ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
		       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
}

static void
bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
		struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_io_path *io_path;
	int rc = 0;

	rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
	if (rc == 0) {
		bdev_nvme_admin_complete(bio, 0);
		return;
	}

	io_path = bio_to_abort->io_path;
	if (io_path != NULL) {
		rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
						   io_path->qpair->qpair,
						   bio_to_abort,
						   bdev_nvme_abort_done, bio);
	} else {
		STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
			rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
							   NULL,
							   bio_to_abort,
							   bdev_nvme_abort_done, bio);

			if (rc != -ENOENT) {
				break;
			}
		}
	}

	if (rc != 0) {
		/* If no command was found or there was any error, complete the abort
		 * request with failure.
		 */
		bdev_nvme_admin_complete(bio, rc);
	}
}
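/*
 * Simple Copy: the source range nlb field is 0's based, hence num_blocks - 1.
 * Only a single source range is built here, so the copy length is in practice
 * bounded by the namespace's reported Maximum Single Source Range Length
 * (MSSRL).
 */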
static int
bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
	       uint64_t num_blocks)
{
	struct spdk_nvme_scc_source_range range = {
		.slba = src_offset_blocks,
		.nlb = num_blocks - 1
	};

	return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
				     bio->io_path->qpair->qpair,
				     &range, 1, dst_offset_blocks,
				     bdev_nvme_queued_done, bio);
}

static void
bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
{
	const char *action;

	if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
		action = "reset";
	} else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
		action = "abort";
	} else {
		action = "none";
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "action_on_timeout", action);
	spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
	spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
	spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
	spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
	spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
	spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
	spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
	spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
	spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
	spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
	spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
	spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
	spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
	spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
	spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
	spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
	spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
	spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
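/*
 * bdev_nvme_opts_config_json() above replays the current global options as a
 * "bdev_nvme_set_options" RPC during JSON config dump. Illustrative fragment
 * of the emitted object (values shown are examples only; real output reflects
 * g_opts at dump time):
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": {
 *       "action_on_timeout": "none",
 *       "timeout_us": 0,
 *       "keep_alive_timeout_ms": 10000,
 *       "transport_retry_count": 4,
 *       ...
 *     }
 *   }
 */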
static void
bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
{
	struct spdk_nvme_transport_id trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", ctx->name);
	spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);

	trid = ctx->trid;
	memset(trid.subnqn, 0, sizeof(trid.subnqn));
	nvme_bdev_dump_trid_json(&trid, w);

	spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     ctx->bdev_opts.fast_io_fail_timeout_sec);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

#ifdef SPDK_CONFIG_NVME_CUSE
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
					  cuse_name, &cuse_name_size) != 0) {
		return;
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
#endif

static void
nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_transport_id *trid;
	const struct spdk_nvme_ctrlr_opts *opts;

	if (nvme_ctrlr->opts.from_discovery_service) {
		/* Do not emit an RPC for this - it will be implicitly
		 * covered by a separate bdev_nvme_start_discovery or
		 * bdev_nvme_start_mdns_discovery RPC.
		 */
		return;
	}

	trid = &nvme_ctrlr->active_path_id->trid;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	nvme_bdev_dump_trid_json(trid, w);
	spdk_json_write_named_bool(w, "prchk_reftag",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
	spdk_json_write_named_bool(w, "prchk_guard",
				   (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
	spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
	spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
	spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
				     nvme_ctrlr->opts.fast_io_fail_timeout_sec);
	if (nvme_ctrlr->opts.psk_path[0] != '\0') {
		spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk_path);
	}

	opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
	spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
	spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
	spdk_json_write_named_bool(w, "ddgst", opts->data_digest);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
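/*
 * nvme_ctrlr_config_json() above emits one "bdev_nvme_attach_controller" RPC
 * per configured controller path. Illustrative shape of the emitted object
 * (values are examples only; transport fields come from
 * nvme_bdev_dump_trid_json()):
 *
 *   {
 *     "method": "bdev_nvme_attach_controller",
 *     "params": {
 *       "name": "Nvme0",
 *       "trtype": "TCP",
 *       "traddr": "127.0.0.1",
 *       "trsvcid": "4420",
 *       "prchk_reftag": false,
 *       "prchk_guard": false,
 *       "ctrlr_loss_timeout_sec": 0,
 *       "reconnect_delay_sec": 0,
 *       "fast_io_fail_timeout_sec": 0,
 *       "hostnqn": "nqn.2014-08.org.nvmexpress:uuid:...",
 *       "hdgst": false,
 *       "ddgst": false
 *     }
 *   }
 */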
static void
bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
	spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

static int
bdev_nvme_config_json(struct spdk_json_write_ctx *w)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr;
	struct discovery_ctx *ctx;

	bdev_nvme_opts_config_json(w);

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
			nvme_ctrlr_config_json(w, nvme_ctrlr);

#ifdef SPDK_CONFIG_NVME_CUSE
			nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
#endif
		}
	}

	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		if (!ctx->from_mdns_discovery_service) {
			bdev_nvme_discovery_config_json(w, ctx);
		}
	}

	bdev_nvme_mdns_discovery_config_json(w);

	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
	 * before enabling hotplug poller.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	struct nvme_bdev *nbdev;
	struct nvme_ns *nvme_ns;

	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
	nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
	assert(nvme_ns != NULL);

	return nvme_ns->ctrlr->ctrlr;
}

void
nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
{
	struct nvme_ns *nvme_ns = io_path->nvme_ns;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	const char *adrfam_str;

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);

	cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);

	spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
	spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL &&
				   io_path == io_path->nbdev_ch->current_io_path);
	spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
	spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));

	spdk_json_write_named_object_begin(w, "transport");
	spdk_json_write_named_string(w, "trtype", trid->trstring);
	spdk_json_write_named_string(w, "traddr", trid->traddr);
	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}
	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
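/*
 * Illustrative shape of the per-path object written by nvme_io_path_info_json()
 * above (values are examples only):
 *
 *   {
 *     "bdev_name": "Nvme0n1",
 *     "cntlid": 1,
 *     "current": true,
 *     "connected": true,
 *     "accessible": true,
 *     "transport": { "trtype": "TCP", "traddr": "127.0.0.1",
 *                    "trsvcid": "4420", "adrfam": "IPv4" }
 *   }
 */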
void
bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
{
	struct discovery_ctx *ctx;
	struct discovery_entry_ctx *entry_ctx;

	spdk_json_write_array_begin(w);
	TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "name", ctx->name);

		spdk_json_write_named_object_begin(w, "trid");
		nvme_bdev_dump_trid_json(&ctx->trid, w);
		spdk_json_write_object_end(w);

		spdk_json_write_named_array_begin(w, "referrals");
		TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
			spdk_json_write_object_begin(w);
			spdk_json_write_named_object_begin(w, "trid");
			nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
			spdk_json_write_object_end(w);
			spdk_json_write_object_end(w);
		}
		spdk_json_write_array_end(w);

		spdk_json_write_object_end(w);
	}
	spdk_json_write_array_end(w);
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)

SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 1,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
			OWNER_NONE, OBJECT_BDEV_NVME_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		}
	};

	spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
}