1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 36 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 37 38 #define NSID_STR_LEN 10 39 40 #define SPDK_CONTROLLER_NAME_MAX 512 41 42 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 43 44 struct nvme_bdev_io { 45 /** array of iovecs to transfer. */ 46 struct iovec *iovs; 47 48 /** Number of iovecs in iovs array. */ 49 int iovcnt; 50 51 /** Current iovec position. */ 52 int iovpos; 53 54 /** Offset in current iovec. */ 55 uint32_t iov_offset; 56 57 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 58 * being reset in a reset I/O. 59 */ 60 struct nvme_io_path *io_path; 61 62 /** array of iovecs to transfer. */ 63 struct iovec *fused_iovs; 64 65 /** Number of iovecs in iovs array. */ 66 int fused_iovcnt; 67 68 /** Current iovec position. */ 69 int fused_iovpos; 70 71 /** Offset in current iovec. */ 72 uint32_t fused_iov_offset; 73 74 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 75 struct spdk_nvme_cpl cpl; 76 77 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 78 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 79 80 /** Keeps track if first of fused commands was submitted */ 81 bool first_fused_submitted; 82 83 /** Keeps track if first of fused commands was completed */ 84 bool first_fused_completed; 85 86 /** Temporary pointer to zone report buffer */ 87 struct spdk_nvme_zns_zone_report *zone_report_buf; 88 89 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 90 uint64_t handled_zones; 91 92 /** Expiration value in ticks to retry the current I/O. */ 93 uint64_t retry_ticks; 94 95 /* How many times the current I/O was retried. */ 96 int32_t retry_count; 97 98 /* Current tsc at submit time. 
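* Used on completion to compute per-io_path latency statistics (see bdev_nvme_update_io_path_stat()).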
*/ 99 uint64_t submit_tsc; 100 }; 101 102 struct nvme_probe_skip_entry { 103 struct spdk_nvme_transport_id trid; 104 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 105 }; 106 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 107 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 108 g_skipped_nvme_ctrlrs); 109 110 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 111 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 112 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 113 114 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 116 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 117 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 120 121 static struct spdk_bdev_nvme_opts g_opts = { 122 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 123 .timeout_us = 0, 124 .timeout_admin_us = 0, 125 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 126 .transport_retry_count = 4, 127 .arbitration_burst = 0, 128 .low_priority_weight = 0, 129 .medium_priority_weight = 0, 130 .high_priority_weight = 0, 131 .nvme_adminq_poll_period_us = 10000ULL, 132 .nvme_ioq_poll_period_us = 0, 133 .io_queue_requests = 0, 134 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 135 .bdev_retry_count = 3, 136 .transport_ack_timeout = 0, 137 .ctrlr_loss_timeout_sec = 0, 138 .reconnect_delay_sec = 0, 139 .fast_io_fail_timeout_sec = 0, 140 .disable_auto_failback = false, 141 .generate_uuids = false, 142 .transport_tos = 0, 143 .nvme_error_stat = false, 144 .io_path_stat = false, 145 .allow_accel_sequence = false, 146 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 147 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 148 }; 149 150 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 151 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 152 153 static int g_hot_insert_nvme_controller_index = 0; 154 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 155 static bool g_nvme_hotplug_enabled = false; 156 struct spdk_thread *g_bdev_nvme_init_thread; 157 static struct spdk_poller *g_hotplug_poller; 158 static struct spdk_poller *g_hotplug_probe_poller; 159 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 160 161 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 162 struct nvme_async_probe_ctx *ctx); 163 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 164 struct nvme_async_probe_ctx *ctx); 165 static int bdev_nvme_library_init(void); 166 static void bdev_nvme_library_fini(void); 167 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 168 struct spdk_bdev_io *bdev_io); 169 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 170 struct spdk_bdev_io *bdev_io); 171 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 172 void *md, uint64_t lba_count, uint64_t lba, 173 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 174 struct spdk_accel_sequence *seq); 175 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 176 void *md, uint64_t lba_count, uint64_t lba); 177 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 178 void *md, uint64_t lba_count, uint64_t lba, 179 uint32_t flags, struct spdk_memory_domain *domain, void 
*domain_ctx, 180 struct spdk_accel_sequence *seq, 181 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13); 182 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 183 void *md, uint64_t lba_count, 184 uint64_t zslba, uint32_t flags); 185 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 186 void *md, uint64_t lba_count, uint64_t lba, 187 uint32_t flags); 188 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 189 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 190 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 191 uint32_t flags); 192 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 193 uint32_t num_zones, struct spdk_bdev_zone_info *info); 194 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 195 enum spdk_bdev_zone_action action); 196 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 197 struct nvme_bdev_io *bio, 198 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 199 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 200 void *buf, size_t nbytes); 201 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 202 void *buf, size_t nbytes, void *md_buf, size_t md_len); 203 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 204 struct iovec *iov, int iovcnt, size_t nbytes, 205 void *md_buf, size_t md_len); 206 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 207 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 208 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 209 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 210 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 211 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 212 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 213 214 static struct nvme_ns *nvme_ns_alloc(void); 215 static void nvme_ns_free(struct nvme_ns *ns); 216 217 static int 218 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 219 { 220 return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; 221 } 222 223 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 224 225 struct spdk_nvme_qpair * 226 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 227 { 228 struct nvme_ctrlr_channel *ctrlr_ch; 229 230 assert(ctrlr_io_ch != NULL); 231 232 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 233 234 return ctrlr_ch->qpair->qpair; 235 } 236 237 static int 238 bdev_nvme_get_ctx_size(void) 239 { 240 return sizeof(struct nvme_bdev_io); 241 } 242 243 static struct spdk_bdev_module nvme_if = { 244 .name = "nvme", 245 .async_fini = true, 246 .module_init = bdev_nvme_library_init, 247 .module_fini = bdev_nvme_library_fini, 248 .config_json = bdev_nvme_config_json, 249 .get_ctx_size = bdev_nvme_get_ctx_size, 250 251 }; 252 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 253 254 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 255 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 256 bool g_bdev_nvme_module_finish; 257 258 struct nvme_bdev_ctrlr * 259 nvme_bdev_ctrlr_get_by_name(const char *name) 260 { 261 struct nvme_bdev_ctrlr *nbdev_ctrlr; 262 263 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 264 if (strcmp(name, nbdev_ctrlr->name) == 0) { 265 break; 266 } 267 } 268 269 return nbdev_ctrlr; 270 } 271 272 static struct nvme_ctrlr * 273 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 274 const struct spdk_nvme_transport_id *trid) 275 { 276 struct nvme_ctrlr *nvme_ctrlr; 277 278 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 279 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 280 break; 281 } 282 } 283 284 return nvme_ctrlr; 285 } 286 287 struct nvme_ctrlr * 288 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 289 uint16_t cntlid) 290 { 291 struct nvme_ctrlr *nvme_ctrlr; 292 const struct spdk_nvme_ctrlr_data *cdata; 293 294 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 295 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 296 if (cdata->cntlid == cntlid) { 297 break; 298 } 299 } 300 301 return nvme_ctrlr; 302 } 303 304 static struct nvme_bdev * 305 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 306 { 307 struct nvme_bdev *bdev; 308 309 pthread_mutex_lock(&g_bdev_nvme_mutex); 310 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 311 if (bdev->nsid == nsid) { 312 break; 313 } 314 } 315 pthread_mutex_unlock(&g_bdev_nvme_mutex); 316 317 return bdev; 318 } 319 320 struct nvme_ns * 321 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 322 { 323 struct nvme_ns ns; 324 325 assert(nsid > 0); 326 327 ns.id = nsid; 328 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 329 } 330 331 struct nvme_ns * 332 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 333 { 334 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 335 } 336 337 struct nvme_ns * 338 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 339 { 340 if (ns == NULL) { 341 return NULL; 342 } 343 344 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 345 } 346 347 static struct nvme_ctrlr * 348 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 349 { 350 struct nvme_bdev_ctrlr *nbdev_ctrlr; 351 struct nvme_ctrlr *nvme_ctrlr = NULL; 352 353 pthread_mutex_lock(&g_bdev_nvme_mutex); 354 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 355 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 356 if (nvme_ctrlr != NULL) { 357 break; 358 } 359 } 
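/* nvme_ctrlr is NULL here if no registered controller has an active path whose trid matches. */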
360 pthread_mutex_unlock(&g_bdev_nvme_mutex); 361 362 return nvme_ctrlr; 363 } 364 365 struct nvme_ctrlr * 366 nvme_ctrlr_get_by_name(const char *name) 367 { 368 struct nvme_bdev_ctrlr *nbdev_ctrlr; 369 struct nvme_ctrlr *nvme_ctrlr = NULL; 370 371 if (name == NULL) { 372 return NULL; 373 } 374 375 pthread_mutex_lock(&g_bdev_nvme_mutex); 376 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 377 if (nbdev_ctrlr != NULL) { 378 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 379 } 380 pthread_mutex_unlock(&g_bdev_nvme_mutex); 381 382 return nvme_ctrlr; 383 } 384 385 void 386 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 387 { 388 struct nvme_bdev_ctrlr *nbdev_ctrlr; 389 390 pthread_mutex_lock(&g_bdev_nvme_mutex); 391 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 392 fn(nbdev_ctrlr, ctx); 393 } 394 pthread_mutex_unlock(&g_bdev_nvme_mutex); 395 } 396 397 void 398 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 399 { 400 const char *trtype_str; 401 const char *adrfam_str; 402 403 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 404 if (trtype_str) { 405 spdk_json_write_named_string(w, "trtype", trtype_str); 406 } 407 408 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 409 if (adrfam_str) { 410 spdk_json_write_named_string(w, "adrfam", adrfam_str); 411 } 412 413 if (trid->traddr[0] != '\0') { 414 spdk_json_write_named_string(w, "traddr", trid->traddr); 415 } 416 417 if (trid->trsvcid[0] != '\0') { 418 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 419 } 420 421 if (trid->subnqn[0] != '\0') { 422 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 423 } 424 } 425 426 static void 427 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 428 struct nvme_ctrlr *nvme_ctrlr) 429 { 430 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 431 pthread_mutex_lock(&g_bdev_nvme_mutex); 432 433 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 434 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 435 pthread_mutex_unlock(&g_bdev_nvme_mutex); 436 437 return; 438 } 439 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 440 441 pthread_mutex_unlock(&g_bdev_nvme_mutex); 442 443 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 444 445 free(nbdev_ctrlr->name); 446 free(nbdev_ctrlr); 447 } 448 449 static void 450 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 451 { 452 struct nvme_path_id *path_id, *tmp_path; 453 struct nvme_ns *ns, *tmp_ns; 454 455 free(nvme_ctrlr->copied_ana_desc); 456 spdk_free(nvme_ctrlr->ana_log_page); 457 458 if (nvme_ctrlr->opal_dev) { 459 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 460 nvme_ctrlr->opal_dev = NULL; 461 } 462 463 if (nvme_ctrlr->nbdev_ctrlr) { 464 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 465 } 466 467 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 468 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 469 nvme_ns_free(ns); 470 } 471 472 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 473 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 474 free(path_id); 475 } 476 477 pthread_mutex_destroy(&nvme_ctrlr->mutex); 478 spdk_keyring_put_key(nvme_ctrlr->psk); 479 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 480 spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key); 481 free(nvme_ctrlr); 482 483 pthread_mutex_lock(&g_bdev_nvme_mutex); 484 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 485 pthread_mutex_unlock(&g_bdev_nvme_mutex); 486 
spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 487 spdk_bdev_module_fini_done(); 488 return; 489 } 490 pthread_mutex_unlock(&g_bdev_nvme_mutex); 491 } 492 493 static int 494 nvme_detach_poller(void *arg) 495 { 496 struct nvme_ctrlr *nvme_ctrlr = arg; 497 int rc; 498 499 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 500 if (rc != -EAGAIN) { 501 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 502 _nvme_ctrlr_delete(nvme_ctrlr); 503 } 504 505 return SPDK_POLLER_BUSY; 506 } 507 508 static void 509 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 510 { 511 int rc; 512 513 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 514 515 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 516 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 517 518 /* If we got here, the reset/detach poller cannot be active */ 519 assert(nvme_ctrlr->reset_detach_poller == NULL); 520 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 521 nvme_ctrlr, 1000); 522 if (nvme_ctrlr->reset_detach_poller == NULL) { 523 SPDK_ERRLOG("Failed to register detach poller\n"); 524 goto error; 525 } 526 527 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 528 if (rc != 0) { 529 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 530 goto error; 531 } 532 533 return; 534 error: 535 /* We don't have a good way to handle errors here, so just do what we can and delete the 536 * controller without detaching the underlying NVMe device. 537 */ 538 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 539 _nvme_ctrlr_delete(nvme_ctrlr); 540 } 541 542 static void 543 nvme_ctrlr_unregister_cb(void *io_device) 544 { 545 struct nvme_ctrlr *nvme_ctrlr = io_device; 546 547 nvme_ctrlr_delete(nvme_ctrlr); 548 } 549 550 static void 551 nvme_ctrlr_unregister(void *ctx) 552 { 553 struct nvme_ctrlr *nvme_ctrlr = ctx; 554 555 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 556 } 557 558 static bool 559 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 560 { 561 if (!nvme_ctrlr->destruct) { 562 return false; 563 } 564 565 if (nvme_ctrlr->ref > 0) { 566 return false; 567 } 568 569 if (nvme_ctrlr->resetting) { 570 return false; 571 } 572 573 if (nvme_ctrlr->ana_log_page_updating) { 574 return false; 575 } 576 577 if (nvme_ctrlr->io_path_cache_clearing) { 578 return false; 579 } 580 581 return true; 582 } 583 584 static void 585 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 586 { 587 pthread_mutex_lock(&nvme_ctrlr->mutex); 588 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 589 590 assert(nvme_ctrlr->ref > 0); 591 nvme_ctrlr->ref--; 592 593 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 594 pthread_mutex_unlock(&nvme_ctrlr->mutex); 595 return; 596 } 597 598 pthread_mutex_unlock(&nvme_ctrlr->mutex); 599 600 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 601 } 602 603 static void 604 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 605 { 606 nbdev_ch->current_io_path = NULL; 607 nbdev_ch->rr_counter = 0; 608 } 609 610 static struct nvme_io_path * 611 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 612 { 613 struct nvme_io_path *io_path; 614 615 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 616 if (io_path->nvme_ns == nvme_ns) { 617 break; 618 } 619 } 620 621 return io_path; 622 } 623 624 static struct nvme_io_path * 625 nvme_io_path_alloc(void) 626 { 
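/* Per-path statistics are allocated only when the io_path_stat option is enabled. */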
627 struct nvme_io_path *io_path; 628 629 io_path = calloc(1, sizeof(*io_path)); 630 if (io_path == NULL) { 631 SPDK_ERRLOG("Failed to alloc io_path.\n"); 632 return NULL; 633 } 634 635 if (g_opts.io_path_stat) { 636 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 637 if (io_path->stat == NULL) { 638 free(io_path); 639 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 640 return NULL; 641 } 642 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 643 } 644 645 return io_path; 646 } 647 648 static void 649 nvme_io_path_free(struct nvme_io_path *io_path) 650 { 651 free(io_path->stat); 652 free(io_path); 653 } 654 655 static int 656 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 657 { 658 struct nvme_io_path *io_path; 659 struct spdk_io_channel *ch; 660 struct nvme_ctrlr_channel *ctrlr_ch; 661 struct nvme_qpair *nvme_qpair; 662 663 io_path = nvme_io_path_alloc(); 664 if (io_path == NULL) { 665 return -ENOMEM; 666 } 667 668 io_path->nvme_ns = nvme_ns; 669 670 ch = spdk_get_io_channel(nvme_ns->ctrlr); 671 if (ch == NULL) { 672 nvme_io_path_free(io_path); 673 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 674 return -ENOMEM; 675 } 676 677 ctrlr_ch = spdk_io_channel_get_ctx(ch); 678 679 nvme_qpair = ctrlr_ch->qpair; 680 assert(nvme_qpair != NULL); 681 682 io_path->qpair = nvme_qpair; 683 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 684 685 io_path->nbdev_ch = nbdev_ch; 686 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 687 688 bdev_nvme_clear_current_io_path(nbdev_ch); 689 690 return 0; 691 } 692 693 static void 694 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 695 struct nvme_io_path *io_path) 696 { 697 struct spdk_bdev_io *bdev_io; 698 struct nvme_bdev_io *bio; 699 700 TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) { 701 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 702 if (bio->io_path == io_path) { 703 bio->io_path = NULL; 704 } 705 } 706 } 707 708 static void 709 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 710 { 711 struct spdk_io_channel *ch; 712 struct nvme_qpair *nvme_qpair; 713 struct nvme_ctrlr_channel *ctrlr_ch; 714 struct nvme_bdev *nbdev; 715 716 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 717 718 /* Add the statistics to nvme_ns before this path is destroyed. */ 719 pthread_mutex_lock(&nbdev->mutex); 720 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 721 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 722 } 723 pthread_mutex_unlock(&nbdev->mutex); 724 725 bdev_nvme_clear_current_io_path(nbdev_ch); 726 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 727 728 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 729 io_path->nbdev_ch = NULL; 730 731 nvme_qpair = io_path->qpair; 732 assert(nvme_qpair != NULL); 733 734 ctrlr_ch = nvme_qpair->ctrlr_ch; 735 assert(ctrlr_ch != NULL); 736 737 ch = spdk_io_channel_from_ctx(ctrlr_ch); 738 spdk_put_io_channel(ch); 739 740 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 741 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 742 * io_path here but free the io_path when the associated qpair is freed. It is ensured 743 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
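* Until then the io_path stays linked on its nvme_qpair's io_path_list.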
744 */ 745 } 746 747 static void 748 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 749 { 750 struct nvme_io_path *io_path, *tmp_io_path; 751 752 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 753 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 754 } 755 } 756 757 static int 758 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 759 { 760 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 761 struct nvme_bdev *nbdev = io_device; 762 struct nvme_ns *nvme_ns; 763 int rc; 764 765 STAILQ_INIT(&nbdev_ch->io_path_list); 766 TAILQ_INIT(&nbdev_ch->retry_io_list); 767 768 pthread_mutex_lock(&nbdev->mutex); 769 770 nbdev_ch->mp_policy = nbdev->mp_policy; 771 nbdev_ch->mp_selector = nbdev->mp_selector; 772 nbdev_ch->rr_min_io = nbdev->rr_min_io; 773 774 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 775 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 776 if (rc != 0) { 777 pthread_mutex_unlock(&nbdev->mutex); 778 779 _bdev_nvme_delete_io_paths(nbdev_ch); 780 return rc; 781 } 782 } 783 pthread_mutex_unlock(&nbdev->mutex); 784 785 return 0; 786 } 787 788 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 789 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 790 */ 791 static inline void 792 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 793 const struct spdk_nvme_cpl *cpl) 794 { 795 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 796 (uintptr_t)bdev_io); 797 if (cpl) { 798 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 799 } else { 800 spdk_bdev_io_complete(bdev_io, status); 801 } 802 } 803 804 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 805 806 static void 807 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 808 { 809 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 810 811 bdev_nvme_abort_retry_ios(nbdev_ch); 812 _bdev_nvme_delete_io_paths(nbdev_ch); 813 } 814 815 static inline bool 816 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 817 { 818 switch (io_type) { 819 case SPDK_BDEV_IO_TYPE_RESET: 820 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 821 case SPDK_BDEV_IO_TYPE_ABORT: 822 return true; 823 default: 824 break; 825 } 826 827 return false; 828 } 829 830 static inline bool 831 nvme_ns_is_active(struct nvme_ns *nvme_ns) 832 { 833 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 834 return false; 835 } 836 837 if (spdk_unlikely(nvme_ns->ns == NULL)) { 838 return false; 839 } 840 841 return true; 842 } 843 844 static inline bool 845 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 846 { 847 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 848 return false; 849 } 850 851 switch (nvme_ns->ana_state) { 852 case SPDK_NVME_ANA_OPTIMIZED_STATE: 853 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 854 return true; 855 default: 856 break; 857 } 858 859 return false; 860 } 861 862 static inline bool 863 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 864 { 865 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 866 return false; 867 } 868 869 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 870 SPDK_NVME_QPAIR_FAILURE_NONE)) { 871 return false; 872 } 873 874 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 875 return false; 876 } 877 878 return true; 879 } 880 881 static inline bool 882 nvme_io_path_is_available(struct nvme_io_path *io_path) 883 { 884 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 
885 return false; 886 } 887 888 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 889 return false; 890 } 891 892 return true; 893 } 894 895 static inline bool 896 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 897 { 898 if (nvme_ctrlr->destruct) { 899 return true; 900 } 901 902 if (nvme_ctrlr->fast_io_fail_timedout) { 903 return true; 904 } 905 906 if (nvme_ctrlr->resetting) { 907 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 908 return false; 909 } else { 910 return true; 911 } 912 } 913 914 if (nvme_ctrlr->reconnect_is_delayed) { 915 return false; 916 } 917 918 if (nvme_ctrlr->disabled) { 919 return true; 920 } 921 922 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 923 return true; 924 } else { 925 return false; 926 } 927 } 928 929 static bool 930 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 931 { 932 if (nvme_ctrlr->destruct) { 933 return false; 934 } 935 936 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 937 return false; 938 } 939 940 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 941 return false; 942 } 943 944 if (nvme_ctrlr->disabled) { 945 return false; 946 } 947 948 return true; 949 } 950 951 /* Simulate circular linked list. */ 952 static inline struct nvme_io_path * 953 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 954 { 955 struct nvme_io_path *next_path; 956 957 if (prev_path != NULL) { 958 next_path = STAILQ_NEXT(prev_path, stailq); 959 if (next_path != NULL) { 960 return next_path; 961 } 962 } 963 964 return STAILQ_FIRST(&nbdev_ch->io_path_list); 965 } 966 967 static struct nvme_io_path * 968 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 969 { 970 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 971 972 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 973 974 io_path = start; 975 do { 976 if (spdk_likely(nvme_io_path_is_available(io_path))) { 977 switch (io_path->nvme_ns->ana_state) { 978 case SPDK_NVME_ANA_OPTIMIZED_STATE: 979 nbdev_ch->current_io_path = io_path; 980 return io_path; 981 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 982 if (non_optimized == NULL) { 983 non_optimized = io_path; 984 } 985 break; 986 default: 987 assert(false); 988 break; 989 } 990 } 991 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 992 } while (io_path != start); 993 994 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 995 /* We come here only if there is no optimized path. Cache even non_optimized 996 * path for load balance across multiple non_optimized paths. 997 */ 998 nbdev_ch->current_io_path = non_optimized; 999 } 1000 1001 return non_optimized; 1002 } 1003 1004 static struct nvme_io_path * 1005 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1006 { 1007 struct nvme_io_path *io_path; 1008 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1009 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1010 uint32_t num_outstanding_reqs; 1011 1012 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1013 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1014 /* The device is currently resetting. 
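* Skip this path; it cannot accept I/O until the qpair is reconnected.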
*/ 1015 continue; 1016 } 1017 1018 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1019 continue; 1020 } 1021 1022 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1023 switch (io_path->nvme_ns->ana_state) { 1024 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1025 if (num_outstanding_reqs < opt_min_qd) { 1026 opt_min_qd = num_outstanding_reqs; 1027 optimized = io_path; 1028 } 1029 break; 1030 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1031 if (num_outstanding_reqs < non_opt_min_qd) { 1032 non_opt_min_qd = num_outstanding_reqs; 1033 non_optimized = io_path; 1034 } 1035 break; 1036 default: 1037 break; 1038 } 1039 } 1040 1041 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1042 if (optimized != NULL) { 1043 return optimized; 1044 } 1045 1046 return non_optimized; 1047 } 1048 1049 static inline struct nvme_io_path * 1050 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1051 { 1052 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1053 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1054 return nbdev_ch->current_io_path; 1055 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1056 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1057 return nbdev_ch->current_io_path; 1058 } 1059 nbdev_ch->rr_counter = 0; 1060 } 1061 } 1062 1063 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1064 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1065 return _bdev_nvme_find_io_path(nbdev_ch); 1066 } else { 1067 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1068 } 1069 } 1070 1071 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1072 * or false otherwise. 1073 * 1074 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1075 * is likely to be non-accessible now but may become accessible. 1076 * 1077 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1078 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1079 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1080 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
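* Callers use this result to decide whether a failed I/O should be queued for retry or completed with an error immediately.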
1081 */ 1082 static bool 1083 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1084 { 1085 struct nvme_io_path *io_path; 1086 1087 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1088 if (io_path->nvme_ns->ana_transition_timedout) { 1089 continue; 1090 } 1091 1092 if (nvme_qpair_is_connected(io_path->qpair) || 1093 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1094 return true; 1095 } 1096 } 1097 1098 return false; 1099 } 1100 1101 static void 1102 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1103 { 1104 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1105 struct spdk_io_channel *ch; 1106 1107 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1108 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1109 } else { 1110 ch = spdk_io_channel_from_ctx(nbdev_ch); 1111 bdev_nvme_submit_request(ch, bdev_io); 1112 } 1113 } 1114 1115 static int 1116 bdev_nvme_retry_ios(void *arg) 1117 { 1118 struct nvme_bdev_channel *nbdev_ch = arg; 1119 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1120 struct nvme_bdev_io *bio; 1121 uint64_t now, delay_us; 1122 1123 now = spdk_get_ticks(); 1124 1125 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1126 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1127 if (bio->retry_ticks > now) { 1128 break; 1129 } 1130 1131 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1132 1133 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1134 } 1135 1136 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1137 1138 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1139 if (bdev_io != NULL) { 1140 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1141 1142 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1143 1144 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1145 delay_us); 1146 } 1147 1148 return SPDK_POLLER_BUSY; 1149 } 1150 1151 static void 1152 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1153 struct nvme_bdev_io *bio, uint64_t delay_ms) 1154 { 1155 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1156 struct spdk_bdev_io *tmp_bdev_io; 1157 struct nvme_bdev_io *tmp_bio; 1158 1159 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1160 1161 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1162 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1163 1164 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1165 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1166 module_link); 1167 return; 1168 } 1169 } 1170 1171 /* No earlier I/Os were found. This I/O must be the new head. 
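* Rearm the retry poller below so that it fires for this I/O's earlier expiration.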
*/ 1172 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1173 1174 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1175 1176 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1177 delay_ms * 1000ULL); 1178 } 1179 1180 static void 1181 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1182 { 1183 struct spdk_bdev_io *bdev_io, *tmp_io; 1184 1185 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1186 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1187 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1188 } 1189 1190 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1191 } 1192 1193 static int 1194 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1195 struct nvme_bdev_io *bio_to_abort) 1196 { 1197 struct spdk_bdev_io *bdev_io_to_abort; 1198 1199 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1200 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1201 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1202 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1203 return 0; 1204 } 1205 } 1206 1207 return -ENOENT; 1208 } 1209 1210 static void 1211 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1212 { 1213 struct nvme_bdev *nbdev; 1214 uint16_t sct, sc; 1215 1216 assert(spdk_nvme_cpl_is_error(cpl)); 1217 1218 nbdev = bdev_io->bdev->ctxt; 1219 1220 if (nbdev->err_stat == NULL) { 1221 return; 1222 } 1223 1224 sct = cpl->status.sct; 1225 sc = cpl->status.sc; 1226 1227 pthread_mutex_lock(&nbdev->mutex); 1228 1229 nbdev->err_stat->status_type[sct]++; 1230 switch (sct) { 1231 case SPDK_NVME_SCT_GENERIC: 1232 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1233 case SPDK_NVME_SCT_MEDIA_ERROR: 1234 case SPDK_NVME_SCT_PATH: 1235 nbdev->err_stat->status[sct][sc]++; 1236 break; 1237 default: 1238 break; 1239 } 1240 1241 pthread_mutex_unlock(&nbdev->mutex); 1242 } 1243 1244 static inline void 1245 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1246 { 1247 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1248 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1249 uint32_t blocklen = bdev_io->bdev->blocklen; 1250 struct spdk_bdev_io_stat *stat; 1251 uint64_t tsc_diff; 1252 1253 if (bio->io_path->stat == NULL) { 1254 return; 1255 } 1256 1257 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1258 stat = bio->io_path->stat; 1259 1260 switch (bdev_io->type) { 1261 case SPDK_BDEV_IO_TYPE_READ: 1262 stat->bytes_read += num_blocks * blocklen; 1263 stat->num_read_ops++; 1264 stat->read_latency_ticks += tsc_diff; 1265 if (stat->max_read_latency_ticks < tsc_diff) { 1266 stat->max_read_latency_ticks = tsc_diff; 1267 } 1268 if (stat->min_read_latency_ticks > tsc_diff) { 1269 stat->min_read_latency_ticks = tsc_diff; 1270 } 1271 break; 1272 case SPDK_BDEV_IO_TYPE_WRITE: 1273 stat->bytes_written += num_blocks * blocklen; 1274 stat->num_write_ops++; 1275 stat->write_latency_ticks += tsc_diff; 1276 if (stat->max_write_latency_ticks < tsc_diff) { 1277 stat->max_write_latency_ticks = tsc_diff; 1278 } 1279 if (stat->min_write_latency_ticks > tsc_diff) { 1280 stat->min_write_latency_ticks = tsc_diff; 1281 } 1282 break; 1283 case SPDK_BDEV_IO_TYPE_UNMAP: 1284 stat->bytes_unmapped += num_blocks * blocklen; 1285 stat->num_unmap_ops++; 1286 stat->unmap_latency_ticks += tsc_diff; 1287 if (stat->max_unmap_latency_ticks < tsc_diff) { 1288 
stat->max_unmap_latency_ticks = tsc_diff; 1289 } 1290 if (stat->min_unmap_latency_ticks > tsc_diff) { 1291 stat->min_unmap_latency_ticks = tsc_diff; 1292 } 1293 break; 1294 case SPDK_BDEV_IO_TYPE_ZCOPY: 1295 /* Track the data in the start phase only */ 1296 if (!bdev_io->u.bdev.zcopy.start) { 1297 break; 1298 } 1299 if (bdev_io->u.bdev.zcopy.populate) { 1300 stat->bytes_read += num_blocks * blocklen; 1301 stat->num_read_ops++; 1302 stat->read_latency_ticks += tsc_diff; 1303 if (stat->max_read_latency_ticks < tsc_diff) { 1304 stat->max_read_latency_ticks = tsc_diff; 1305 } 1306 if (stat->min_read_latency_ticks > tsc_diff) { 1307 stat->min_read_latency_ticks = tsc_diff; 1308 } 1309 } else { 1310 stat->bytes_written += num_blocks * blocklen; 1311 stat->num_write_ops++; 1312 stat->write_latency_ticks += tsc_diff; 1313 if (stat->max_write_latency_ticks < tsc_diff) { 1314 stat->max_write_latency_ticks = tsc_diff; 1315 } 1316 if (stat->min_write_latency_ticks > tsc_diff) { 1317 stat->min_write_latency_ticks = tsc_diff; 1318 } 1319 } 1320 break; 1321 case SPDK_BDEV_IO_TYPE_COPY: 1322 stat->bytes_copied += num_blocks * blocklen; 1323 stat->num_copy_ops++; 1324 stat->copy_latency_ticks += tsc_diff; 1325 if (stat->max_copy_latency_ticks < tsc_diff) { 1326 stat->max_copy_latency_ticks = tsc_diff; 1327 } 1328 if (stat->min_copy_latency_ticks > tsc_diff) { 1329 stat->min_copy_latency_ticks = tsc_diff; 1330 } 1331 break; 1332 default: 1333 break; 1334 } 1335 } 1336 1337 static bool 1338 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1339 const struct spdk_nvme_cpl *cpl, 1340 struct nvme_bdev_channel *nbdev_ch, 1341 uint64_t *_delay_ms) 1342 { 1343 struct nvme_io_path *io_path = bio->io_path; 1344 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1345 const struct spdk_nvme_ctrlr_data *cdata; 1346 1347 if (spdk_nvme_cpl_is_path_error(cpl) || 1348 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1349 !nvme_io_path_is_available(io_path) || 1350 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1351 bdev_nvme_clear_current_io_path(nbdev_ch); 1352 bio->io_path = NULL; 1353 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1354 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1355 io_path->nvme_ns->ana_state_updating = true; 1356 } 1357 } 1358 if (!any_io_path_may_become_available(nbdev_ch)) { 1359 return false; 1360 } 1361 *_delay_ms = 0; 1362 } else { 1363 bio->retry_count++; 1364 1365 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1366 1367 if (cpl->status.crd != 0) { 1368 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1369 } else { 1370 *_delay_ms = 0; 1371 } 1372 } 1373 1374 return true; 1375 } 1376 1377 static inline void 1378 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1379 const struct spdk_nvme_cpl *cpl) 1380 { 1381 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1382 struct nvme_bdev_channel *nbdev_ch; 1383 uint64_t delay_ms; 1384 1385 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1386 1387 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1388 bdev_nvme_update_io_path_stat(bio); 1389 goto complete; 1390 } 1391 1392 /* Update error counts before deciding if retry is needed. 1393 * Hence, error counts may be more than the number of I/O errors. 
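* (an I/O that is retried and later succeeds still contributes to err_stat).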
1394 */ 1395 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1396 1397 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1398 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1399 goto complete; 1400 } 1401 1402 /* At this point we don't know whether the sequence was successfully executed or not, so we 1403 * cannot retry the IO */ 1404 if (bdev_io->u.bdev.accel_sequence != NULL) { 1405 goto complete; 1406 } 1407 1408 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1409 1410 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1411 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1412 return; 1413 } 1414 1415 complete: 1416 bio->retry_count = 0; 1417 bio->submit_tsc = 0; 1418 bdev_io->u.bdev.accel_sequence = NULL; 1419 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1420 } 1421 1422 static inline void 1423 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1424 { 1425 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1426 struct nvme_bdev_channel *nbdev_ch; 1427 enum spdk_bdev_io_status io_status; 1428 1429 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1430 1431 switch (rc) { 1432 case 0: 1433 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1434 break; 1435 case -ENOMEM: 1436 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1437 break; 1438 case -ENXIO: 1439 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1440 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1441 1442 bdev_nvme_clear_current_io_path(nbdev_ch); 1443 bio->io_path = NULL; 1444 1445 if (any_io_path_may_become_available(nbdev_ch)) { 1446 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1447 return; 1448 } 1449 } 1450 1451 /* fallthrough */ 1452 default: 1453 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1454 bdev_io->u.bdev.accel_sequence = NULL; 1455 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1456 break; 1457 } 1458 1459 bio->retry_count = 0; 1460 bio->submit_tsc = 0; 1461 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1462 } 1463 1464 static inline void 1465 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1466 { 1467 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1468 enum spdk_bdev_io_status io_status; 1469 1470 switch (rc) { 1471 case 0: 1472 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1473 break; 1474 case -ENOMEM: 1475 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1476 break; 1477 case -ENXIO: 1478 /* fallthrough */ 1479 default: 1480 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1481 break; 1482 } 1483 1484 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1485 } 1486 1487 static void 1488 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1489 { 1490 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1491 1492 pthread_mutex_lock(&nvme_ctrlr->mutex); 1493 1494 assert(nvme_ctrlr->io_path_cache_clearing == true); 1495 nvme_ctrlr->io_path_cache_clearing = false; 1496 1497 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1498 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1499 return; 1500 } 1501 1502 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1503 1504 nvme_ctrlr_unregister(nvme_ctrlr); 1505 } 1506 1507 static void 1508 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1509 { 1510 struct nvme_io_path *io_path; 1511 1512 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1513 if (io_path->nbdev_ch == NULL) { 1514 continue; 1515 } 1516 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1517 
} 1518 } 1519 1520 static void 1521 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1522 { 1523 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1524 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1525 1526 assert(ctrlr_ch->qpair != NULL); 1527 1528 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1529 1530 spdk_for_each_channel_continue(i, 0); 1531 } 1532 1533 static void 1534 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1535 { 1536 pthread_mutex_lock(&nvme_ctrlr->mutex); 1537 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1538 nvme_ctrlr->io_path_cache_clearing) { 1539 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1540 return; 1541 } 1542 1543 nvme_ctrlr->io_path_cache_clearing = true; 1544 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1545 1546 spdk_for_each_channel(nvme_ctrlr, 1547 bdev_nvme_clear_io_path_cache, 1548 NULL, 1549 bdev_nvme_clear_io_path_caches_done); 1550 } 1551 1552 static struct nvme_qpair * 1553 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1554 { 1555 struct nvme_qpair *nvme_qpair; 1556 1557 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1558 if (nvme_qpair->qpair == qpair) { 1559 break; 1560 } 1561 } 1562 1563 return nvme_qpair; 1564 } 1565 1566 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1567 1568 static void 1569 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1570 { 1571 struct nvme_poll_group *group = poll_group_ctx; 1572 struct nvme_qpair *nvme_qpair; 1573 struct nvme_ctrlr_channel *ctrlr_ch; 1574 int status; 1575 1576 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1577 if (nvme_qpair == NULL) { 1578 return; 1579 } 1580 1581 if (nvme_qpair->qpair != NULL) { 1582 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1583 nvme_qpair->qpair = NULL; 1584 } 1585 1586 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1587 1588 ctrlr_ch = nvme_qpair->ctrlr_ch; 1589 1590 if (ctrlr_ch != NULL) { 1591 if (ctrlr_ch->reset_iter != NULL) { 1592 /* We are in a full reset sequence. */ 1593 if (ctrlr_ch->connect_poller != NULL) { 1594 /* qpair was failed to connect. Abort the reset sequence. */ 1595 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1596 qpair); 1597 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1598 status = -1; 1599 } else { 1600 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1601 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1602 qpair); 1603 status = 0; 1604 } 1605 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1606 ctrlr_ch->reset_iter = NULL; 1607 } else { 1608 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1609 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1610 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1611 } 1612 } else { 1613 /* In this case, ctrlr_channel is already deleted. */ 1614 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1615 nvme_qpair_delete(nvme_qpair); 1616 } 1617 } 1618 1619 static void 1620 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1621 { 1622 struct nvme_qpair *nvme_qpair; 1623 1624 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1625 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1626 continue; 1627 } 1628 1629 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1630 SPDK_NVME_QPAIR_FAILURE_NONE) { 1631 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1632 } 1633 } 1634 } 1635 1636 static int 1637 bdev_nvme_poll(void *arg) 1638 { 1639 struct nvme_poll_group *group = arg; 1640 int64_t num_completions; 1641 1642 if (group->collect_spin_stat && group->start_ticks == 0) { 1643 group->start_ticks = spdk_get_ticks(); 1644 } 1645 1646 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1647 bdev_nvme_disconnected_qpair_cb); 1648 if (group->collect_spin_stat) { 1649 if (num_completions > 0) { 1650 if (group->end_ticks != 0) { 1651 group->spin_ticks += (group->end_ticks - group->start_ticks); 1652 group->end_ticks = 0; 1653 } 1654 group->start_ticks = 0; 1655 } else { 1656 group->end_ticks = spdk_get_ticks(); 1657 } 1658 } 1659 1660 if (spdk_unlikely(num_completions < 0)) { 1661 bdev_nvme_check_io_qpairs(group); 1662 } 1663 1664 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1665 } 1666 1667 static int bdev_nvme_poll_adminq(void *arg); 1668 1669 static void 1670 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1671 { 1672 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1673 1674 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1675 nvme_ctrlr, new_period_us); 1676 } 1677 1678 static int 1679 bdev_nvme_poll_adminq(void *arg) 1680 { 1681 int32_t rc; 1682 struct nvme_ctrlr *nvme_ctrlr = arg; 1683 nvme_ctrlr_disconnected_cb disconnected_cb; 1684 1685 assert(nvme_ctrlr != NULL); 1686 1687 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1688 if (rc < 0) { 1689 disconnected_cb = nvme_ctrlr->disconnected_cb; 1690 nvme_ctrlr->disconnected_cb = NULL; 1691 1692 if (disconnected_cb != NULL) { 1693 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1694 g_opts.nvme_adminq_poll_period_us); 1695 disconnected_cb(nvme_ctrlr); 1696 } else { 1697 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1698 } 1699 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1700 SPDK_NVME_QPAIR_FAILURE_NONE) { 1701 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1702 } 1703 1704 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1705 } 1706 1707 static void 1708 nvme_bdev_free(void *io_device) 1709 { 1710 struct nvme_bdev *nvme_disk = io_device; 1711 1712 pthread_mutex_destroy(&nvme_disk->mutex); 1713 free(nvme_disk->disk.name); 1714 free(nvme_disk->err_stat); 1715 free(nvme_disk); 1716 } 1717 1718 static int 1719 bdev_nvme_destruct(void *ctx) 1720 { 1721 struct nvme_bdev *nvme_disk = ctx; 1722 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1723 1724 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1725 1726 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1727 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1728 1729 nvme_ns->bdev = NULL; 1730 1731 assert(nvme_ns->id > 0); 1732 1733 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1734 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1735 1736 nvme_ctrlr_release(nvme_ns->ctrlr); 1737 nvme_ns_free(nvme_ns); 1738 } else { 1739 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1740 } 1741 } 1742 1743 pthread_mutex_lock(&g_bdev_nvme_mutex); 1744 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1745 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1746 1747 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1748 1749 return 0; 1750 } 1751 1752 static int 1753 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1754 { 1755 struct nvme_ctrlr *nvme_ctrlr; 1756 struct spdk_nvme_io_qpair_opts opts; 1757 struct spdk_nvme_qpair *qpair; 1758 int rc; 1759 1760 nvme_ctrlr = nvme_qpair->ctrlr; 1761 1762 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1763 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1764 opts.create_only = true; 1765 opts.async_mode = true; 1766 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1767 g_opts.io_queue_requests = opts.io_queue_requests; 1768 1769 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1770 if (qpair == NULL) { 1771 return -1; 1772 } 1773 1774 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1775 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1776 1777 assert(nvme_qpair->group != NULL); 1778 1779 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1780 if (rc != 0) { 1781 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1782 goto err; 1783 } 1784 1785 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1786 if (rc != 0) { 1787 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1788 goto err; 1789 } 1790 1791 nvme_qpair->qpair = qpair; 1792 1793 if (!g_opts.disable_auto_failback) { 1794 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1795 } 1796 1797 return 0; 1798 1799 err: 1800 spdk_nvme_ctrlr_free_io_qpair(qpair); 1801 1802 return rc; 1803 } 1804 1805 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1806 1807 static void 1808 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1809 { 1810 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1811 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1812 int rc = 0; 1813 struct spdk_bdev_io *bdev_io; 1814 struct nvme_bdev_io *bio; 1815 1816 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1817 rc = -1; 1818 } 1819 1820 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1821 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1822 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1823 1824 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 
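/* rc is -1 if the controller reset failed (non-NULL iterator ctx), 0 otherwise. */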
1825 bdev_nvme_reset_io_continue(bio, rc); 1826 } 1827 1828 spdk_for_each_channel_continue(i, 0); 1829 } 1830 1831 /* This function marks the current trid as failed by storing the current ticks 1832 * and then sets the next trid to the active trid within a controller if exists. 1833 * 1834 * The purpose of the boolean return value is to request the caller to disconnect 1835 * the current trid now to try connecting the next trid. 1836 */ 1837 static bool 1838 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1839 { 1840 struct nvme_path_id *path_id, *next_path; 1841 int rc __attribute__((unused)); 1842 1843 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1844 assert(path_id); 1845 assert(path_id == nvme_ctrlr->active_path_id); 1846 next_path = TAILQ_NEXT(path_id, link); 1847 1848 /* Update the last failed time. It means the trid is failed if its last 1849 * failed time is non-zero. 1850 */ 1851 path_id->last_failed_tsc = spdk_get_ticks(); 1852 1853 if (next_path == NULL) { 1854 /* There is no alternate trid within a controller. */ 1855 return false; 1856 } 1857 1858 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1859 /* Connect is not retried in a controller reset sequence. Connecting 1860 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1861 */ 1862 return false; 1863 } 1864 1865 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1866 1867 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1868 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1869 1870 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1871 nvme_ctrlr->active_path_id = next_path; 1872 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1873 assert(rc == 0); 1874 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1875 if (!remove) { 1876 /** Shuffle the old trid to the end of the list and use the new one. 1877 * Allows for round robin through multiple connections. 1878 */ 1879 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1880 } else { 1881 free(path_id); 1882 } 1883 1884 if (start || next_path->last_failed_tsc == 0) { 1885 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1886 * or used yet. Try the next trid now. 1887 */ 1888 return true; 1889 } 1890 1891 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1892 nvme_ctrlr->opts.reconnect_delay_sec) { 1893 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1894 return true; 1895 } 1896 1897 /* The next trid will be tried after reconnect_delay_sec seconds. 
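* Returning false here means the caller is not asked to disconnect and reconnect immediately.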
*/ 1898 return false; 1899 } 1900 1901 static bool 1902 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1903 { 1904 int32_t elapsed; 1905 1906 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1907 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1908 return false; 1909 } 1910 1911 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1912 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1913 return true; 1914 } else { 1915 return false; 1916 } 1917 } 1918 1919 static bool 1920 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1921 { 1922 uint32_t elapsed; 1923 1924 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1925 return false; 1926 } 1927 1928 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1929 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1930 return true; 1931 } else { 1932 return false; 1933 } 1934 } 1935 1936 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1937 1938 static void 1939 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1940 { 1941 int rc; 1942 1943 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1944 if (rc != 0) { 1945 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1946 * fail the reset sequence immediately. 1947 */ 1948 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1949 return; 1950 } 1951 1952 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1953 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1954 */ 1955 assert(nvme_ctrlr->disconnected_cb == NULL); 1956 nvme_ctrlr->disconnected_cb = cb_fn; 1957 1958 /* During disconnection, reduce the period to poll adminq more often. */ 1959 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1960 } 1961 1962 enum bdev_nvme_op_after_reset { 1963 OP_NONE, 1964 OP_COMPLETE_PENDING_DESTRUCT, 1965 OP_DESTRUCT, 1966 OP_DELAYED_RECONNECT, 1967 OP_FAILOVER, 1968 }; 1969 1970 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1971 1972 static _bdev_nvme_op_after_reset 1973 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1974 { 1975 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1976 /* Complete pending destruct after reset completes. 
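* (unregistration is blocked while resetting is true, so it can only be done now that the reset has finished).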
*/ 1977 return OP_COMPLETE_PENDING_DESTRUCT; 1978 } else if (nvme_ctrlr->pending_failover) { 1979 nvme_ctrlr->pending_failover = false; 1980 nvme_ctrlr->reset_start_tsc = 0; 1981 return OP_FAILOVER; 1982 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1983 nvme_ctrlr->reset_start_tsc = 0; 1984 return OP_NONE; 1985 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1986 return OP_DESTRUCT; 1987 } else { 1988 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1989 nvme_ctrlr->fast_io_fail_timedout = true; 1990 } 1991 return OP_DELAYED_RECONNECT; 1992 } 1993 } 1994 1995 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1996 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1997 1998 static int 1999 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2000 { 2001 struct nvme_ctrlr *nvme_ctrlr = ctx; 2002 2003 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2004 pthread_mutex_lock(&nvme_ctrlr->mutex); 2005 2006 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2007 2008 if (!nvme_ctrlr->reconnect_is_delayed) { 2009 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2010 return SPDK_POLLER_BUSY; 2011 } 2012 2013 nvme_ctrlr->reconnect_is_delayed = false; 2014 2015 if (nvme_ctrlr->destruct) { 2016 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2017 return SPDK_POLLER_BUSY; 2018 } 2019 2020 assert(nvme_ctrlr->resetting == false); 2021 nvme_ctrlr->resetting = true; 2022 2023 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2024 2025 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2026 2027 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2028 return SPDK_POLLER_BUSY; 2029 } 2030 2031 static void 2032 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2033 { 2034 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2035 2036 assert(nvme_ctrlr->reconnect_is_delayed == false); 2037 nvme_ctrlr->reconnect_is_delayed = true; 2038 2039 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2040 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2041 nvme_ctrlr, 2042 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2043 } 2044 2045 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2046 2047 static void 2048 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2049 { 2050 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2051 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2052 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2053 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2054 enum bdev_nvme_op_after_reset op_after_reset; 2055 2056 assert(nvme_ctrlr->thread == spdk_get_thread()); 2057 2058 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2059 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2060 2061 if (!success) { 2062 SPDK_ERRLOG("Resetting controller failed.\n"); 2063 } else { 2064 SPDK_NOTICELOG("Resetting controller successful.\n"); 2065 } 2066 2067 pthread_mutex_lock(&nvme_ctrlr->mutex); 2068 nvme_ctrlr->resetting = false; 2069 nvme_ctrlr->dont_retry = false; 2070 nvme_ctrlr->in_failover = false; 2071 2072 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2073 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2074 2075 /* Delay callbacks when the next operation is a failover. */ 2076 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2077 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2078 } 2079 2080 switch (op_after_reset) { 2081 case OP_COMPLETE_PENDING_DESTRUCT: 2082 nvme_ctrlr_unregister(nvme_ctrlr); 2083 break; 2084 case OP_DESTRUCT: 2085 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2086 remove_discovery_entry(nvme_ctrlr); 2087 break; 2088 case OP_DELAYED_RECONNECT: 2089 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2090 break; 2091 case OP_FAILOVER: 2092 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2093 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2094 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2095 break; 2096 default: 2097 break; 2098 } 2099 } 2100 2101 static void 2102 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2103 { 2104 pthread_mutex_lock(&nvme_ctrlr->mutex); 2105 if (!success) { 2106 /* Connecting the active trid failed. Set the next alternate trid to the 2107 * active trid if it exists. 2108 */ 2109 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2110 /* The next alternate trid exists and is ready to try. Try it now. */ 2111 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2112 2113 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2114 return; 2115 } 2116 2117 /* We came here if there is no alternate trid or if the next trid exists but 2118 * is not ready to try. We will try the active trid after reconnect_delay_sec 2119 * seconds if it is non-zero or at the next reset call otherwise. 2120 */ 2121 } else { 2122 /* Connecting the active trid succeeded. Clear the last failed time because it 2123 * means the trid is failed if its last failed time is non-zero. 2124 */ 2125 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2126 } 2127 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2128 2129 /* Make sure we clear any pending resets before returning. */ 2130 spdk_for_each_channel(nvme_ctrlr, 2131 bdev_nvme_complete_pending_resets, 2132 success ? NULL : (void *)0x1, 2133 _bdev_nvme_reset_ctrlr_complete); 2134 } 2135 2136 static void 2137 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2138 { 2139 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2140 2141 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2142 } 2143 2144 static void 2145 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2146 { 2147 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2148 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2149 struct nvme_qpair *nvme_qpair; 2150 2151 nvme_qpair = ctrlr_ch->qpair; 2152 assert(nvme_qpair != NULL); 2153 2154 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2155 2156 if (nvme_qpair->qpair != NULL) { 2157 if (nvme_qpair->ctrlr->dont_retry) { 2158 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2159 } 2160 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2161 2162 /* The current full reset sequence will move to the next 2163 * ctrlr_channel after the qpair is actually disconnected. 2164 */ 2165 assert(ctrlr_ch->reset_iter == NULL); 2166 ctrlr_ch->reset_iter = i; 2167 } else { 2168 spdk_for_each_channel_continue(i, 0); 2169 } 2170 } 2171 2172 static void 2173 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2174 { 2175 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2176 2177 if (status == 0) { 2178 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2179 } else { 2180 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
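* Any qpairs that did connect are torn down again and the reset then completes with failure via bdev_nvme_reset_create_qpairs_failed().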
*/ 2181 spdk_for_each_channel(nvme_ctrlr, 2182 bdev_nvme_reset_destroy_qpair, 2183 NULL, 2184 bdev_nvme_reset_create_qpairs_failed); 2185 } 2186 } 2187 2188 static int 2189 bdev_nvme_reset_check_qpair_connected(void *ctx) 2190 { 2191 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2192 2193 if (ctrlr_ch->reset_iter == NULL) { 2194 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2195 assert(ctrlr_ch->connect_poller == NULL); 2196 assert(ctrlr_ch->qpair->qpair == NULL); 2197 return SPDK_POLLER_BUSY; 2198 } 2199 2200 assert(ctrlr_ch->qpair->qpair != NULL); 2201 2202 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2203 return SPDK_POLLER_BUSY; 2204 } 2205 2206 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2207 2208 /* The qpair finished connecting. Move to the next ctrlr_channel. */ 2209 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2210 ctrlr_ch->reset_iter = NULL; 2211 2212 if (!g_opts.disable_auto_failback) { 2213 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2214 } 2215 2216 return SPDK_POLLER_BUSY; 2217 } 2218 2219 static void 2220 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2221 { 2222 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2223 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2224 int rc; 2225 2226 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2227 if (rc == 0) { 2228 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2229 ctrlr_ch, 0); 2230 2231 /* The current full reset sequence will move to the next 2232 * ctrlr_channel after the qpair is actually connected. 2233 */ 2234 assert(ctrlr_ch->reset_iter == NULL); 2235 ctrlr_ch->reset_iter = i; 2236 } else { 2237 spdk_for_each_channel_continue(i, rc); 2238 } 2239 } 2240 2241 static void 2242 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2243 { 2244 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2245 struct nvme_ns *nvme_ns; 2246 2247 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2248 nvme_ns != NULL; 2249 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2250 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2251 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2252 /* NS can be added again. Just nullify nvme_ns->ns. */ 2253 nvme_ns->ns = NULL; 2254 } 2255 } 2256 } 2257 2258 2259 static int 2260 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2261 { 2262 struct nvme_ctrlr *nvme_ctrlr = arg; 2263 int rc = -ETIMEDOUT; 2264 2265 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2266 /* Mark the ctrlr as failed. The next call to 2267 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2268 * do the necessary cleanup and return failure.
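* (i.e. a return value other than -EAGAIN), which stops this poller and fails the reset sequence.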
2269 */ 2270 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2271 } 2272 2273 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2274 if (rc == -EAGAIN) { 2275 return SPDK_POLLER_BUSY; 2276 } 2277 2278 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2279 if (rc == 0) { 2280 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2281 2282 /* Recreate all of the I/O queue pairs */ 2283 spdk_for_each_channel(nvme_ctrlr, 2284 bdev_nvme_reset_create_qpair, 2285 NULL, 2286 bdev_nvme_reset_create_qpairs_done); 2287 } else { 2288 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2289 } 2290 return SPDK_POLLER_BUSY; 2291 } 2292 2293 static void 2294 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2295 { 2296 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2297 2298 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2299 assert(nvme_ctrlr->reset_detach_poller == NULL); 2300 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2301 nvme_ctrlr, 0); 2302 } 2303 2304 static void 2305 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2306 { 2307 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2308 2309 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2310 assert(status == 0); 2311 2312 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2313 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2314 } else { 2315 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2316 } 2317 } 2318 2319 static void 2320 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2321 { 2322 spdk_for_each_channel(nvme_ctrlr, 2323 bdev_nvme_reset_destroy_qpair, 2324 NULL, 2325 bdev_nvme_reset_destroy_qpair_done); 2326 } 2327 2328 static void 2329 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2330 { 2331 struct nvme_ctrlr *nvme_ctrlr = ctx; 2332 2333 assert(nvme_ctrlr->resetting == true); 2334 assert(nvme_ctrlr->thread == spdk_get_thread()); 2335 2336 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2337 2338 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2339 2340 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2341 } 2342 2343 static void 2344 _bdev_nvme_reset_ctrlr(void *ctx) 2345 { 2346 struct nvme_ctrlr *nvme_ctrlr = ctx; 2347 2348 assert(nvme_ctrlr->resetting == true); 2349 assert(nvme_ctrlr->thread == spdk_get_thread()); 2350 2351 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2352 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2353 } else { 2354 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2355 } 2356 } 2357 2358 static int 2359 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2360 { 2361 spdk_msg_fn msg_fn; 2362 2363 pthread_mutex_lock(&nvme_ctrlr->mutex); 2364 if (nvme_ctrlr->destruct) { 2365 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2366 return -ENXIO; 2367 } 2368 2369 if (nvme_ctrlr->resetting) { 2370 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2371 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2372 return -EBUSY; 2373 } 2374 2375 if (nvme_ctrlr->disabled) { 2376 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2377 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2378 return -EALREADY; 2379 } 2380 2381 nvme_ctrlr->resetting = true; 2382 nvme_ctrlr->dont_retry = true; 2383 2384 if (nvme_ctrlr->reconnect_is_delayed) { 2385 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2386 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2387 nvme_ctrlr->reconnect_is_delayed = false; 2388 } else { 2389 msg_fn = _bdev_nvme_reset_ctrlr; 2390 assert(nvme_ctrlr->reset_start_tsc == 0); 2391 } 2392 2393 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2394 2395 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2396 2397 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2398 return 0; 2399 } 2400 2401 static int 2402 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2403 { 2404 pthread_mutex_lock(&nvme_ctrlr->mutex); 2405 if (nvme_ctrlr->destruct) { 2406 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2407 return -ENXIO; 2408 } 2409 2410 if (nvme_ctrlr->resetting) { 2411 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2412 return -EBUSY; 2413 } 2414 2415 if (!nvme_ctrlr->disabled) { 2416 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2417 return -EALREADY; 2418 } 2419 2420 nvme_ctrlr->disabled = false; 2421 nvme_ctrlr->resetting = true; 2422 2423 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2424 2425 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2426 2427 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2428 return 0; 2429 } 2430 2431 static void 2432 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2433 { 2434 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2435 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2436 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2437 enum bdev_nvme_op_after_reset op_after_disable; 2438 2439 assert(nvme_ctrlr->thread == spdk_get_thread()); 2440 2441 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2442 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2443 2444 pthread_mutex_lock(&nvme_ctrlr->mutex); 2445 2446 nvme_ctrlr->resetting = false; 2447 nvme_ctrlr->dont_retry = false; 2448 2449 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2450 2451 nvme_ctrlr->disabled = true; 2452 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2453 2454 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2455 2456 if (ctrlr_op_cb_fn) { 2457 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2458 } 2459 2460 switch (op_after_disable) { 2461 case OP_COMPLETE_PENDING_DESTRUCT: 2462 nvme_ctrlr_unregister(nvme_ctrlr); 2463 break; 2464 default: 2465 break; 2466 } 2467 2468 } 2469 2470 static void 2471 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2472 { 2473 /* Make sure we clear any pending resets before returning. 
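* bdev_nvme_complete_pending_resets() completes the reset requests queued on each ctrlr_channel's pending_resets list.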
*/ 2474 spdk_for_each_channel(nvme_ctrlr, 2475 bdev_nvme_complete_pending_resets, 2476 NULL, 2477 _bdev_nvme_disable_ctrlr_complete); 2478 } 2479 2480 static void 2481 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2482 { 2483 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2484 2485 assert(status == 0); 2486 2487 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2488 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2489 } else { 2490 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2491 } 2492 } 2493 2494 static void 2495 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2496 { 2497 spdk_for_each_channel(nvme_ctrlr, 2498 bdev_nvme_reset_destroy_qpair, 2499 NULL, 2500 bdev_nvme_disable_destroy_qpairs_done); 2501 } 2502 2503 static void 2504 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2505 { 2506 struct nvme_ctrlr *nvme_ctrlr = ctx; 2507 2508 assert(nvme_ctrlr->resetting == true); 2509 assert(nvme_ctrlr->thread == spdk_get_thread()); 2510 2511 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2512 2513 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2514 } 2515 2516 static void 2517 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2518 { 2519 struct nvme_ctrlr *nvme_ctrlr = ctx; 2520 2521 assert(nvme_ctrlr->resetting == true); 2522 assert(nvme_ctrlr->thread == spdk_get_thread()); 2523 2524 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2525 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2526 } else { 2527 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2528 } 2529 } 2530 2531 static int 2532 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2533 { 2534 spdk_msg_fn msg_fn; 2535 2536 pthread_mutex_lock(&nvme_ctrlr->mutex); 2537 if (nvme_ctrlr->destruct) { 2538 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2539 return -ENXIO; 2540 } 2541 2542 if (nvme_ctrlr->resetting) { 2543 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2544 return -EBUSY; 2545 } 2546 2547 if (nvme_ctrlr->disabled) { 2548 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2549 return -EALREADY; 2550 } 2551 2552 nvme_ctrlr->resetting = true; 2553 nvme_ctrlr->dont_retry = true; 2554 2555 if (nvme_ctrlr->reconnect_is_delayed) { 2556 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2557 nvme_ctrlr->reconnect_is_delayed = false; 2558 } else { 2559 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2560 } 2561 2562 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2563 2564 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2565 2566 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2567 return 0; 2568 } 2569 2570 static int 2571 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2572 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2573 { 2574 int rc; 2575 2576 switch (op) { 2577 case NVME_CTRLR_OP_RESET: 2578 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2579 break; 2580 case NVME_CTRLR_OP_ENABLE: 2581 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2582 break; 2583 case NVME_CTRLR_OP_DISABLE: 2584 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2585 break; 2586 default: 2587 rc = -EINVAL; 2588 break; 2589 } 2590 2591 if (rc == 0) { 2592 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2593 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2594 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2595 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2596 } 2597 return rc; 2598 } 2599 2600 struct nvme_ctrlr_op_rpc_ctx { 2601 struct nvme_ctrlr *nvme_ctrlr; 2602 struct spdk_thread *orig_thread; 2603 enum nvme_ctrlr_op op; 2604 int rc; 2605 
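/* Completion callback and argument supplied by the caller; invoked on orig_thread once the operation finishes. */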
bdev_nvme_ctrlr_op_cb cb_fn; 2606 void *cb_arg; 2607 }; 2608 2609 static void 2610 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2611 { 2612 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2613 2614 assert(ctx != NULL); 2615 assert(ctx->cb_fn != NULL); 2616 2617 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2618 2619 free(ctx); 2620 } 2621 2622 static void 2623 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2624 { 2625 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2626 2627 ctx->rc = rc; 2628 2629 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2630 } 2631 2632 void 2633 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2634 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2635 { 2636 struct nvme_ctrlr_op_rpc_ctx *ctx; 2637 int rc; 2638 2639 assert(cb_fn != NULL); 2640 2641 ctx = calloc(1, sizeof(*ctx)); 2642 if (ctx == NULL) { 2643 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2644 cb_fn(cb_arg, -ENOMEM); 2645 return; 2646 } 2647 2648 ctx->orig_thread = spdk_get_thread(); 2649 ctx->cb_fn = cb_fn; 2650 ctx->cb_arg = cb_arg; 2651 2652 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2653 if (rc == 0) { 2654 return; 2655 } else if (rc == -EALREADY) { 2656 rc = 0; 2657 } 2658 2659 nvme_ctrlr_op_rpc_complete(ctx, rc); 2660 } 2661 2662 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2663 2664 static void 2665 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2666 { 2667 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2668 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2669 int rc; 2670 2671 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2672 ctx->nvme_ctrlr = NULL; 2673 2674 if (ctx->rc != 0) { 2675 goto complete; 2676 } 2677 2678 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2679 if (next_nvme_ctrlr == NULL) { 2680 goto complete; 2681 } 2682 2683 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2684 if (rc == 0) { 2685 ctx->nvme_ctrlr = next_nvme_ctrlr; 2686 return; 2687 } else if (rc == -EALREADY) { 2688 ctx->nvme_ctrlr = next_nvme_ctrlr; 2689 rc = 0; 2690 } 2691 2692 ctx->rc = rc; 2693 2694 complete: 2695 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2696 free(ctx); 2697 } 2698 2699 static void 2700 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2701 { 2702 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2703 2704 ctx->rc = rc; 2705 2706 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2707 } 2708 2709 void 2710 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2711 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2712 { 2713 struct nvme_ctrlr_op_rpc_ctx *ctx; 2714 struct nvme_ctrlr *nvme_ctrlr; 2715 int rc; 2716 2717 assert(cb_fn != NULL); 2718 2719 ctx = calloc(1, sizeof(*ctx)); 2720 if (ctx == NULL) { 2721 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2722 cb_fn(cb_arg, -ENOMEM); 2723 return; 2724 } 2725 2726 ctx->orig_thread = spdk_get_thread(); 2727 ctx->op = op; 2728 ctx->cb_fn = cb_fn; 2729 ctx->cb_arg = cb_arg; 2730 2731 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2732 assert(nvme_ctrlr != NULL); 2733 2734 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2735 if (rc == 0) { 2736 ctx->nvme_ctrlr = nvme_ctrlr; 2737 return; 2738 } else if (rc == -EALREADY) { 2739 ctx->nvme_ctrlr = nvme_ctrlr; 2740 rc = 0; 2741 } 2742 2743 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2744 } 2745 2746 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2747 2748 static void 2749 
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2750 { 2751 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2752 enum spdk_bdev_io_status io_status; 2753 2754 if (bio->cpl.cdw0 == 0) { 2755 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2756 } else { 2757 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2758 } 2759 2760 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2761 } 2762 2763 static void 2764 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2765 { 2766 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2767 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2768 2769 bdev_nvme_abort_retry_ios(nbdev_ch); 2770 2771 spdk_for_each_channel_continue(i, 0); 2772 } 2773 2774 static void 2775 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2776 { 2777 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2778 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2779 2780 /* Abort all queued I/Os for retry. */ 2781 spdk_for_each_channel(nbdev, 2782 bdev_nvme_abort_bdev_channel, 2783 bio, 2784 _bdev_nvme_reset_io_complete); 2785 } 2786 2787 static void 2788 _bdev_nvme_reset_io_continue(void *ctx) 2789 { 2790 struct nvme_bdev_io *bio = ctx; 2791 struct nvme_io_path *prev_io_path, *next_io_path; 2792 int rc; 2793 2794 prev_io_path = bio->io_path; 2795 bio->io_path = NULL; 2796 2797 if (bio->cpl.cdw0 != 0) { 2798 goto complete; 2799 } 2800 2801 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2802 if (next_io_path == NULL) { 2803 goto complete; 2804 } 2805 2806 rc = _bdev_nvme_reset_io(next_io_path, bio); 2807 if (rc == 0) { 2808 return; 2809 } 2810 2811 bio->cpl.cdw0 = 1; 2812 2813 complete: 2814 bdev_nvme_reset_io_complete(bio); 2815 } 2816 2817 static void 2818 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2819 { 2820 struct nvme_bdev_io *bio = cb_arg; 2821 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2822 2823 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2824 2825 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2826 } 2827 2828 static int 2829 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2830 { 2831 struct nvme_ctrlr_channel *ctrlr_ch; 2832 struct spdk_bdev_io *bdev_io; 2833 int rc; 2834 2835 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2836 bdev_nvme_reset_io_continue, bio); 2837 if (rc != 0 && rc != -EBUSY) { 2838 return rc; 2839 } 2840 2841 assert(bio->io_path == NULL); 2842 bio->io_path = io_path; 2843 2844 if (rc == -EBUSY) { 2845 ctrlr_ch = io_path->qpair->ctrlr_ch; 2846 assert(ctrlr_ch != NULL); 2847 /* 2848 * Reset call is queued only if it is from the app framework. This is on purpose so that 2849 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2850 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2851 */ 2852 bdev_io = spdk_bdev_io_from_ctx(bio); 2853 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2854 } 2855 2856 return 0; 2857 } 2858 2859 static void 2860 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2861 { 2862 struct nvme_io_path *io_path; 2863 int rc; 2864 2865 bio->cpl.cdw0 = 0; 2866 2867 /* Reset all nvme_ctrlrs of a bdev controller sequentially. 
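* The first nvme_ctrlr is reset here; bdev_nvme_reset_io_continue() and _bdev_nvme_reset_io_continue() walk the remaining io_paths one by one.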
*/ 2868 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2869 assert(io_path != NULL); 2870 2871 rc = _bdev_nvme_reset_io(io_path, bio); 2872 if (rc != 0) { 2873 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2874 rc = (rc == -EALREADY) ? 0 : rc; 2875 2876 bdev_nvme_reset_io_continue(bio, rc); 2877 } 2878 } 2879 2880 static int 2881 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2882 { 2883 if (nvme_ctrlr->destruct) { 2884 /* Don't bother resetting if the controller is in the process of being destructed. */ 2885 return -ENXIO; 2886 } 2887 2888 if (nvme_ctrlr->resetting) { 2889 if (!nvme_ctrlr->in_failover) { 2890 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2891 2892 /* Defer failover until reset completes. */ 2893 nvme_ctrlr->pending_failover = true; 2894 return -EINPROGRESS; 2895 } else { 2896 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2897 return -EBUSY; 2898 } 2899 } 2900 2901 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2902 2903 if (nvme_ctrlr->reconnect_is_delayed) { 2904 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2905 2906 /* We rely on the next reconnect for the failover. */ 2907 return -EALREADY; 2908 } 2909 2910 if (nvme_ctrlr->disabled) { 2911 SPDK_NOTICELOG("Controller is disabled.\n"); 2912 2913 /* We rely on the enablement for the failover. */ 2914 return -EALREADY; 2915 } 2916 2917 nvme_ctrlr->resetting = true; 2918 nvme_ctrlr->in_failover = true; 2919 2920 assert(nvme_ctrlr->reset_start_tsc == 0); 2921 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2922 2923 return 0; 2924 } 2925 2926 static int 2927 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2928 { 2929 int rc; 2930 2931 pthread_mutex_lock(&nvme_ctrlr->mutex); 2932 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2933 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2934 2935 if (rc == 0) { 2936 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2937 } else if (rc == -EALREADY) { 2938 rc = 0; 2939 } 2940 2941 return rc; 2942 } 2943 2944 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2945 uint64_t num_blocks); 2946 2947 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2948 uint64_t num_blocks); 2949 2950 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2951 uint64_t src_offset_blocks, 2952 uint64_t num_blocks); 2953 2954 static void 2955 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2956 bool success) 2957 { 2958 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2959 int ret; 2960 2961 if (!success) { 2962 ret = -EINVAL; 2963 goto exit; 2964 } 2965 2966 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2967 ret = -ENXIO; 2968 goto exit; 2969 } 2970 2971 ret = bdev_nvme_readv(bio, 2972 bdev_io->u.bdev.iovs, 2973 bdev_io->u.bdev.iovcnt, 2974 bdev_io->u.bdev.md_buf, 2975 bdev_io->u.bdev.num_blocks, 2976 bdev_io->u.bdev.offset_blocks, 2977 bdev_io->u.bdev.dif_check_flags, 2978 bdev_io->u.bdev.memory_domain, 2979 bdev_io->u.bdev.memory_domain_ctx, 2980 bdev_io->u.bdev.accel_sequence); 2981 2982 exit: 2983 if (spdk_unlikely(ret != 0)) { 2984 bdev_nvme_io_complete(bio, ret); 2985 } 2986 } 2987 2988 static inline void 2989 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2990 { 2991 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io 
*)bdev_io->driver_ctx; 2992 struct spdk_bdev *bdev = bdev_io->bdev; 2993 struct nvme_bdev_io *nbdev_io_to_abort; 2994 int rc = 0; 2995 2996 switch (bdev_io->type) { 2997 case SPDK_BDEV_IO_TYPE_READ: 2998 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2999 3000 rc = bdev_nvme_readv(nbdev_io, 3001 bdev_io->u.bdev.iovs, 3002 bdev_io->u.bdev.iovcnt, 3003 bdev_io->u.bdev.md_buf, 3004 bdev_io->u.bdev.num_blocks, 3005 bdev_io->u.bdev.offset_blocks, 3006 bdev_io->u.bdev.dif_check_flags, 3007 bdev_io->u.bdev.memory_domain, 3008 bdev_io->u.bdev.memory_domain_ctx, 3009 bdev_io->u.bdev.accel_sequence); 3010 } else { 3011 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3012 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3013 rc = 0; 3014 } 3015 break; 3016 case SPDK_BDEV_IO_TYPE_WRITE: 3017 rc = bdev_nvme_writev(nbdev_io, 3018 bdev_io->u.bdev.iovs, 3019 bdev_io->u.bdev.iovcnt, 3020 bdev_io->u.bdev.md_buf, 3021 bdev_io->u.bdev.num_blocks, 3022 bdev_io->u.bdev.offset_blocks, 3023 bdev_io->u.bdev.dif_check_flags, 3024 bdev_io->u.bdev.memory_domain, 3025 bdev_io->u.bdev.memory_domain_ctx, 3026 bdev_io->u.bdev.accel_sequence, 3027 bdev_io->u.bdev.nvme_cdw12, 3028 bdev_io->u.bdev.nvme_cdw13); 3029 break; 3030 case SPDK_BDEV_IO_TYPE_COMPARE: 3031 rc = bdev_nvme_comparev(nbdev_io, 3032 bdev_io->u.bdev.iovs, 3033 bdev_io->u.bdev.iovcnt, 3034 bdev_io->u.bdev.md_buf, 3035 bdev_io->u.bdev.num_blocks, 3036 bdev_io->u.bdev.offset_blocks, 3037 bdev_io->u.bdev.dif_check_flags); 3038 break; 3039 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3040 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3041 bdev_io->u.bdev.iovs, 3042 bdev_io->u.bdev.iovcnt, 3043 bdev_io->u.bdev.fused_iovs, 3044 bdev_io->u.bdev.fused_iovcnt, 3045 bdev_io->u.bdev.md_buf, 3046 bdev_io->u.bdev.num_blocks, 3047 bdev_io->u.bdev.offset_blocks, 3048 bdev_io->u.bdev.dif_check_flags); 3049 break; 3050 case SPDK_BDEV_IO_TYPE_UNMAP: 3051 rc = bdev_nvme_unmap(nbdev_io, 3052 bdev_io->u.bdev.offset_blocks, 3053 bdev_io->u.bdev.num_blocks); 3054 break; 3055 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3056 rc = bdev_nvme_write_zeroes(nbdev_io, 3057 bdev_io->u.bdev.offset_blocks, 3058 bdev_io->u.bdev.num_blocks); 3059 break; 3060 case SPDK_BDEV_IO_TYPE_RESET: 3061 nbdev_io->io_path = NULL; 3062 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3063 return; 3064 3065 case SPDK_BDEV_IO_TYPE_FLUSH: 3066 bdev_nvme_io_complete(nbdev_io, 0); 3067 return; 3068 3069 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3070 rc = bdev_nvme_zone_appendv(nbdev_io, 3071 bdev_io->u.bdev.iovs, 3072 bdev_io->u.bdev.iovcnt, 3073 bdev_io->u.bdev.md_buf, 3074 bdev_io->u.bdev.num_blocks, 3075 bdev_io->u.bdev.offset_blocks, 3076 bdev_io->u.bdev.dif_check_flags); 3077 break; 3078 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3079 rc = bdev_nvme_get_zone_info(nbdev_io, 3080 bdev_io->u.zone_mgmt.zone_id, 3081 bdev_io->u.zone_mgmt.num_zones, 3082 bdev_io->u.zone_mgmt.buf); 3083 break; 3084 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3085 rc = bdev_nvme_zone_management(nbdev_io, 3086 bdev_io->u.zone_mgmt.zone_id, 3087 bdev_io->u.zone_mgmt.zone_action); 3088 break; 3089 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3090 nbdev_io->io_path = NULL; 3091 bdev_nvme_admin_passthru(nbdev_ch, 3092 nbdev_io, 3093 &bdev_io->u.nvme_passthru.cmd, 3094 bdev_io->u.nvme_passthru.buf, 3095 bdev_io->u.nvme_passthru.nbytes); 3096 return; 3097 3098 case SPDK_BDEV_IO_TYPE_NVME_IO: 3099 rc = bdev_nvme_io_passthru(nbdev_io, 3100 &bdev_io->u.nvme_passthru.cmd, 3101 bdev_io->u.nvme_passthru.buf, 3102 bdev_io->u.nvme_passthru.nbytes); 3103 break; 3104 case 
SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3105 rc = bdev_nvme_io_passthru_md(nbdev_io, 3106 &bdev_io->u.nvme_passthru.cmd, 3107 bdev_io->u.nvme_passthru.buf, 3108 bdev_io->u.nvme_passthru.nbytes, 3109 bdev_io->u.nvme_passthru.md_buf, 3110 bdev_io->u.nvme_passthru.md_len); 3111 break; 3112 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3113 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3114 &bdev_io->u.nvme_passthru.cmd, 3115 bdev_io->u.nvme_passthru.iovs, 3116 bdev_io->u.nvme_passthru.iovcnt, 3117 bdev_io->u.nvme_passthru.nbytes, 3118 bdev_io->u.nvme_passthru.md_buf, 3119 bdev_io->u.nvme_passthru.md_len); 3120 break; 3121 case SPDK_BDEV_IO_TYPE_ABORT: 3122 nbdev_io->io_path = NULL; 3123 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3124 bdev_nvme_abort(nbdev_ch, 3125 nbdev_io, 3126 nbdev_io_to_abort); 3127 return; 3128 3129 case SPDK_BDEV_IO_TYPE_COPY: 3130 rc = bdev_nvme_copy(nbdev_io, 3131 bdev_io->u.bdev.offset_blocks, 3132 bdev_io->u.bdev.copy.src_offset_blocks, 3133 bdev_io->u.bdev.num_blocks); 3134 break; 3135 default: 3136 rc = -EINVAL; 3137 break; 3138 } 3139 3140 if (spdk_unlikely(rc != 0)) { 3141 bdev_nvme_io_complete(nbdev_io, rc); 3142 } 3143 } 3144 3145 static void 3146 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3147 { 3148 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3149 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3150 3151 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3152 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3153 } else { 3154 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3155 * We need to update submit_tsc here. 3156 */ 3157 nbdev_io->submit_tsc = spdk_get_ticks(); 3158 } 3159 3160 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3161 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3162 if (spdk_unlikely(!nbdev_io->io_path)) { 3163 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3164 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3165 return; 3166 } 3167 3168 /* Admin commands do not use the optimal I/O path. 3169 * Simply fall through even if it is not found. 3170 */ 3171 } 3172 3173 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3174 } 3175 3176 static bool 3177 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3178 { 3179 struct nvme_bdev *nbdev = ctx; 3180 struct nvme_ns *nvme_ns; 3181 struct spdk_nvme_ns *ns; 3182 struct spdk_nvme_ctrlr *ctrlr; 3183 const struct spdk_nvme_ctrlr_data *cdata; 3184 3185 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3186 assert(nvme_ns != NULL); 3187 ns = nvme_ns->ns; 3188 if (ns == NULL) { 3189 return false; 3190 } 3191 3192 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3193 3194 switch (io_type) { 3195 case SPDK_BDEV_IO_TYPE_READ: 3196 case SPDK_BDEV_IO_TYPE_WRITE: 3197 case SPDK_BDEV_IO_TYPE_RESET: 3198 case SPDK_BDEV_IO_TYPE_FLUSH: 3199 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3200 case SPDK_BDEV_IO_TYPE_NVME_IO: 3201 case SPDK_BDEV_IO_TYPE_ABORT: 3202 return true; 3203 3204 case SPDK_BDEV_IO_TYPE_COMPARE: 3205 return spdk_nvme_ns_supports_compare(ns); 3206 3207 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3208 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3209 3210 case SPDK_BDEV_IO_TYPE_UNMAP: 3211 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3212 return cdata->oncs.dsm; 3213 3214 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3215 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3216 return cdata->oncs.write_zeroes; 3217 3218 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3219 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3220 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3221 return true; 3222 } 3223 return false; 3224 3225 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3226 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3227 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3228 3229 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3230 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3231 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3232 3233 case SPDK_BDEV_IO_TYPE_COPY: 3234 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3235 return cdata->oncs.copy; 3236 3237 default: 3238 return false; 3239 } 3240 } 3241 3242 static int 3243 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3244 { 3245 struct nvme_qpair *nvme_qpair; 3246 struct spdk_io_channel *pg_ch; 3247 int rc; 3248 3249 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3250 if (!nvme_qpair) { 3251 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3252 return -1; 3253 } 3254 3255 TAILQ_INIT(&nvme_qpair->io_path_list); 3256 3257 nvme_qpair->ctrlr = nvme_ctrlr; 3258 nvme_qpair->ctrlr_ch = ctrlr_ch; 3259 3260 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3261 if (!pg_ch) { 3262 free(nvme_qpair); 3263 return -1; 3264 } 3265 3266 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3267 3268 #ifdef SPDK_CONFIG_VTUNE 3269 nvme_qpair->group->collect_spin_stat = true; 3270 #else 3271 nvme_qpair->group->collect_spin_stat = false; 3272 #endif 3273 3274 if (!nvme_ctrlr->disabled) { 3275 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3276 * be created when it's enabled. 3277 */ 3278 rc = bdev_nvme_create_qpair(nvme_qpair); 3279 if (rc != 0) { 3280 /* nvme_ctrlr can't create IO qpair if connection is down. 3281 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3282 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3283 * submitted IO will be queued until IO qpair is successfully created. 3284 * 3285 * Hence, if both are satisfied, ignore the failure. 
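* Otherwise the failure is propagated and creation of this ctrlr_channel fails immediately.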
3286 */ 3287 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3288 spdk_put_io_channel(pg_ch); 3289 free(nvme_qpair); 3290 return rc; 3291 } 3292 } 3293 } 3294 3295 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3296 3297 ctrlr_ch->qpair = nvme_qpair; 3298 3299 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3300 nvme_qpair->ctrlr->ref++; 3301 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3302 3303 return 0; 3304 } 3305 3306 static int 3307 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3308 { 3309 struct nvme_ctrlr *nvme_ctrlr = io_device; 3310 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3311 3312 TAILQ_INIT(&ctrlr_ch->pending_resets); 3313 3314 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3315 } 3316 3317 static void 3318 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3319 { 3320 struct nvme_io_path *io_path, *next; 3321 3322 assert(nvme_qpair->group != NULL); 3323 3324 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3325 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3326 nvme_io_path_free(io_path); 3327 } 3328 3329 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3330 3331 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3332 3333 nvme_ctrlr_release(nvme_qpair->ctrlr); 3334 3335 free(nvme_qpair); 3336 } 3337 3338 static void 3339 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3340 { 3341 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3342 struct nvme_qpair *nvme_qpair; 3343 3344 nvme_qpair = ctrlr_ch->qpair; 3345 assert(nvme_qpair != NULL); 3346 3347 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3348 3349 if (nvme_qpair->qpair != NULL) { 3350 if (ctrlr_ch->reset_iter == NULL) { 3351 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3352 } else { 3353 /* Skip current ctrlr_channel in a full reset sequence because 3354 * it is being deleted now. The qpair is already being disconnected. 3355 * We do not have to restart disconnecting it. 3356 */ 3357 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3358 } 3359 3360 /* We cannot release a reference to the poll group now. 3361 * The qpair may be disconnected asynchronously later. 3362 * We need to poll it until it is actually disconnected. 3363 * Just detach the qpair from the deleting ctrlr_channel. 
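* The detached nvme_qpair is freed later, once the disconnect actually completes.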
3364 */ 3365 nvme_qpair->ctrlr_ch = NULL; 3366 } else { 3367 assert(ctrlr_ch->reset_iter == NULL); 3368 3369 nvme_qpair_delete(nvme_qpair); 3370 } 3371 } 3372 3373 static inline struct spdk_io_channel * 3374 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3375 { 3376 if (spdk_unlikely(!group->accel_channel)) { 3377 group->accel_channel = spdk_accel_get_io_channel(); 3378 if (!group->accel_channel) { 3379 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3380 group); 3381 return NULL; 3382 } 3383 } 3384 3385 return group->accel_channel; 3386 } 3387 3388 static void 3389 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3390 uint32_t iov_cnt, uint32_t seed, 3391 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3392 { 3393 struct spdk_io_channel *accel_ch; 3394 struct nvme_poll_group *group = ctx; 3395 int rc; 3396 3397 assert(cb_fn != NULL); 3398 3399 accel_ch = bdev_nvme_get_accel_channel(group); 3400 if (spdk_unlikely(accel_ch == NULL)) { 3401 cb_fn(cb_arg, -ENOMEM); 3402 return; 3403 } 3404 3405 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3406 if (rc) { 3407 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3408 if (rc == -ENOMEM || rc == -EINVAL) { 3409 cb_fn(cb_arg, rc); 3410 } 3411 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3412 } 3413 } 3414 3415 static void 3416 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3417 { 3418 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3419 } 3420 3421 static void 3422 bdev_nvme_abort_sequence(void *seq) 3423 { 3424 spdk_accel_sequence_abort(seq); 3425 } 3426 3427 static void 3428 bdev_nvme_reverse_sequence(void *seq) 3429 { 3430 spdk_accel_sequence_reverse(seq); 3431 } 3432 3433 static int 3434 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3435 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3436 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3437 { 3438 struct spdk_io_channel *ch; 3439 struct nvme_poll_group *group = ctx; 3440 3441 ch = bdev_nvme_get_accel_channel(group); 3442 if (spdk_unlikely(ch == NULL)) { 3443 return -ENOMEM; 3444 } 3445 3446 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3447 domain, domain_ctx, seed, cb_fn, cb_arg); 3448 } 3449 3450 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3451 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3452 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3453 .append_crc32c = bdev_nvme_append_crc32c, 3454 .finish_sequence = bdev_nvme_finish_sequence, 3455 .reverse_sequence = bdev_nvme_reverse_sequence, 3456 .abort_sequence = bdev_nvme_abort_sequence, 3457 }; 3458 3459 static int 3460 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3461 { 3462 struct nvme_poll_group *group = ctx_buf; 3463 3464 TAILQ_INIT(&group->qpair_list); 3465 3466 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3467 if (group->group == NULL) { 3468 return -1; 3469 } 3470 3471 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3472 3473 if (group->poller == NULL) { 3474 spdk_nvme_poll_group_destroy(group->group); 3475 return -1; 3476 } 3477 3478 return 0; 3479 } 3480 3481 static void 3482 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3483 { 3484 struct 
nvme_poll_group *group = ctx_buf; 3485 3486 assert(TAILQ_EMPTY(&group->qpair_list)); 3487 3488 if (group->accel_channel) { 3489 spdk_put_io_channel(group->accel_channel); 3490 } 3491 3492 spdk_poller_unregister(&group->poller); 3493 if (spdk_nvme_poll_group_destroy(group->group)) { 3494 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3495 assert(false); 3496 } 3497 } 3498 3499 static struct spdk_io_channel * 3500 bdev_nvme_get_io_channel(void *ctx) 3501 { 3502 struct nvme_bdev *nvme_bdev = ctx; 3503 3504 return spdk_get_io_channel(nvme_bdev); 3505 } 3506 3507 static void * 3508 bdev_nvme_get_module_ctx(void *ctx) 3509 { 3510 struct nvme_bdev *nvme_bdev = ctx; 3511 struct nvme_ns *nvme_ns; 3512 3513 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3514 return NULL; 3515 } 3516 3517 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3518 if (!nvme_ns) { 3519 return NULL; 3520 } 3521 3522 return nvme_ns->ns; 3523 } 3524 3525 static const char * 3526 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3527 { 3528 switch (ana_state) { 3529 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3530 return "optimized"; 3531 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3532 return "non_optimized"; 3533 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3534 return "inaccessible"; 3535 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3536 return "persistent_loss"; 3537 case SPDK_NVME_ANA_CHANGE_STATE: 3538 return "change"; 3539 default: 3540 return NULL; 3541 } 3542 } 3543 3544 static int 3545 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3546 { 3547 struct spdk_memory_domain **_domains = NULL; 3548 struct nvme_bdev *nbdev = ctx; 3549 struct nvme_ns *nvme_ns; 3550 int i = 0, _array_size = array_size; 3551 int rc = 0; 3552 3553 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3554 if (domains && array_size >= i) { 3555 _domains = &domains[i]; 3556 } else { 3557 _domains = NULL; 3558 } 3559 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3560 if (rc > 0) { 3561 i += rc; 3562 if (_array_size >= rc) { 3563 _array_size -= rc; 3564 } else { 3565 _array_size = 0; 3566 } 3567 } else if (rc < 0) { 3568 return rc; 3569 } 3570 } 3571 3572 return i; 3573 } 3574 3575 static const char * 3576 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3577 { 3578 if (nvme_ctrlr->destruct) { 3579 return "deleting"; 3580 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3581 return "failed"; 3582 } else if (nvme_ctrlr->resetting) { 3583 return "resetting"; 3584 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3585 return "reconnect_is_delayed"; 3586 } else if (nvme_ctrlr->disabled) { 3587 return "disabled"; 3588 } else { 3589 return "enabled"; 3590 } 3591 } 3592 3593 void 3594 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3595 { 3596 struct spdk_nvme_transport_id *trid; 3597 const struct spdk_nvme_ctrlr_opts *opts; 3598 const struct spdk_nvme_ctrlr_data *cdata; 3599 struct nvme_path_id *path_id; 3600 3601 spdk_json_write_object_begin(w); 3602 3603 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3604 3605 #ifdef SPDK_CONFIG_NVME_CUSE 3606 size_t cuse_name_size = 128; 3607 char cuse_name[cuse_name_size]; 3608 3609 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3610 if (rc == 0) { 3611 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3612 } 3613 #endif 3614 trid = &nvme_ctrlr->active_path_id->trid; 3615 
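/* Dump the active trid first; any remaining path_ids follow in the "alternate_trids" array. */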
spdk_json_write_named_object_begin(w, "trid"); 3616 nvme_bdev_dump_trid_json(trid, w); 3617 spdk_json_write_object_end(w); 3618 3619 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3620 if (path_id != NULL) { 3621 spdk_json_write_named_array_begin(w, "alternate_trids"); 3622 do { 3623 trid = &path_id->trid; 3624 spdk_json_write_object_begin(w); 3625 nvme_bdev_dump_trid_json(trid, w); 3626 spdk_json_write_object_end(w); 3627 3628 path_id = TAILQ_NEXT(path_id, link); 3629 } while (path_id != NULL); 3630 spdk_json_write_array_end(w); 3631 } 3632 3633 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3634 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3635 3636 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3637 spdk_json_write_named_object_begin(w, "host"); 3638 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3639 spdk_json_write_named_string(w, "addr", opts->src_addr); 3640 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3641 spdk_json_write_object_end(w); 3642 3643 spdk_json_write_object_end(w); 3644 } 3645 3646 static void 3647 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3648 struct nvme_ns *nvme_ns) 3649 { 3650 struct spdk_nvme_ns *ns; 3651 struct spdk_nvme_ctrlr *ctrlr; 3652 const struct spdk_nvme_ctrlr_data *cdata; 3653 const struct spdk_nvme_transport_id *trid; 3654 union spdk_nvme_vs_register vs; 3655 const struct spdk_nvme_ns_data *nsdata; 3656 char buf[128]; 3657 3658 ns = nvme_ns->ns; 3659 if (ns == NULL) { 3660 return; 3661 } 3662 3663 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3664 3665 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3666 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3667 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3668 3669 spdk_json_write_object_begin(w); 3670 3671 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3672 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3673 } 3674 3675 spdk_json_write_named_object_begin(w, "trid"); 3676 3677 nvme_bdev_dump_trid_json(trid, w); 3678 3679 spdk_json_write_object_end(w); 3680 3681 #ifdef SPDK_CONFIG_NVME_CUSE 3682 size_t cuse_name_size = 128; 3683 char cuse_name[cuse_name_size]; 3684 3685 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3686 cuse_name, &cuse_name_size); 3687 if (rc == 0) { 3688 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3689 } 3690 #endif 3691 3692 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3693 3694 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3695 3696 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3697 3698 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3699 spdk_str_trim(buf); 3700 spdk_json_write_named_string(w, "model_number", buf); 3701 3702 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3703 spdk_str_trim(buf); 3704 spdk_json_write_named_string(w, "serial_number", buf); 3705 3706 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3707 spdk_str_trim(buf); 3708 spdk_json_write_named_string(w, "firmware_revision", buf); 3709 3710 if (cdata->subnqn[0] != '\0') { 3711 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3712 } 3713 3714 spdk_json_write_named_object_begin(w, "oacs"); 3715 3716 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3717 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3718 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3719 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3720 3721 spdk_json_write_object_end(w); 3722 3723 
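/* CMIC: Controller Multi-Path I/O and Namespace Sharing Capabilities. */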
spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3724 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3725 3726 spdk_json_write_object_end(w); 3727 3728 spdk_json_write_named_object_begin(w, "vs"); 3729 3730 spdk_json_write_name(w, "nvme_version"); 3731 if (vs.bits.ter) { 3732 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3733 } else { 3734 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3735 } 3736 3737 spdk_json_write_object_end(w); 3738 3739 nsdata = spdk_nvme_ns_get_data(ns); 3740 3741 spdk_json_write_named_object_begin(w, "ns_data"); 3742 3743 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3744 3745 if (cdata->cmic.ana_reporting) { 3746 spdk_json_write_named_string(w, "ana_state", 3747 _nvme_ana_state_str(nvme_ns->ana_state)); 3748 } 3749 3750 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3751 3752 spdk_json_write_object_end(w); 3753 3754 if (cdata->oacs.security) { 3755 spdk_json_write_named_object_begin(w, "security"); 3756 3757 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3758 3759 spdk_json_write_object_end(w); 3760 } 3761 3762 spdk_json_write_object_end(w); 3763 } 3764 3765 static const char * 3766 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3767 { 3768 switch (nbdev->mp_policy) { 3769 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3770 return "active_passive"; 3771 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3772 return "active_active"; 3773 default: 3774 assert(false); 3775 return "invalid"; 3776 } 3777 } 3778 3779 static const char * 3780 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 3781 { 3782 switch (nbdev->mp_selector) { 3783 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 3784 return "round_robin"; 3785 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 3786 return "queue_depth"; 3787 default: 3788 assert(false); 3789 return "invalid"; 3790 } 3791 } 3792 3793 static int 3794 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3795 { 3796 struct nvme_bdev *nvme_bdev = ctx; 3797 struct nvme_ns *nvme_ns; 3798 3799 pthread_mutex_lock(&nvme_bdev->mutex); 3800 spdk_json_write_named_array_begin(w, "nvme"); 3801 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3802 nvme_namespace_info_json(w, nvme_ns); 3803 } 3804 spdk_json_write_array_end(w); 3805 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3806 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 3807 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 3808 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 3809 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 3810 } 3811 } 3812 pthread_mutex_unlock(&nvme_bdev->mutex); 3813 3814 return 0; 3815 } 3816 3817 static void 3818 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3819 { 3820 /* No config per bdev needed */ 3821 } 3822 3823 static uint64_t 3824 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3825 { 3826 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3827 struct nvme_io_path *io_path; 3828 struct nvme_poll_group *group; 3829 uint64_t spin_time = 0; 3830 3831 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3832 group = io_path->qpair->group; 3833 3834 if (!group || !group->collect_spin_stat) { 3835 continue; 3836 } 3837 3838 if (group->end_ticks != 0) { 3839 group->spin_ticks += (group->end_ticks - group->start_ticks); 
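/* Clear end_ticks so this completed interval is only accumulated once. */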
3840 group->end_ticks = 0; 3841 } 3842 3843 spin_time += group->spin_ticks; 3844 group->start_ticks = 0; 3845 group->spin_ticks = 0; 3846 } 3847 3848 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3849 } 3850 3851 static void 3852 bdev_nvme_reset_device_stat(void *ctx) 3853 { 3854 struct nvme_bdev *nbdev = ctx; 3855 3856 if (nbdev->err_stat != NULL) { 3857 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3858 } 3859 } 3860 3861 /* JSON string should be lowercases and underscore delimited string. */ 3862 static void 3863 bdev_nvme_format_nvme_status(char *dst, const char *src) 3864 { 3865 char tmp[256]; 3866 3867 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3868 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3869 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3870 spdk_strlwr(dst); 3871 } 3872 3873 static void 3874 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3875 { 3876 struct nvme_bdev *nbdev = ctx; 3877 struct spdk_nvme_status status = {}; 3878 uint16_t sct, sc; 3879 char status_json[256]; 3880 const char *status_str; 3881 3882 if (nbdev->err_stat == NULL) { 3883 return; 3884 } 3885 3886 spdk_json_write_named_object_begin(w, "nvme_error"); 3887 3888 spdk_json_write_named_object_begin(w, "status_type"); 3889 for (sct = 0; sct < 8; sct++) { 3890 if (nbdev->err_stat->status_type[sct] == 0) { 3891 continue; 3892 } 3893 status.sct = sct; 3894 3895 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3896 assert(status_str != NULL); 3897 bdev_nvme_format_nvme_status(status_json, status_str); 3898 3899 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3900 } 3901 spdk_json_write_object_end(w); 3902 3903 spdk_json_write_named_object_begin(w, "status_code"); 3904 for (sct = 0; sct < 4; sct++) { 3905 status.sct = sct; 3906 for (sc = 0; sc < 256; sc++) { 3907 if (nbdev->err_stat->status[sct][sc] == 0) { 3908 continue; 3909 } 3910 status.sc = sc; 3911 3912 status_str = spdk_nvme_cpl_get_status_string(&status); 3913 assert(status_str != NULL); 3914 bdev_nvme_format_nvme_status(status_json, status_str); 3915 3916 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3917 } 3918 } 3919 spdk_json_write_object_end(w); 3920 3921 spdk_json_write_object_end(w); 3922 } 3923 3924 static bool 3925 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3926 { 3927 struct nvme_bdev *nbdev = ctx; 3928 struct spdk_nvme_ctrlr *ctrlr; 3929 3930 if (!g_opts.allow_accel_sequence) { 3931 return false; 3932 } 3933 3934 switch (type) { 3935 case SPDK_BDEV_IO_TYPE_WRITE: 3936 case SPDK_BDEV_IO_TYPE_READ: 3937 break; 3938 default: 3939 return false; 3940 } 3941 3942 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3943 assert(ctrlr != NULL); 3944 3945 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3946 } 3947 3948 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3949 .destruct = bdev_nvme_destruct, 3950 .submit_request = bdev_nvme_submit_request, 3951 .io_type_supported = bdev_nvme_io_type_supported, 3952 .get_io_channel = bdev_nvme_get_io_channel, 3953 .dump_info_json = bdev_nvme_dump_info_json, 3954 .write_config_json = bdev_nvme_write_config_json, 3955 .get_spin_time = bdev_nvme_get_spin_time, 3956 .get_module_ctx = bdev_nvme_get_module_ctx, 3957 .get_memory_domains = bdev_nvme_get_memory_domains, 3958 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3959 .reset_device_stat = bdev_nvme_reset_device_stat, 3960 .dump_device_stat_json = 
bdev_nvme_dump_device_stat_json, 3961 }; 3962 3963 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3964 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3965 3966 static int 3967 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3968 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3969 { 3970 struct spdk_nvme_ana_group_descriptor *copied_desc; 3971 uint8_t *orig_desc; 3972 uint32_t i, desc_size, copy_len; 3973 int rc = 0; 3974 3975 if (nvme_ctrlr->ana_log_page == NULL) { 3976 return -EINVAL; 3977 } 3978 3979 copied_desc = nvme_ctrlr->copied_ana_desc; 3980 3981 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3982 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3983 3984 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3985 memcpy(copied_desc, orig_desc, copy_len); 3986 3987 rc = cb_fn(copied_desc, cb_arg); 3988 if (rc != 0) { 3989 break; 3990 } 3991 3992 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3993 copied_desc->num_of_nsid * sizeof(uint32_t); 3994 orig_desc += desc_size; 3995 copy_len -= desc_size; 3996 } 3997 3998 return rc; 3999 } 4000 4001 static int 4002 nvme_ns_ana_transition_timedout(void *ctx) 4003 { 4004 struct nvme_ns *nvme_ns = ctx; 4005 4006 spdk_poller_unregister(&nvme_ns->anatt_timer); 4007 nvme_ns->ana_transition_timedout = true; 4008 4009 return SPDK_POLLER_BUSY; 4010 } 4011 4012 static void 4013 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4014 const struct spdk_nvme_ana_group_descriptor *desc) 4015 { 4016 const struct spdk_nvme_ctrlr_data *cdata; 4017 4018 nvme_ns->ana_group_id = desc->ana_group_id; 4019 nvme_ns->ana_state = desc->ana_state; 4020 nvme_ns->ana_state_updating = false; 4021 4022 switch (nvme_ns->ana_state) { 4023 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4024 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4025 nvme_ns->ana_transition_timedout = false; 4026 spdk_poller_unregister(&nvme_ns->anatt_timer); 4027 break; 4028 4029 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4030 case SPDK_NVME_ANA_CHANGE_STATE: 4031 if (nvme_ns->anatt_timer != NULL) { 4032 break; 4033 } 4034 4035 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4036 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4037 nvme_ns, 4038 cdata->anatt * SPDK_SEC_TO_USEC); 4039 break; 4040 default: 4041 break; 4042 } 4043 } 4044 4045 static int 4046 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4047 { 4048 struct nvme_ns *nvme_ns = cb_arg; 4049 uint32_t i; 4050 4051 assert(nvme_ns->ns != NULL); 4052 4053 for (i = 0; i < desc->num_of_nsid; i++) { 4054 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4055 continue; 4056 } 4057 4058 _nvme_ns_set_ana_state(nvme_ns, desc); 4059 return 1; 4060 } 4061 4062 return 0; 4063 } 4064 4065 static int 4066 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4067 { 4068 int rc = 0; 4069 struct spdk_uuid new_uuid, namespace_uuid; 4070 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4071 /* This namespace UUID was generated using uuid_generate() method. 
*/ 4072 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4073 int size; 4074 4075 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4076 4077 spdk_uuid_set_null(&new_uuid); 4078 spdk_uuid_set_null(&namespace_uuid); 4079 4080 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4081 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4082 return -EINVAL; 4083 } 4084 4085 spdk_uuid_parse(&namespace_uuid, namespace_str); 4086 4087 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4088 if (rc == 0) { 4089 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4090 } 4091 4092 return rc; 4093 } 4094 4095 static int 4096 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4097 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4098 uint32_t prchk_flags, void *ctx) 4099 { 4100 const struct spdk_uuid *uuid; 4101 const uint8_t *nguid; 4102 const struct spdk_nvme_ctrlr_data *cdata; 4103 const struct spdk_nvme_ns_data *nsdata; 4104 const struct spdk_nvme_ctrlr_opts *opts; 4105 enum spdk_nvme_csi csi; 4106 uint32_t atomic_bs, phys_bs, bs; 4107 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4108 int rc; 4109 4110 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4111 csi = spdk_nvme_ns_get_csi(ns); 4112 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4113 4114 switch (csi) { 4115 case SPDK_NVME_CSI_NVM: 4116 disk->product_name = "NVMe disk"; 4117 break; 4118 case SPDK_NVME_CSI_ZNS: 4119 disk->product_name = "NVMe ZNS disk"; 4120 disk->zoned = true; 4121 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4122 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4123 spdk_nvme_ns_get_extended_sector_size(ns); 4124 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4125 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4126 break; 4127 default: 4128 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4129 return -ENOTSUP; 4130 } 4131 4132 nguid = spdk_nvme_ns_get_nguid(ns); 4133 if (!nguid) { 4134 uuid = spdk_nvme_ns_get_uuid(ns); 4135 if (uuid) { 4136 disk->uuid = *uuid; 4137 } else if (g_opts.generate_uuids) { 4138 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4139 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4140 if (rc < 0) { 4141 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4142 return rc; 4143 } 4144 } 4145 } else { 4146 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4147 } 4148 4149 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4150 if (!disk->name) { 4151 return -ENOMEM; 4152 } 4153 4154 disk->write_cache = 0; 4155 if (cdata->vwc.present) { 4156 /* Enable if the Volatile Write Cache exists */ 4157 disk->write_cache = 1; 4158 } 4159 if (cdata->oncs.write_zeroes) { 4160 disk->max_write_zeroes = UINT16_MAX + 1; 4161 } 4162 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4163 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4164 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4165 disk->ctratt.raw = cdata->ctratt.raw; 4166 /* NVMe driver will split one request into multiple requests 4167 * based on MDTS and stripe boundary, the bdev layer will use 4168 * max_segment_size and max_num_segments to split one big IO 4169 * into multiple requests, then small request can't run out 4170 * of NVMe internal requests data structure. 
4171 */ 4172 if (opts && opts->io_queue_requests) { 4173 disk->max_num_segments = opts->io_queue_requests / 2; 4174 } 4175 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4176 /* The nvme driver will try to split I/O that have too many 4177 * SGEs, but it doesn't work if that last SGE doesn't end on 4178 * an aggregate total that is block aligned. The bdev layer has 4179 * a more robust splitting framework, so use that instead for 4180 * this case. (See issue #3269.) 4181 */ 4182 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4183 4184 if (disk->max_num_segments == 0) { 4185 disk->max_num_segments = max_sges; 4186 } else { 4187 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4188 } 4189 } 4190 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4191 4192 nsdata = spdk_nvme_ns_get_data(ns); 4193 bs = spdk_nvme_ns_get_sector_size(ns); 4194 atomic_bs = bs; 4195 phys_bs = bs; 4196 if (nsdata->nabo == 0) { 4197 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4198 atomic_bs = bs * (1 + nsdata->nawupf); 4199 } else { 4200 atomic_bs = bs * (1 + cdata->awupf); 4201 } 4202 } 4203 if (nsdata->nsfeat.optperf) { 4204 phys_bs = bs * (1 + nsdata->npwg); 4205 } 4206 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4207 4208 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4209 if (disk->md_len != 0) { 4210 disk->md_interleave = nsdata->flbas.extended; 4211 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4212 if (disk->dif_type != SPDK_DIF_DISABLE) { 4213 disk->dif_is_head_of_md = nsdata->dps.md_start; 4214 disk->dif_check_flags = prchk_flags; 4215 } 4216 } 4217 4218 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4219 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4220 disk->acwu = 0; 4221 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4222 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4223 } else { 4224 disk->acwu = cdata->acwu + 1; /* 0-based */ 4225 } 4226 4227 if (cdata->oncs.copy) { 4228 /* For now bdev interface allows only single segment copy */ 4229 disk->max_copy = nsdata->mssrl; 4230 } 4231 4232 disk->ctxt = ctx; 4233 disk->fn_table = &nvmelib_fn_table; 4234 disk->module = &nvme_if; 4235 4236 return 0; 4237 } 4238 4239 static struct nvme_bdev * 4240 nvme_bdev_alloc(void) 4241 { 4242 struct nvme_bdev *bdev; 4243 int rc; 4244 4245 bdev = calloc(1, sizeof(*bdev)); 4246 if (!bdev) { 4247 SPDK_ERRLOG("bdev calloc() failed\n"); 4248 return NULL; 4249 } 4250 4251 if (g_opts.nvme_error_stat) { 4252 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4253 if (!bdev->err_stat) { 4254 SPDK_ERRLOG("err_stat calloc() failed\n"); 4255 free(bdev); 4256 return NULL; 4257 } 4258 } 4259 4260 rc = pthread_mutex_init(&bdev->mutex, NULL); 4261 if (rc != 0) { 4262 free(bdev->err_stat); 4263 free(bdev); 4264 return NULL; 4265 } 4266 4267 bdev->ref = 1; 4268 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4269 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4270 bdev->rr_min_io = UINT32_MAX; 4271 TAILQ_INIT(&bdev->nvme_ns_list); 4272 4273 return bdev; 4274 } 4275 4276 static int 4277 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4278 { 4279 struct nvme_bdev *bdev; 4280 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4281 int rc; 4282 4283 bdev = nvme_bdev_alloc(); 4284 if (bdev == NULL) { 4285 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4286 return -ENOMEM; 4287 } 4288 4289 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4290 4291 rc = 
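	     /* Note (illustrative, not in the upstream source): nvme_disk_create() fills in the
	      * spdk_bdev structure from the controller and namespace data and derives the bdev
	      * name as "<base_name>n<nsid>".  For example, a controller registered with base
	      * name "Nvme0" exposes namespace 1 as bdev "Nvme0n1"; "Nvme0" is just an example
	      * name, not something fixed by this code.
	      */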
nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4292 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4293 if (rc != 0) { 4294 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4295 nvme_bdev_free(bdev); 4296 return rc; 4297 } 4298 4299 spdk_io_device_register(bdev, 4300 bdev_nvme_create_bdev_channel_cb, 4301 bdev_nvme_destroy_bdev_channel_cb, 4302 sizeof(struct nvme_bdev_channel), 4303 bdev->disk.name); 4304 4305 nvme_ns->bdev = bdev; 4306 bdev->nsid = nvme_ns->id; 4307 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4308 4309 bdev->nbdev_ctrlr = nbdev_ctrlr; 4310 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4311 4312 rc = spdk_bdev_register(&bdev->disk); 4313 if (rc != 0) { 4314 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4315 spdk_io_device_unregister(bdev, NULL); 4316 nvme_ns->bdev = NULL; 4317 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4318 nvme_bdev_free(bdev); 4319 return rc; 4320 } 4321 4322 return 0; 4323 } 4324 4325 static bool 4326 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4327 { 4328 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4329 const struct spdk_uuid *uuid1, *uuid2; 4330 4331 nsdata1 = spdk_nvme_ns_get_data(ns1); 4332 nsdata2 = spdk_nvme_ns_get_data(ns2); 4333 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4334 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4335 4336 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4337 nsdata1->eui64 == nsdata2->eui64 && 4338 ((uuid1 == NULL && uuid2 == NULL) || 4339 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4340 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4341 } 4342 4343 static bool 4344 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4345 struct spdk_nvme_ctrlr_opts *opts) 4346 { 4347 struct nvme_probe_skip_entry *entry; 4348 4349 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4350 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4351 return false; 4352 } 4353 } 4354 4355 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4356 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4357 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4358 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4359 opts->disable_read_ana_log_page = true; 4360 4361 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4362 4363 return true; 4364 } 4365 4366 static void 4367 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4368 { 4369 struct nvme_ctrlr *nvme_ctrlr = ctx; 4370 4371 if (spdk_nvme_cpl_is_error(cpl)) { 4372 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4373 cpl->status.sct); 4374 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4375 } else if (cpl->cdw0 & 0x1) { 4376 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4377 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4378 } 4379 } 4380 4381 static void 4382 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4383 struct spdk_nvme_qpair *qpair, uint16_t cid) 4384 { 4385 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4386 union spdk_nvme_csts_register csts; 4387 int rc; 4388 4389 assert(nvme_ctrlr->ctrlr == ctrlr); 4390 4391 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4392 4393 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4394 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) 
Otherwise we 4395 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4396 * completion recursively. 4397 */ 4398 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4399 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4400 if (csts.bits.cfs) { 4401 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4402 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4403 return; 4404 } 4405 } 4406 4407 switch (g_opts.action_on_timeout) { 4408 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4409 if (qpair) { 4410 /* Don't send abort to ctrlr when ctrlr is not available. */ 4411 pthread_mutex_lock(&nvme_ctrlr->mutex); 4412 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4413 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4414 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4415 return; 4416 } 4417 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4418 4419 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4420 nvme_abort_cpl, nvme_ctrlr); 4421 if (rc == 0) { 4422 return; 4423 } 4424 4425 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4426 } 4427 4428 /* FALLTHROUGH */ 4429 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4430 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4431 break; 4432 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4433 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4434 break; 4435 default: 4436 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4437 break; 4438 } 4439 } 4440 4441 static struct nvme_ns * 4442 nvme_ns_alloc(void) 4443 { 4444 struct nvme_ns *nvme_ns; 4445 4446 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4447 if (nvme_ns == NULL) { 4448 return NULL; 4449 } 4450 4451 if (g_opts.io_path_stat) { 4452 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4453 if (nvme_ns->stat == NULL) { 4454 free(nvme_ns); 4455 return NULL; 4456 } 4457 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4458 } 4459 4460 return nvme_ns; 4461 } 4462 4463 static void 4464 nvme_ns_free(struct nvme_ns *nvme_ns) 4465 { 4466 free(nvme_ns->stat); 4467 free(nvme_ns); 4468 } 4469 4470 static void 4471 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4472 { 4473 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4474 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4475 4476 if (rc == 0) { 4477 nvme_ns->probe_ctx = NULL; 4478 pthread_mutex_lock(&nvme_ctrlr->mutex); 4479 nvme_ctrlr->ref++; 4480 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4481 } else { 4482 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4483 nvme_ns_free(nvme_ns); 4484 } 4485 4486 if (ctx) { 4487 ctx->populates_in_progress--; 4488 if (ctx->populates_in_progress == 0) { 4489 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4490 } 4491 } 4492 } 4493 4494 static void 4495 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4496 { 4497 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4498 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4499 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4500 int rc; 4501 4502 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4503 if (rc != 0) { 4504 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4505 } 4506 4507 spdk_for_each_channel_continue(i, rc); 4508 } 4509 4510 static void 4511 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4512 { 4513 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4514 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4515 struct 
nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4516 struct nvme_io_path *io_path; 4517 4518 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4519 if (io_path != NULL) { 4520 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4521 } 4522 4523 spdk_for_each_channel_continue(i, 0); 4524 } 4525 4526 static void 4527 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4528 { 4529 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4530 4531 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4532 } 4533 4534 static void 4535 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4536 { 4537 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4538 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4539 4540 if (status == 0) { 4541 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4542 } else { 4543 /* Delete the added io_paths and fail populating the namespace. */ 4544 spdk_for_each_channel(bdev, 4545 bdev_nvme_delete_io_path, 4546 nvme_ns, 4547 bdev_nvme_add_io_path_failed); 4548 } 4549 } 4550 4551 static int 4552 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4553 { 4554 struct nvme_ns *tmp_ns; 4555 const struct spdk_nvme_ns_data *nsdata; 4556 4557 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4558 if (!nsdata->nmic.can_share) { 4559 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4560 return -EINVAL; 4561 } 4562 4563 pthread_mutex_lock(&bdev->mutex); 4564 4565 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4566 assert(tmp_ns != NULL); 4567 4568 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4569 pthread_mutex_unlock(&bdev->mutex); 4570 SPDK_ERRLOG("Namespaces are not identical.\n"); 4571 return -EINVAL; 4572 } 4573 4574 bdev->ref++; 4575 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4576 nvme_ns->bdev = bdev; 4577 4578 pthread_mutex_unlock(&bdev->mutex); 4579 4580 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
*/ 4581 spdk_for_each_channel(bdev, 4582 bdev_nvme_add_io_path, 4583 nvme_ns, 4584 bdev_nvme_add_io_path_done); 4585 4586 return 0; 4587 } 4588 4589 static void 4590 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4591 { 4592 struct spdk_nvme_ns *ns; 4593 struct nvme_bdev *bdev; 4594 int rc = 0; 4595 4596 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4597 if (!ns) { 4598 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4599 rc = -EINVAL; 4600 goto done; 4601 } 4602 4603 nvme_ns->ns = ns; 4604 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4605 4606 if (nvme_ctrlr->ana_log_page != NULL) { 4607 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4608 } 4609 4610 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4611 if (bdev == NULL) { 4612 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4613 } else { 4614 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4615 if (rc == 0) { 4616 return; 4617 } 4618 } 4619 done: 4620 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4621 } 4622 4623 static void 4624 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4625 { 4626 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4627 4628 assert(nvme_ctrlr != NULL); 4629 4630 pthread_mutex_lock(&nvme_ctrlr->mutex); 4631 4632 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4633 4634 if (nvme_ns->bdev != NULL) { 4635 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4636 return; 4637 } 4638 4639 nvme_ns_free(nvme_ns); 4640 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4641 4642 nvme_ctrlr_release(nvme_ctrlr); 4643 } 4644 4645 static void 4646 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4647 { 4648 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4649 4650 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4651 } 4652 4653 static void 4654 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4655 { 4656 struct nvme_bdev *bdev; 4657 4658 spdk_poller_unregister(&nvme_ns->anatt_timer); 4659 4660 bdev = nvme_ns->bdev; 4661 if (bdev != NULL) { 4662 pthread_mutex_lock(&bdev->mutex); 4663 4664 assert(bdev->ref > 0); 4665 bdev->ref--; 4666 if (bdev->ref == 0) { 4667 pthread_mutex_unlock(&bdev->mutex); 4668 4669 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4670 } else { 4671 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4672 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4673 * and clear nvme_ns->bdev here. 4674 */ 4675 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4676 nvme_ns->bdev = NULL; 4677 4678 pthread_mutex_unlock(&bdev->mutex); 4679 4680 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4681 * we call depopulate_namespace_done() to avoid use-after-free. 4682 */ 4683 spdk_for_each_channel(bdev, 4684 bdev_nvme_delete_io_path, 4685 nvme_ns, 4686 bdev_nvme_delete_io_path_done); 4687 return; 4688 } 4689 } 4690 4691 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4692 } 4693 4694 static void 4695 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4696 struct nvme_async_probe_ctx *ctx) 4697 { 4698 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4699 struct nvme_ns *nvme_ns, *next; 4700 struct spdk_nvme_ns *ns; 4701 struct nvme_bdev *bdev; 4702 uint32_t nsid; 4703 int rc; 4704 uint64_t num_sectors; 4705 4706 if (ctx) { 4707 /* Initialize this count to 1 to handle the populate functions 4708 * calling nvme_ctrlr_populate_namespace_done() immediately. 
4709 */ 4710 ctx->populates_in_progress = 1; 4711 } 4712 4713 /* First loop over our existing namespaces and see if they have been 4714 * removed. */ 4715 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4716 while (nvme_ns != NULL) { 4717 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4718 4719 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4720 /* NS is still there or added again. Its attributes may have changed. */ 4721 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4722 if (nvme_ns->ns != ns) { 4723 assert(nvme_ns->ns == NULL); 4724 nvme_ns->ns = ns; 4725 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4726 } 4727 4728 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4729 bdev = nvme_ns->bdev; 4730 assert(bdev != NULL); 4731 if (bdev->disk.blockcnt != num_sectors) { 4732 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4733 nvme_ns->id, 4734 bdev->disk.name, 4735 bdev->disk.blockcnt, 4736 num_sectors); 4737 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4738 if (rc != 0) { 4739 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4740 bdev->disk.name, rc); 4741 } 4742 } 4743 } else { 4744 /* Namespace was removed */ 4745 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4746 } 4747 4748 nvme_ns = next; 4749 } 4750 4751 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4752 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4753 while (nsid != 0) { 4754 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4755 4756 if (nvme_ns == NULL) { 4757 /* Found a new one */ 4758 nvme_ns = nvme_ns_alloc(); 4759 if (nvme_ns == NULL) { 4760 SPDK_ERRLOG("Failed to allocate namespace\n"); 4761 /* This just fails to attach the namespace. It may work on a future attempt. */ 4762 continue; 4763 } 4764 4765 nvme_ns->id = nsid; 4766 nvme_ns->ctrlr = nvme_ctrlr; 4767 4768 nvme_ns->bdev = NULL; 4769 4770 if (ctx) { 4771 ctx->populates_in_progress++; 4772 } 4773 nvme_ns->probe_ctx = ctx; 4774 4775 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4776 4777 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4778 } 4779 4780 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4781 } 4782 4783 if (ctx) { 4784 /* Decrement this count now that the loop is over to account 4785 * for the one we started with. If the count is then 0, we 4786 * know any populate_namespace functions completed immediately, 4787 * so we'll kick the callback here. 
4788 */ 4789 ctx->populates_in_progress--; 4790 if (ctx->populates_in_progress == 0) { 4791 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4792 } 4793 } 4794 4795 } 4796 4797 static void 4798 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4799 { 4800 struct nvme_ns *nvme_ns, *tmp; 4801 4802 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4803 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4804 } 4805 } 4806 4807 static uint32_t 4808 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4809 { 4810 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4811 const struct spdk_nvme_ctrlr_data *cdata; 4812 uint32_t nsid, ns_count = 0; 4813 4814 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4815 4816 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4817 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4818 ns_count++; 4819 } 4820 4821 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4822 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4823 sizeof(uint32_t); 4824 } 4825 4826 static int 4827 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4828 void *cb_arg) 4829 { 4830 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4831 struct nvme_ns *nvme_ns; 4832 uint32_t i, nsid; 4833 4834 for (i = 0; i < desc->num_of_nsid; i++) { 4835 nsid = desc->nsid[i]; 4836 if (nsid == 0) { 4837 continue; 4838 } 4839 4840 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4841 4842 assert(nvme_ns != NULL); 4843 if (nvme_ns == NULL) { 4844 /* Target told us that an inactive namespace had an ANA change */ 4845 continue; 4846 } 4847 4848 _nvme_ns_set_ana_state(nvme_ns, desc); 4849 } 4850 4851 return 0; 4852 } 4853 4854 static void 4855 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4856 { 4857 struct nvme_ns *nvme_ns; 4858 4859 spdk_free(nvme_ctrlr->ana_log_page); 4860 nvme_ctrlr->ana_log_page = NULL; 4861 4862 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4863 nvme_ns != NULL; 4864 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4865 nvme_ns->ana_state_updating = false; 4866 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4867 } 4868 } 4869 4870 static void 4871 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4872 { 4873 struct nvme_ctrlr *nvme_ctrlr = ctx; 4874 4875 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4876 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4877 nvme_ctrlr); 4878 } else { 4879 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4880 } 4881 4882 pthread_mutex_lock(&nvme_ctrlr->mutex); 4883 4884 assert(nvme_ctrlr->ana_log_page_updating == true); 4885 nvme_ctrlr->ana_log_page_updating = false; 4886 4887 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4888 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4889 4890 nvme_ctrlr_unregister(nvme_ctrlr); 4891 } else { 4892 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4893 4894 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4895 } 4896 } 4897 4898 static int 4899 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4900 { 4901 uint32_t ana_log_page_size; 4902 int rc; 4903 4904 if (nvme_ctrlr->ana_log_page == NULL) { 4905 return -EINVAL; 4906 } 4907 4908 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4909 4910 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4911 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4912 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4913 
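		/* Hedged note (not in the upstream source): max_ana_log_page_size was sized in
		 * nvme_ctrlr_init_ana_log_page() for cdata->nanagrpid groups and cdata->mnan
		 * namespaces, so reaching this path suggests the target now reports more active
		 * namespaces than it originally advertised.  The read is refused here rather than
		 * risking an overflow of the preallocated ana_log_page buffer.
		 */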
return -EINVAL; 4914 } 4915 4916 pthread_mutex_lock(&nvme_ctrlr->mutex); 4917 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4918 nvme_ctrlr->ana_log_page_updating) { 4919 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4920 return -EBUSY; 4921 } 4922 4923 nvme_ctrlr->ana_log_page_updating = true; 4924 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4925 4926 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4927 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4928 SPDK_NVME_GLOBAL_NS_TAG, 4929 nvme_ctrlr->ana_log_page, 4930 ana_log_page_size, 0, 4931 nvme_ctrlr_read_ana_log_page_done, 4932 nvme_ctrlr); 4933 if (rc != 0) { 4934 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4935 } 4936 4937 return rc; 4938 } 4939 4940 static void 4941 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4942 { 4943 } 4944 4945 struct bdev_nvme_set_preferred_path_ctx { 4946 struct spdk_bdev_desc *desc; 4947 struct nvme_ns *nvme_ns; 4948 bdev_nvme_set_preferred_path_cb cb_fn; 4949 void *cb_arg; 4950 }; 4951 4952 static void 4953 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4954 { 4955 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4956 4957 assert(ctx != NULL); 4958 assert(ctx->desc != NULL); 4959 assert(ctx->cb_fn != NULL); 4960 4961 spdk_bdev_close(ctx->desc); 4962 4963 ctx->cb_fn(ctx->cb_arg, status); 4964 4965 free(ctx); 4966 } 4967 4968 static void 4969 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4970 { 4971 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4972 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4973 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4974 struct nvme_io_path *io_path, *prev; 4975 4976 prev = NULL; 4977 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4978 if (io_path->nvme_ns == ctx->nvme_ns) { 4979 break; 4980 } 4981 prev = io_path; 4982 } 4983 4984 if (io_path != NULL) { 4985 if (prev != NULL) { 4986 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4987 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4988 } 4989 4990 /* We can set io_path to nbdev_ch->current_io_path directly here. 4991 * However, it needs to be conditional. To simplify the code, 4992 * just clear nbdev_ch->current_io_path and let find_io_path() 4993 * fill it. 4994 * 4995 * Automatic failback may be disabled. Hence even if the io_path is 4996 * already at the head, clear nbdev_ch->current_io_path. 4997 */ 4998 bdev_nvme_clear_current_io_path(nbdev_ch); 4999 } 5000 5001 spdk_for_each_channel_continue(i, 0); 5002 } 5003 5004 static struct nvme_ns * 5005 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5006 { 5007 struct nvme_ns *nvme_ns, *prev; 5008 const struct spdk_nvme_ctrlr_data *cdata; 5009 5010 prev = NULL; 5011 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5012 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5013 5014 if (cdata->cntlid == cntlid) { 5015 break; 5016 } 5017 prev = nvme_ns; 5018 } 5019 5020 if (nvme_ns != NULL && prev != NULL) { 5021 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5022 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5023 } 5024 5025 return nvme_ns; 5026 } 5027 5028 /* This function supports only multipath mode. There is only a single I/O path 5029 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5030 * head of the I/O path list for each NVMe bdev channel. 
5031 * 5032 * NVMe bdev channel may be acquired after completing this function. move the 5033 * matched namespace to the head of the namespace list for the NVMe bdev too. 5034 */ 5035 void 5036 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5037 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5038 { 5039 struct bdev_nvme_set_preferred_path_ctx *ctx; 5040 struct spdk_bdev *bdev; 5041 struct nvme_bdev *nbdev; 5042 int rc = 0; 5043 5044 assert(cb_fn != NULL); 5045 5046 ctx = calloc(1, sizeof(*ctx)); 5047 if (ctx == NULL) { 5048 SPDK_ERRLOG("Failed to alloc context.\n"); 5049 rc = -ENOMEM; 5050 goto err_alloc; 5051 } 5052 5053 ctx->cb_fn = cb_fn; 5054 ctx->cb_arg = cb_arg; 5055 5056 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5057 if (rc != 0) { 5058 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5059 goto err_open; 5060 } 5061 5062 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5063 5064 if (bdev->module != &nvme_if) { 5065 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5066 rc = -ENODEV; 5067 goto err_bdev; 5068 } 5069 5070 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5071 5072 pthread_mutex_lock(&nbdev->mutex); 5073 5074 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5075 if (ctx->nvme_ns == NULL) { 5076 pthread_mutex_unlock(&nbdev->mutex); 5077 5078 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5079 rc = -ENODEV; 5080 goto err_bdev; 5081 } 5082 5083 pthread_mutex_unlock(&nbdev->mutex); 5084 5085 spdk_for_each_channel(nbdev, 5086 _bdev_nvme_set_preferred_path, 5087 ctx, 5088 bdev_nvme_set_preferred_path_done); 5089 return; 5090 5091 err_bdev: 5092 spdk_bdev_close(ctx->desc); 5093 err_open: 5094 free(ctx); 5095 err_alloc: 5096 cb_fn(cb_arg, rc); 5097 } 5098 5099 struct bdev_nvme_set_multipath_policy_ctx { 5100 struct spdk_bdev_desc *desc; 5101 bdev_nvme_set_multipath_policy_cb cb_fn; 5102 void *cb_arg; 5103 }; 5104 5105 static void 5106 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5107 { 5108 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5109 5110 assert(ctx != NULL); 5111 assert(ctx->desc != NULL); 5112 assert(ctx->cb_fn != NULL); 5113 5114 spdk_bdev_close(ctx->desc); 5115 5116 ctx->cb_fn(ctx->cb_arg, status); 5117 5118 free(ctx); 5119 } 5120 5121 static void 5122 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5123 { 5124 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5125 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5126 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5127 5128 nbdev_ch->mp_policy = nbdev->mp_policy; 5129 nbdev_ch->mp_selector = nbdev->mp_selector; 5130 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5131 bdev_nvme_clear_current_io_path(nbdev_ch); 5132 5133 spdk_for_each_channel_continue(i, 0); 5134 } 5135 5136 void 5137 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5138 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5139 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5140 { 5141 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5142 struct spdk_bdev *bdev; 5143 struct nvme_bdev *nbdev; 5144 int rc; 5145 5146 assert(cb_fn != NULL); 5147 5148 switch (policy) { 5149 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5150 break; 5151 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5152 switch (selector) { 5153 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5154 if (rr_min_io 
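			/* Note (not in the upstream source): UINT32_MAX is the "unset" sentinel that
			 * nvme_bdev_alloc() stores in rr_min_io, so an unspecified value falls back to
			 * switching paths after every I/O (rr_min_io = 1), while an explicit 0 is
			 * rejected as invalid below.
			 */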
== UINT32_MAX) { 5155 rr_min_io = 1; 5156 } else if (rr_min_io == 0) { 5157 rc = -EINVAL; 5158 goto exit; 5159 } 5160 break; 5161 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5162 break; 5163 default: 5164 rc = -EINVAL; 5165 goto exit; 5166 } 5167 break; 5168 default: 5169 rc = -EINVAL; 5170 goto exit; 5171 } 5172 5173 ctx = calloc(1, sizeof(*ctx)); 5174 if (ctx == NULL) { 5175 SPDK_ERRLOG("Failed to alloc context.\n"); 5176 rc = -ENOMEM; 5177 goto exit; 5178 } 5179 5180 ctx->cb_fn = cb_fn; 5181 ctx->cb_arg = cb_arg; 5182 5183 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5184 if (rc != 0) { 5185 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5186 rc = -ENODEV; 5187 goto err_open; 5188 } 5189 5190 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5191 if (bdev->module != &nvme_if) { 5192 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5193 rc = -ENODEV; 5194 goto err_module; 5195 } 5196 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5197 5198 pthread_mutex_lock(&nbdev->mutex); 5199 nbdev->mp_policy = policy; 5200 nbdev->mp_selector = selector; 5201 nbdev->rr_min_io = rr_min_io; 5202 pthread_mutex_unlock(&nbdev->mutex); 5203 5204 spdk_for_each_channel(nbdev, 5205 _bdev_nvme_set_multipath_policy, 5206 ctx, 5207 bdev_nvme_set_multipath_policy_done); 5208 return; 5209 5210 err_module: 5211 spdk_bdev_close(ctx->desc); 5212 err_open: 5213 free(ctx); 5214 exit: 5215 cb_fn(cb_arg, rc); 5216 } 5217 5218 static void 5219 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5220 { 5221 struct nvme_ctrlr *nvme_ctrlr = arg; 5222 union spdk_nvme_async_event_completion event; 5223 5224 if (spdk_nvme_cpl_is_error(cpl)) { 5225 SPDK_WARNLOG("AER request execute failed\n"); 5226 return; 5227 } 5228 5229 event.raw = cpl->cdw0; 5230 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5231 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5232 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5233 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5234 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5235 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5236 } 5237 } 5238 5239 static void 5240 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5241 { 5242 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5243 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5244 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5245 free(ctx); 5246 } 5247 5248 static void 5249 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5250 { 5251 if (ctx->cb_fn) { 5252 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5253 } 5254 5255 ctx->namespaces_populated = true; 5256 if (ctx->probe_done) { 5257 /* The probe was already completed, so we need to free the context 5258 * here. This can happen for cases like OCSSD, where we need to 5259 * send additional commands to the SSD after attach. 
5260 */ 5261 free_nvme_async_probe_ctx(ctx); 5262 } 5263 } 5264 5265 static void 5266 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5267 struct nvme_async_probe_ctx *ctx) 5268 { 5269 spdk_io_device_register(nvme_ctrlr, 5270 bdev_nvme_create_ctrlr_channel_cb, 5271 bdev_nvme_destroy_ctrlr_channel_cb, 5272 sizeof(struct nvme_ctrlr_channel), 5273 nvme_ctrlr->nbdev_ctrlr->name); 5274 5275 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5276 } 5277 5278 static void 5279 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5280 { 5281 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5282 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5283 5284 nvme_ctrlr->probe_ctx = NULL; 5285 5286 if (spdk_nvme_cpl_is_error(cpl)) { 5287 nvme_ctrlr_delete(nvme_ctrlr); 5288 5289 if (ctx != NULL) { 5290 ctx->reported_bdevs = 0; 5291 populate_namespaces_cb(ctx, -1); 5292 } 5293 return; 5294 } 5295 5296 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5297 } 5298 5299 static int 5300 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5301 struct nvme_async_probe_ctx *ctx) 5302 { 5303 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5304 const struct spdk_nvme_ctrlr_data *cdata; 5305 uint32_t ana_log_page_size; 5306 5307 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5308 5309 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5310 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5311 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5312 sizeof(uint32_t); 5313 5314 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5315 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5316 if (nvme_ctrlr->ana_log_page == NULL) { 5317 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5318 return -ENXIO; 5319 } 5320 5321 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5322 * Hence copy each descriptor to a temporary area when parsing it. 5323 * 5324 * Allocate a buffer whose size is as large as ANA log page buffer because 5325 * we do not know the size of a descriptor until actually reading it. 5326 */ 5327 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5328 if (nvme_ctrlr->copied_ana_desc == NULL) { 5329 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5330 return -ENOMEM; 5331 } 5332 5333 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5334 5335 nvme_ctrlr->probe_ctx = ctx; 5336 5337 /* Then, set the read size only to include the current active namespaces. */ 5338 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5339 5340 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5341 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5342 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5343 return -EINVAL; 5344 } 5345 5346 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5347 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5348 SPDK_NVME_GLOBAL_NS_TAG, 5349 nvme_ctrlr->ana_log_page, 5350 ana_log_page_size, 0, 5351 nvme_ctrlr_init_ana_log_page_done, 5352 nvme_ctrlr); 5353 } 5354 5355 /* hostnqn and subnqn were already verified before attaching a controller. 5356 * Hence check only the multipath capability and cntlid here. 
5357 */ 5358 static bool 5359 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5360 { 5361 struct nvme_ctrlr *tmp; 5362 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5363 5364 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5365 5366 if (!cdata->cmic.multi_ctrlr) { 5367 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5368 return false; 5369 } 5370 5371 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5372 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5373 5374 if (!tmp_cdata->cmic.multi_ctrlr) { 5375 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5376 return false; 5377 } 5378 if (cdata->cntlid == tmp_cdata->cntlid) { 5379 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5380 return false; 5381 } 5382 } 5383 5384 return true; 5385 } 5386 5387 static int 5388 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5389 { 5390 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5391 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5392 int rc = 0; 5393 5394 pthread_mutex_lock(&g_bdev_nvme_mutex); 5395 5396 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5397 if (nbdev_ctrlr != NULL) { 5398 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5399 rc = -EINVAL; 5400 goto exit; 5401 } 5402 } else { 5403 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5404 if (nbdev_ctrlr == NULL) { 5405 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5406 rc = -ENOMEM; 5407 goto exit; 5408 } 5409 nbdev_ctrlr->name = strdup(name); 5410 if (nbdev_ctrlr->name == NULL) { 5411 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5412 free(nbdev_ctrlr); 5413 goto exit; 5414 } 5415 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5416 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5417 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5418 } 5419 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5420 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5421 exit: 5422 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5423 return rc; 5424 } 5425 5426 static int 5427 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5428 const char *name, 5429 const struct spdk_nvme_transport_id *trid, 5430 struct nvme_async_probe_ctx *ctx) 5431 { 5432 struct nvme_ctrlr *nvme_ctrlr; 5433 struct nvme_path_id *path_id; 5434 const struct spdk_nvme_ctrlr_data *cdata; 5435 int rc; 5436 5437 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5438 if (nvme_ctrlr == NULL) { 5439 SPDK_ERRLOG("Failed to allocate device struct\n"); 5440 return -ENOMEM; 5441 } 5442 5443 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5444 if (rc != 0) { 5445 free(nvme_ctrlr); 5446 return rc; 5447 } 5448 5449 TAILQ_INIT(&nvme_ctrlr->trids); 5450 RB_INIT(&nvme_ctrlr->namespaces); 5451 5452 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5453 if (ctx != NULL) { 5454 if (ctx->drv_opts.tls_psk != NULL) { 5455 nvme_ctrlr->psk = spdk_keyring_get_key( 5456 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5457 if (nvme_ctrlr->psk == NULL) { 5458 /* Could only happen if the key was removed in the meantime */ 5459 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5460 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5461 rc = -ENOKEY; 5462 goto err; 5463 } 5464 } 5465 5466 if (ctx->drv_opts.dhchap_key != NULL) { 5467 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5468 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5469 if (nvme_ctrlr->dhchap_key == NULL) { 5470 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5471 
spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5472 rc = -ENOKEY; 5473 goto err; 5474 } 5475 } 5476 5477 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5478 nvme_ctrlr->dhchap_ctrlr_key = 5479 spdk_keyring_get_key( 5480 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5481 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5482 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5483 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5484 rc = -ENOKEY; 5485 goto err; 5486 } 5487 } 5488 } 5489 5490 path_id = calloc(1, sizeof(*path_id)); 5491 if (path_id == NULL) { 5492 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5493 rc = -ENOMEM; 5494 goto err; 5495 } 5496 5497 path_id->trid = *trid; 5498 if (ctx != NULL) { 5499 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5500 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5501 } 5502 nvme_ctrlr->active_path_id = path_id; 5503 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5504 5505 nvme_ctrlr->thread = spdk_get_thread(); 5506 nvme_ctrlr->ctrlr = ctrlr; 5507 nvme_ctrlr->ref = 1; 5508 5509 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5510 SPDK_ERRLOG("OCSSDs are not supported"); 5511 rc = -ENOTSUP; 5512 goto err; 5513 } 5514 5515 if (ctx != NULL) { 5516 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5517 } else { 5518 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5519 } 5520 5521 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5522 g_opts.nvme_adminq_poll_period_us); 5523 5524 if (g_opts.timeout_us > 0) { 5525 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5526 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5527 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5528 g_opts.timeout_us : g_opts.timeout_admin_us; 5529 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5530 adm_timeout_us, timeout_cb, nvme_ctrlr); 5531 } 5532 5533 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5534 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5535 5536 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5537 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5538 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5539 } 5540 5541 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5542 if (rc != 0) { 5543 goto err; 5544 } 5545 5546 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5547 5548 if (cdata->cmic.ana_reporting) { 5549 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5550 if (rc == 0) { 5551 return 0; 5552 } 5553 } else { 5554 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5555 return 0; 5556 } 5557 5558 err: 5559 nvme_ctrlr_delete(nvme_ctrlr); 5560 return rc; 5561 } 5562 5563 void 5564 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5565 { 5566 opts->prchk_flags = 0; 5567 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5568 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5569 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5570 } 5571 5572 static void 5573 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5574 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5575 { 5576 char *name; 5577 5578 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5579 if (!name) { 5580 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5581 return; 5582 } 5583 5584 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5585 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5586 } else { 5587 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5588 } 5589 5590 free(name); 5591 } 5592 5593 static void 5594 _nvme_ctrlr_destruct(void *ctx) 5595 { 5596 struct nvme_ctrlr *nvme_ctrlr = ctx; 5597 5598 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5599 nvme_ctrlr_release(nvme_ctrlr); 5600 } 5601 5602 static int 5603 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5604 { 5605 struct nvme_probe_skip_entry *entry; 5606 5607 /* The controller's destruction was already started */ 5608 if (nvme_ctrlr->destruct) { 5609 return -EALREADY; 5610 } 5611 5612 if (!hotplug && 5613 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5614 entry = calloc(1, sizeof(*entry)); 5615 if (!entry) { 5616 return -ENOMEM; 5617 } 5618 entry->trid = nvme_ctrlr->active_path_id->trid; 5619 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5620 } 5621 5622 nvme_ctrlr->destruct = true; 5623 return 0; 5624 } 5625 5626 static int 5627 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5628 { 5629 int rc; 5630 5631 pthread_mutex_lock(&nvme_ctrlr->mutex); 5632 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5633 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5634 5635 if (rc == 0) { 5636 _nvme_ctrlr_destruct(nvme_ctrlr); 5637 } else if (rc == -EALREADY) { 5638 rc = 0; 5639 } 5640 5641 return rc; 5642 } 5643 5644 static void 5645 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5646 { 5647 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5648 5649 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5650 } 5651 5652 static int 5653 bdev_nvme_hotplug_probe(void *arg) 5654 { 5655 if (g_hotplug_probe_ctx == NULL) { 5656 spdk_poller_unregister(&g_hotplug_probe_poller); 5657 return 
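		       /* Note (not in the upstream source): returning SPDK_POLLER_IDLE tells the SPDK
			* reactor that this poller did no work on this iteration, which feeds the
			* framework's idle/busy accounting; SPDK_POLLER_BUSY is returned further down
			* when the async probe context was actually polled.
			*/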
SPDK_POLLER_IDLE; 5658 } 5659 5660 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5661 g_hotplug_probe_ctx = NULL; 5662 spdk_poller_unregister(&g_hotplug_probe_poller); 5663 } 5664 5665 return SPDK_POLLER_BUSY; 5666 } 5667 5668 static int 5669 bdev_nvme_hotplug(void *arg) 5670 { 5671 struct spdk_nvme_transport_id trid_pcie; 5672 5673 if (g_hotplug_probe_ctx) { 5674 return SPDK_POLLER_BUSY; 5675 } 5676 5677 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5678 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5679 5680 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5681 hotplug_probe_cb, attach_cb, NULL); 5682 5683 if (g_hotplug_probe_ctx) { 5684 assert(g_hotplug_probe_poller == NULL); 5685 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5686 } 5687 5688 return SPDK_POLLER_BUSY; 5689 } 5690 5691 void 5692 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5693 { 5694 *opts = g_opts; 5695 } 5696 5697 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5698 uint32_t reconnect_delay_sec, 5699 uint32_t fast_io_fail_timeout_sec); 5700 5701 static int 5702 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5703 { 5704 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5705 /* Can't set timeout_admin_us without also setting timeout_us */ 5706 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5707 return -EINVAL; 5708 } 5709 5710 if (opts->bdev_retry_count < -1) { 5711 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5712 return -EINVAL; 5713 } 5714 5715 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5716 opts->reconnect_delay_sec, 5717 opts->fast_io_fail_timeout_sec)) { 5718 return -EINVAL; 5719 } 5720 5721 return 0; 5722 } 5723 5724 int 5725 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5726 { 5727 int ret; 5728 5729 ret = bdev_nvme_validate_opts(opts); 5730 if (ret) { 5731 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5732 return ret; 5733 } 5734 5735 if (g_bdev_nvme_init_thread != NULL) { 5736 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5737 return -EPERM; 5738 } 5739 } 5740 5741 if (opts->rdma_srq_size != 0 || 5742 opts->rdma_max_cq_size != 0 || 5743 opts->rdma_cm_event_timeout_ms != 0) { 5744 struct spdk_nvme_transport_opts drv_opts; 5745 5746 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5747 if (opts->rdma_srq_size != 0) { 5748 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5749 } 5750 if (opts->rdma_max_cq_size != 0) { 5751 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5752 } 5753 if (opts->rdma_cm_event_timeout_ms != 0) { 5754 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5755 } 5756 5757 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5758 if (ret) { 5759 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5760 return ret; 5761 } 5762 } 5763 5764 g_opts = *opts; 5765 5766 return 0; 5767 } 5768 5769 struct set_nvme_hotplug_ctx { 5770 uint64_t period_us; 5771 bool enabled; 5772 spdk_msg_fn fn; 5773 void *fn_ctx; 5774 }; 5775 5776 static void 5777 set_nvme_hotplug_period_cb(void *_ctx) 5778 { 5779 struct set_nvme_hotplug_ctx *ctx = _ctx; 5780 5781 spdk_poller_unregister(&g_hotplug_poller); 5782 if (ctx->enabled) { 5783 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5784 } 5785 5786 g_nvme_hotplug_poll_period_us = ctx->period_us; 5787 
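	/* Hedged note (not in the upstream source): the hotplug poller state globals appear to
	 * be updated only from this callback, and bdev_nvme_set_hotplug() always delivers it to
	 * g_bdev_nvme_init_thread via spdk_thread_send_msg(), so these writes are serialized on
	 * that thread without any extra locking.
	 */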
g_nvme_hotplug_enabled = ctx->enabled; 5788 if (ctx->fn) { 5789 ctx->fn(ctx->fn_ctx); 5790 } 5791 5792 free(ctx); 5793 } 5794 5795 int 5796 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5797 { 5798 struct set_nvme_hotplug_ctx *ctx; 5799 5800 if (enabled == true && !spdk_process_is_primary()) { 5801 return -EPERM; 5802 } 5803 5804 ctx = calloc(1, sizeof(*ctx)); 5805 if (ctx == NULL) { 5806 return -ENOMEM; 5807 } 5808 5809 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5810 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5811 ctx->enabled = enabled; 5812 ctx->fn = cb; 5813 ctx->fn_ctx = cb_ctx; 5814 5815 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5816 return 0; 5817 } 5818 5819 static void 5820 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5821 struct nvme_async_probe_ctx *ctx) 5822 { 5823 struct nvme_ns *nvme_ns; 5824 struct nvme_bdev *nvme_bdev; 5825 size_t j; 5826 5827 assert(nvme_ctrlr != NULL); 5828 5829 if (ctx->names == NULL) { 5830 ctx->reported_bdevs = 0; 5831 populate_namespaces_cb(ctx, 0); 5832 return; 5833 } 5834 5835 /* 5836 * Report the new bdevs that were created in this call. 5837 * There can be more than one bdev per NVMe controller. 5838 */ 5839 j = 0; 5840 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5841 while (nvme_ns != NULL) { 5842 nvme_bdev = nvme_ns->bdev; 5843 if (j < ctx->max_bdevs) { 5844 ctx->names[j] = nvme_bdev->disk.name; 5845 j++; 5846 } else { 5847 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5848 ctx->max_bdevs); 5849 ctx->reported_bdevs = 0; 5850 populate_namespaces_cb(ctx, -ERANGE); 5851 return; 5852 } 5853 5854 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5855 } 5856 5857 ctx->reported_bdevs = j; 5858 populate_namespaces_cb(ctx, 0); 5859 } 5860 5861 static int 5862 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5863 struct spdk_nvme_ctrlr *new_ctrlr, 5864 struct spdk_nvme_transport_id *trid) 5865 { 5866 struct nvme_path_id *tmp_trid; 5867 5868 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5869 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5870 return -ENOTSUP; 5871 } 5872 5873 /* Currently we only support failover to the same transport type. */ 5874 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5875 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5876 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5877 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5878 return -EINVAL; 5879 } 5880 5881 5882 /* Currently we only support failover to the same NQN. */ 5883 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5884 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5885 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5886 return -EINVAL; 5887 } 5888 5889 /* Skip all the other checks if we've already registered this path. 
*/ 5890 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5891 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5892 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5893 trid->subnqn); 5894 return -EALREADY; 5895 } 5896 } 5897 5898 return 0; 5899 } 5900 5901 static int 5902 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5903 struct spdk_nvme_ctrlr *new_ctrlr) 5904 { 5905 struct nvme_ns *nvme_ns; 5906 struct spdk_nvme_ns *new_ns; 5907 5908 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5909 while (nvme_ns != NULL) { 5910 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5911 assert(new_ns != NULL); 5912 5913 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5914 return -EINVAL; 5915 } 5916 5917 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5918 } 5919 5920 return 0; 5921 } 5922 5923 static int 5924 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5925 struct spdk_nvme_transport_id *trid) 5926 { 5927 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5928 5929 new_trid = calloc(1, sizeof(*new_trid)); 5930 if (new_trid == NULL) { 5931 return -ENOMEM; 5932 } 5933 new_trid->trid = *trid; 5934 5935 active_id = nvme_ctrlr->active_path_id; 5936 assert(active_id != NULL); 5937 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5938 5939 /* Skip the active trid so that it is not replaced until it has failed. */ 5940 tmp_trid = TAILQ_NEXT(active_id, link); 5941 if (tmp_trid == NULL) { 5942 goto add_tail; 5943 } 5944 5945 /* A trid is considered failed if its last failed time is non-zero. 5946 * Insert the new alternate trid before any failed trid. 5947 */ 5948 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5949 if (tmp_trid->last_failed_tsc != 0) { 5950 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5951 return 0; 5952 } 5953 } 5954 5955 add_tail: 5956 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5957 return 0; 5958 } 5959 5960 /* This is the case where a secondary path is added to an existing 5961 * nvme_ctrlr for failover. After checking that it can access the same 5962 * namespaces as the primary path, the new path is left disconnected until failover occurs.
5963 */ 5964 static int 5965 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5966 struct spdk_nvme_ctrlr *new_ctrlr, 5967 struct spdk_nvme_transport_id *trid) 5968 { 5969 int rc; 5970 5971 assert(nvme_ctrlr != NULL); 5972 5973 pthread_mutex_lock(&nvme_ctrlr->mutex); 5974 5975 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5976 if (rc != 0) { 5977 goto exit; 5978 } 5979 5980 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5981 if (rc != 0) { 5982 goto exit; 5983 } 5984 5985 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5986 5987 exit: 5988 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5989 5990 spdk_nvme_detach(new_ctrlr); 5991 5992 return rc; 5993 } 5994 5995 static void 5996 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5997 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5998 { 5999 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6000 struct nvme_async_probe_ctx *ctx; 6001 int rc; 6002 6003 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6004 ctx->ctrlr_attached = true; 6005 6006 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6007 if (rc != 0) { 6008 ctx->reported_bdevs = 0; 6009 populate_namespaces_cb(ctx, rc); 6010 } 6011 } 6012 6013 static void 6014 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6015 struct spdk_nvme_ctrlr *ctrlr, 6016 const struct spdk_nvme_ctrlr_opts *opts) 6017 { 6018 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6019 struct nvme_ctrlr *nvme_ctrlr; 6020 struct nvme_async_probe_ctx *ctx; 6021 int rc; 6022 6023 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6024 ctx->ctrlr_attached = true; 6025 6026 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6027 if (nvme_ctrlr) { 6028 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6029 } else { 6030 rc = -ENODEV; 6031 } 6032 6033 ctx->reported_bdevs = 0; 6034 populate_namespaces_cb(ctx, rc); 6035 } 6036 6037 static int 6038 bdev_nvme_async_poll(void *arg) 6039 { 6040 struct nvme_async_probe_ctx *ctx = arg; 6041 int rc; 6042 6043 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6044 if (spdk_unlikely(rc != -EAGAIN)) { 6045 ctx->probe_done = true; 6046 spdk_poller_unregister(&ctx->poller); 6047 if (!ctx->ctrlr_attached) { 6048 /* The probe is done, but no controller was attached. 6049 * That means we had a failure, so report -EIO back to 6050 * the caller (usually the RPC). populate_namespaces_cb() 6051 * will take care of freeing the nvme_async_probe_ctx. 6052 */ 6053 ctx->reported_bdevs = 0; 6054 populate_namespaces_cb(ctx, -EIO); 6055 } else if (ctx->namespaces_populated) { 6056 /* The namespaces for the attached controller were all 6057 * populated and the response was already sent to the 6058 * caller (usually the RPC). So free the context here. 
6059 */ 6060 free_nvme_async_probe_ctx(ctx); 6061 } 6062 } 6063 6064 return SPDK_POLLER_BUSY; 6065 } 6066 6067 static bool 6068 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6069 uint32_t reconnect_delay_sec, 6070 uint32_t fast_io_fail_timeout_sec) 6071 { 6072 if (ctrlr_loss_timeout_sec < -1) { 6073 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6074 return false; 6075 } else if (ctrlr_loss_timeout_sec == -1) { 6076 if (reconnect_delay_sec == 0) { 6077 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6078 return false; 6079 } else if (fast_io_fail_timeout_sec != 0 && 6080 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6081 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6082 return false; 6083 } 6084 } else if (ctrlr_loss_timeout_sec != 0) { 6085 if (reconnect_delay_sec == 0) { 6086 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6087 return false; 6088 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6089 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6090 return false; 6091 } else if (fast_io_fail_timeout_sec != 0) { 6092 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6093 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6094 return false; 6095 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6096 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6097 return false; 6098 } 6099 } 6100 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6101 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6102 return false; 6103 } 6104 6105 return true; 6106 } 6107 6108 static int 6109 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 6110 { 6111 FILE *psk_file; 6112 struct stat statbuf; 6113 int rc; 6114 #define TCP_PSK_INVALID_PERMISSIONS 0177 6115 6116 if (stat(fname, &statbuf) != 0) { 6117 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 6118 return -EACCES; 6119 } 6120 6121 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 6122 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 6123 return -EPERM; 6124 } 6125 if ((size_t)statbuf.st_size >= bufsz) { 6126 SPDK_ERRLOG("Invalid PSK: too long\n"); 6127 return -EINVAL; 6128 } 6129 psk_file = fopen(fname, "r"); 6130 if (psk_file == NULL) { 6131 SPDK_ERRLOG("Could not open PSK file\n"); 6132 return -EINVAL; 6133 } 6134 6135 memset(buf, 0, bufsz); 6136 rc = fread(buf, 1, statbuf.st_size, psk_file); 6137 if (rc != statbuf.st_size) { 6138 SPDK_ERRLOG("Failed to read PSK\n"); 6139 fclose(psk_file); 6140 return -EINVAL; 6141 } 6142 6143 fclose(psk_file); 6144 return 0; 6145 } 6146 6147 int 6148 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6149 const char *base_name, 6150 const char **names, 6151 uint32_t count, 6152 spdk_bdev_create_nvme_fn cb_fn, 6153 void *cb_ctx, 6154 struct spdk_nvme_ctrlr_opts *drv_opts, 6155 struct nvme_ctrlr_opts *bdev_opts, 6156 bool multipath) 6157 { 6158 struct nvme_probe_skip_entry *entry, *tmp; 6159 struct nvme_async_probe_ctx *ctx; 6160 spdk_nvme_attach_cb attach_cb; 6161 int rc, len; 6162 6163 /* TODO expand this check to include both the host and target TRIDs. 6164 * Only if both are the same should we fail.
6165 */ 6166 if (nvme_ctrlr_get(trid) != NULL) { 6167 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6168 return -EEXIST; 6169 } 6170 6171 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6172 6173 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6174 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6175 return -EINVAL; 6176 } 6177 6178 if (bdev_opts != NULL && 6179 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6180 bdev_opts->reconnect_delay_sec, 6181 bdev_opts->fast_io_fail_timeout_sec)) { 6182 return -EINVAL; 6183 } 6184 6185 ctx = calloc(1, sizeof(*ctx)); 6186 if (!ctx) { 6187 return -ENOMEM; 6188 } 6189 ctx->base_name = base_name; 6190 ctx->names = names; 6191 ctx->max_bdevs = count; 6192 ctx->cb_fn = cb_fn; 6193 ctx->cb_ctx = cb_ctx; 6194 ctx->trid = *trid; 6195 6196 if (bdev_opts) { 6197 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6198 } else { 6199 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6200 } 6201 6202 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6203 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6204 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6205 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6206 free(entry); 6207 break; 6208 } 6209 } 6210 } 6211 6212 if (drv_opts) { 6213 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6214 } else { 6215 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6216 } 6217 6218 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6219 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6220 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6221 ctx->drv_opts.disable_read_ana_log_page = true; 6222 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6223 6224 if (ctx->bdev_opts.psk[0] != '\0') { 6225 /* Try to use the keyring first */ 6226 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6227 if (ctx->drv_opts.tls_psk == NULL) { 6228 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6229 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6230 if (rc != 0) { 6231 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6232 free_nvme_async_probe_ctx(ctx); 6233 return rc; 6234 } 6235 } 6236 } 6237 6238 if (ctx->bdev_opts.dhchap_key != NULL) { 6239 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6240 if (ctx->drv_opts.dhchap_key == NULL) { 6241 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6242 ctx->bdev_opts.dhchap_key); 6243 free_nvme_async_probe_ctx(ctx); 6244 return -ENOKEY; 6245 } 6246 6247 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6248 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6249 } 6250 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6251 ctx->drv_opts.dhchap_ctrlr_key = 6252 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6253 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6254 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6255 ctx->bdev_opts.dhchap_ctrlr_key); 6256 free_nvme_async_probe_ctx(ctx); 6257 return -ENOKEY; 6258 } 6259 } 6260 6261 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6262 attach_cb = connect_attach_cb; 6263 } else { 6264 attach_cb = connect_set_failover_cb; 6265 } 6266 6267 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6268 if (ctx->probe_ctx == NULL) { 6269 SPDK_ERRLOG("No controller was found 
with provided trid (traddr: %s)\n", trid->traddr); 6270 free_nvme_async_probe_ctx(ctx); 6271 return -ENODEV; 6272 } 6273 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6274 6275 return 0; 6276 } 6277 6278 struct bdev_nvme_delete_ctx { 6279 char *name; 6280 struct nvme_path_id path_id; 6281 bdev_nvme_delete_done_fn delete_done; 6282 void *delete_done_ctx; 6283 uint64_t timeout_ticks; 6284 struct spdk_poller *poller; 6285 }; 6286 6287 static void 6288 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6289 { 6290 if (ctx != NULL) { 6291 free(ctx->name); 6292 free(ctx); 6293 } 6294 } 6295 6296 static bool 6297 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6298 { 6299 if (path_id->trid.trtype != 0) { 6300 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6301 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6302 return false; 6303 } 6304 } else { 6305 if (path_id->trid.trtype != p->trid.trtype) { 6306 return false; 6307 } 6308 } 6309 } 6310 6311 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6312 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6313 return false; 6314 } 6315 } 6316 6317 if (path_id->trid.adrfam != 0) { 6318 if (path_id->trid.adrfam != p->trid.adrfam) { 6319 return false; 6320 } 6321 } 6322 6323 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6324 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6325 return false; 6326 } 6327 } 6328 6329 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6330 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6331 return false; 6332 } 6333 } 6334 6335 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6336 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6337 return false; 6338 } 6339 } 6340 6341 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6342 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6343 return false; 6344 } 6345 } 6346 6347 return true; 6348 } 6349 6350 static bool 6351 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6352 { 6353 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6354 struct nvme_ctrlr *ctrlr; 6355 struct nvme_path_id *p; 6356 6357 pthread_mutex_lock(&g_bdev_nvme_mutex); 6358 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6359 if (!nbdev_ctrlr) { 6360 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6361 return false; 6362 } 6363 6364 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6365 pthread_mutex_lock(&ctrlr->mutex); 6366 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6367 if (nvme_path_id_compare(p, path_id)) { 6368 pthread_mutex_unlock(&ctrlr->mutex); 6369 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6370 return true; 6371 } 6372 } 6373 pthread_mutex_unlock(&ctrlr->mutex); 6374 } 6375 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6376 6377 return false; 6378 } 6379 6380 static int 6381 bdev_nvme_delete_complete_poll(void *arg) 6382 { 6383 struct bdev_nvme_delete_ctx *ctx = arg; 6384 int rc = 0; 6385 6386 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6387 if (ctx->timeout_ticks > spdk_get_ticks()) { 6388 return SPDK_POLLER_BUSY; 6389 } 6390 6391 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6392 rc = -ETIMEDOUT; 6393 } 6394 6395 spdk_poller_unregister(&ctx->poller); 6396 6397 ctx->delete_done(ctx->delete_done_ctx, rc); 6398 free_bdev_nvme_delete_ctx(ctx); 6399 6400 
return SPDK_POLLER_BUSY; 6401 } 6402 6403 static int 6404 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6405 { 6406 struct nvme_path_id *p, *t; 6407 spdk_msg_fn msg_fn; 6408 int rc = -ENXIO; 6409 6410 pthread_mutex_lock(&nvme_ctrlr->mutex); 6411 6412 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6413 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6414 break; 6415 } 6416 6417 if (!nvme_path_id_compare(p, path_id)) { 6418 continue; 6419 } 6420 6421 /* We are not using the specified path. */ 6422 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6423 free(p); 6424 rc = 0; 6425 } 6426 6427 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6428 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6429 return rc; 6430 } 6431 6432 /* If we made it here, then this path is a match! Now we need to remove it. */ 6433 6434 /* This is the active path in use right now. The active path is always the first in the list. */ 6435 assert(p == nvme_ctrlr->active_path_id); 6436 6437 if (!TAILQ_NEXT(p, link)) { 6438 /* The current path is the only path. */ 6439 msg_fn = _nvme_ctrlr_destruct; 6440 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6441 } else { 6442 /* There is an alternative path. */ 6443 msg_fn = _bdev_nvme_reset_ctrlr; 6444 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6445 } 6446 6447 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6448 6449 if (rc == 0) { 6450 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6451 } else if (rc == -EALREADY) { 6452 rc = 0; 6453 } 6454 6455 return rc; 6456 } 6457 6458 int 6459 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6460 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6461 { 6462 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6463 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6464 struct bdev_nvme_delete_ctx *ctx = NULL; 6465 int rc = -ENXIO, _rc; 6466 6467 if (name == NULL || path_id == NULL) { 6468 rc = -EINVAL; 6469 goto exit; 6470 } 6471 6472 pthread_mutex_lock(&g_bdev_nvme_mutex); 6473 6474 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6475 if (nbdev_ctrlr == NULL) { 6476 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6477 6478 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6479 rc = -ENODEV; 6480 goto exit; 6481 } 6482 6483 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6484 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6485 if (_rc < 0 && _rc != -ENXIO) { 6486 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6487 rc = _rc; 6488 goto exit; 6489 } else if (_rc == 0) { 6490 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6491 * was deleted successfully. To remember the successful deletion, 6492 * overwrite rc only if _rc is zero. 
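 * For example, if the path matches on only one of three nvme_ctrlrs, that call
 * returns 0 while the others return -ENXIO, so the overall delete is still reported
 * as successful.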
6493 */ 6494 rc = 0; 6495 } 6496 } 6497 6498 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6499 6500 if (rc != 0 || delete_done == NULL) { 6501 goto exit; 6502 } 6503 6504 ctx = calloc(1, sizeof(*ctx)); 6505 if (ctx == NULL) { 6506 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6507 rc = -ENOMEM; 6508 goto exit; 6509 } 6510 6511 ctx->name = strdup(name); 6512 if (ctx->name == NULL) { 6513 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6514 rc = -ENOMEM; 6515 goto exit; 6516 } 6517 6518 ctx->delete_done = delete_done; 6519 ctx->delete_done_ctx = delete_done_ctx; 6520 ctx->path_id = *path_id; 6521 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6522 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6523 if (ctx->poller == NULL) { 6524 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6525 rc = -ENOMEM; 6526 goto exit; 6527 } 6528 6529 exit: 6530 if (rc != 0) { 6531 free_bdev_nvme_delete_ctx(ctx); 6532 } 6533 6534 return rc; 6535 } 6536 6537 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6538 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6539 6540 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6541 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6542 6543 struct discovery_entry_ctx { 6544 char name[128]; 6545 struct spdk_nvme_transport_id trid; 6546 struct spdk_nvme_ctrlr_opts drv_opts; 6547 struct spdk_nvmf_discovery_log_page_entry entry; 6548 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6549 struct discovery_ctx *ctx; 6550 }; 6551 6552 struct discovery_ctx { 6553 char *name; 6554 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6555 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6556 void *cb_ctx; 6557 struct spdk_nvme_probe_ctx *probe_ctx; 6558 struct spdk_nvme_detach_ctx *detach_ctx; 6559 struct spdk_nvme_ctrlr *ctrlr; 6560 struct spdk_nvme_transport_id trid; 6561 struct discovery_entry_ctx *entry_ctx_in_use; 6562 struct spdk_poller *poller; 6563 struct spdk_nvme_ctrlr_opts drv_opts; 6564 struct nvme_ctrlr_opts bdev_opts; 6565 struct spdk_nvmf_discovery_log_page *log_page; 6566 TAILQ_ENTRY(discovery_ctx) tailq; 6567 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6568 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6569 int rc; 6570 bool wait_for_attach; 6571 uint64_t timeout_ticks; 6572 /* Denotes that the discovery service is being started. We're waiting 6573 * for the initial connection to the discovery controller to be 6574 * established and attach discovered NVM ctrlrs. 6575 */ 6576 bool initializing; 6577 /* Denotes if a discovery is currently in progress for this context. 6578 * That includes connecting to newly discovered subsystems. Used to 6579 * ensure we do not start a new discovery until an existing one is 6580 * complete. 6581 */ 6582 bool in_progress; 6583 6584 /* Denotes if another discovery is needed after the one in progress 6585 * completes. Set when we receive an AER completion while a discovery 6586 * is already in progress. 6587 */ 6588 bool pending; 6589 6590 /* Signal to the discovery context poller that it should stop the 6591 * discovery service, including detaching from the current discovery 6592 * controller. 6593 */ 6594 bool stop; 6595 6596 struct spdk_thread *calling_thread; 6597 uint32_t index; 6598 uint32_t attach_in_progress; 6599 char *hostnqn; 6600 6601 /* Denotes if the discovery service was started by the mdns discovery. 
6602 */ 6603 bool from_mdns_discovery_service; 6604 }; 6605 6606 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6607 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6608 6609 static void get_discovery_log_page(struct discovery_ctx *ctx); 6610 6611 static void 6612 free_discovery_ctx(struct discovery_ctx *ctx) 6613 { 6614 free(ctx->log_page); 6615 free(ctx->hostnqn); 6616 free(ctx->name); 6617 free(ctx); 6618 } 6619 6620 static void 6621 discovery_complete(struct discovery_ctx *ctx) 6622 { 6623 ctx->initializing = false; 6624 ctx->in_progress = false; 6625 if (ctx->pending) { 6626 ctx->pending = false; 6627 get_discovery_log_page(ctx); 6628 } 6629 } 6630 6631 static void 6632 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6633 struct spdk_nvmf_discovery_log_page_entry *entry) 6634 { 6635 char *space; 6636 6637 trid->trtype = entry->trtype; 6638 trid->adrfam = entry->adrfam; 6639 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6640 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6641 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6642 * before call to this function trid->subnqn is zeroed out, we need 6643 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6644 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6645 */ 6646 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6647 6648 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6649 * But the log page entries typically pad them with spaces, not zeroes. 6650 * So add a NULL terminator to each of these fields at the appropriate 6651 * location. 6652 */ 6653 space = strchr(trid->traddr, ' '); 6654 if (space) { 6655 *space = 0; 6656 } 6657 space = strchr(trid->trsvcid, ' '); 6658 if (space) { 6659 *space = 0; 6660 } 6661 space = strchr(trid->subnqn, ' '); 6662 if (space) { 6663 *space = 0; 6664 } 6665 } 6666 6667 static void 6668 _stop_discovery(void *_ctx) 6669 { 6670 struct discovery_ctx *ctx = _ctx; 6671 6672 if (ctx->attach_in_progress > 0) { 6673 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6674 return; 6675 } 6676 6677 ctx->stop = true; 6678 6679 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6680 struct discovery_entry_ctx *entry_ctx; 6681 struct nvme_path_id path = {}; 6682 6683 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6684 path.trid = entry_ctx->trid; 6685 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6686 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6687 free(entry_ctx); 6688 } 6689 6690 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6691 struct discovery_entry_ctx *entry_ctx; 6692 6693 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6694 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6695 free(entry_ctx); 6696 } 6697 6698 free(ctx->entry_ctx_in_use); 6699 ctx->entry_ctx_in_use = NULL; 6700 } 6701 6702 static void 6703 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6704 { 6705 ctx->stop_cb_fn = cb_fn; 6706 ctx->cb_ctx = cb_ctx; 6707 6708 if (ctx->attach_in_progress > 0) { 6709 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6710 ctx->attach_in_progress); 6711 } 6712 6713 _stop_discovery(ctx); 6714 } 6715 6716 static void 6717 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6718 { 6719 struct discovery_ctx *d_ctx; 6720 struct nvme_path_id *path_id; 6721 struct spdk_nvme_transport_id 
trid = {}; 6722 struct discovery_entry_ctx *entry_ctx, *tmp; 6723 6724 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6725 6726 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6727 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6728 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6729 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6730 continue; 6731 } 6732 6733 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6734 free(entry_ctx); 6735 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6736 trid.subnqn, trid.traddr, trid.trsvcid); 6737 6738 /* Fail discovery ctrlr to force reattach attempt */ 6739 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6740 } 6741 } 6742 } 6743 6744 static void 6745 discovery_remove_controllers(struct discovery_ctx *ctx) 6746 { 6747 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6748 struct discovery_entry_ctx *entry_ctx, *tmp; 6749 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6750 struct spdk_nvme_transport_id old_trid = {}; 6751 uint64_t numrec, i; 6752 bool found; 6753 6754 numrec = from_le64(&log_page->numrec); 6755 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6756 found = false; 6757 old_entry = &entry_ctx->entry; 6758 build_trid_from_log_page_entry(&old_trid, old_entry); 6759 for (i = 0; i < numrec; i++) { 6760 new_entry = &log_page->entries[i]; 6761 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6762 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6763 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6764 found = true; 6765 break; 6766 } 6767 } 6768 if (!found) { 6769 struct nvme_path_id path = {}; 6770 6771 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6772 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6773 6774 path.trid = entry_ctx->trid; 6775 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6776 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6777 free(entry_ctx); 6778 } 6779 } 6780 free(log_page); 6781 ctx->log_page = NULL; 6782 discovery_complete(ctx); 6783 } 6784 6785 static void 6786 complete_discovery_start(struct discovery_ctx *ctx, int status) 6787 { 6788 ctx->timeout_ticks = 0; 6789 ctx->rc = status; 6790 if (ctx->start_cb_fn) { 6791 ctx->start_cb_fn(ctx->cb_ctx, status); 6792 ctx->start_cb_fn = NULL; 6793 ctx->cb_ctx = NULL; 6794 } 6795 } 6796 6797 static void 6798 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6799 { 6800 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6801 struct discovery_ctx *ctx = entry_ctx->ctx; 6802 6803 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6804 ctx->attach_in_progress--; 6805 if (ctx->attach_in_progress == 0) { 6806 complete_discovery_start(ctx, ctx->rc); 6807 if (ctx->initializing && ctx->rc != 0) { 6808 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6809 stop_discovery(ctx, NULL, ctx->cb_ctx); 6810 } else { 6811 discovery_remove_controllers(ctx); 6812 } 6813 } 6814 } 6815 6816 static struct discovery_entry_ctx * 6817 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6818 { 6819 struct discovery_entry_ctx *new_ctx; 6820 6821 new_ctx = calloc(1, sizeof(*new_ctx)); 6822 if (new_ctx == NULL) { 6823 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6824 return NULL; 6825 } 6826 6827 new_ctx->ctx = ctx; 6828 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6829 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
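/* Entries created for a discovery service inherit the parent context's hostnqn,
 * so every controller attached through this service presents the same host identity. */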
6830 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6831 return new_ctx; 6832 } 6833 6834 static void 6835 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6836 struct spdk_nvmf_discovery_log_page *log_page) 6837 { 6838 struct discovery_ctx *ctx = cb_arg; 6839 struct discovery_entry_ctx *entry_ctx, *tmp; 6840 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6841 uint64_t numrec, i; 6842 bool found; 6843 6844 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6845 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6846 return; 6847 } 6848 6849 ctx->log_page = log_page; 6850 assert(ctx->attach_in_progress == 0); 6851 numrec = from_le64(&log_page->numrec); 6852 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6853 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6854 free(entry_ctx); 6855 } 6856 for (i = 0; i < numrec; i++) { 6857 found = false; 6858 new_entry = &log_page->entries[i]; 6859 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6860 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6861 struct discovery_entry_ctx *new_ctx; 6862 struct spdk_nvme_transport_id trid = {}; 6863 6864 build_trid_from_log_page_entry(&trid, new_entry); 6865 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6866 if (new_ctx == NULL) { 6867 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6868 break; 6869 } 6870 6871 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6872 continue; 6873 } 6874 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6875 old_entry = &entry_ctx->entry; 6876 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6877 found = true; 6878 break; 6879 } 6880 } 6881 if (!found) { 6882 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6883 struct discovery_ctx *d_ctx; 6884 6885 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6886 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6887 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6888 sizeof(new_entry->subnqn))) { 6889 break; 6890 } 6891 } 6892 if (subnqn_ctx) { 6893 break; 6894 } 6895 } 6896 6897 new_ctx = calloc(1, sizeof(*new_ctx)); 6898 if (new_ctx == NULL) { 6899 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6900 break; 6901 } 6902 6903 new_ctx->ctx = ctx; 6904 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6905 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6906 if (subnqn_ctx) { 6907 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6908 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6909 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6910 new_ctx->name); 6911 } else { 6912 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6913 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6914 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6915 new_ctx->name); 6916 } 6917 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6918 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6919 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6920 discovery_attach_controller_done, new_ctx, 6921 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6922 if (rc == 0) { 6923 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6924 ctx->attach_in_progress++; 6925 } else { 6926 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6927 } 6928 } 6929 } 6930 6931 if (ctx->attach_in_progress == 0) { 6932 discovery_remove_controllers(ctx); 6933 } 6934 } 6935 6936 static void 6937 get_discovery_log_page(struct discovery_ctx *ctx) 6938 { 6939 int rc; 6940 6941 assert(ctx->in_progress == false); 6942 ctx->in_progress = true; 6943 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6944 if (rc != 0) { 6945 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6946 } 6947 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6948 } 6949 6950 static void 6951 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6952 { 6953 struct discovery_ctx *ctx = arg; 6954 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6955 6956 if (spdk_nvme_cpl_is_error(cpl)) { 6957 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6958 return; 6959 } 6960 6961 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6962 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6963 return; 6964 } 6965 6966 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6967 if (ctx->in_progress) { 6968 ctx->pending = true; 6969 return; 6970 } 6971 6972 get_discovery_log_page(ctx); 6973 } 6974 6975 static void 6976 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6977 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6978 { 6979 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6980 struct discovery_ctx *ctx; 6981 6982 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6983 6984 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6985 ctx->probe_ctx = NULL; 6986 ctx->ctrlr = ctrlr; 6987 6988 if (ctx->rc != 0) { 6989 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6990 ctx->rc); 6991 return; 6992 } 6993 6994 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6995 } 6996 6997 static int 6998 discovery_poller(void *arg) 6999 { 7000 struct discovery_ctx *ctx = arg; 7001 struct spdk_nvme_transport_id *trid; 7002 int rc; 7003 7004 if (ctx->detach_ctx) { 7005 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7006 if (rc != -EAGAIN) { 7007 ctx->detach_ctx = NULL; 7008 ctx->ctrlr = NULL; 7009 } 7010 } else if (ctx->stop) { 7011 if (ctx->ctrlr != NULL) { 7012 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7013 if (rc == 0) { 7014 return SPDK_POLLER_BUSY; 7015 } 7016 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7017 } 7018 spdk_poller_unregister(&ctx->poller); 7019 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7020 assert(ctx->start_cb_fn == NULL); 7021 if (ctx->stop_cb_fn != NULL) { 7022 ctx->stop_cb_fn(ctx->cb_ctx); 7023 } 7024 free_discovery_ctx(ctx); 7025 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7026 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7027 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7028 assert(ctx->initializing); 7029 spdk_poller_unregister(&ctx->poller); 7030 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7031 complete_discovery_start(ctx, -ETIMEDOUT); 7032 stop_discovery(ctx, NULL, NULL); 7033 free_discovery_ctx(ctx); 7034 return SPDK_POLLER_BUSY; 7035 } 7036 7037 assert(ctx->entry_ctx_in_use == NULL); 7038 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7039 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7040 trid = &ctx->entry_ctx_in_use->trid; 7041 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7042 if 
(ctx->probe_ctx) { 7043 spdk_poller_unregister(&ctx->poller); 7044 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7045 } else { 7046 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7047 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7048 ctx->entry_ctx_in_use = NULL; 7049 } 7050 } else if (ctx->probe_ctx) { 7051 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7052 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7053 complete_discovery_start(ctx, -ETIMEDOUT); 7054 return SPDK_POLLER_BUSY; 7055 } 7056 7057 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7058 if (rc != -EAGAIN) { 7059 if (ctx->rc != 0) { 7060 assert(ctx->initializing); 7061 stop_discovery(ctx, NULL, ctx->cb_ctx); 7062 } else { 7063 assert(rc == 0); 7064 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7065 ctx->rc = rc; 7066 get_discovery_log_page(ctx); 7067 } 7068 } 7069 } else { 7070 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7071 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7072 complete_discovery_start(ctx, -ETIMEDOUT); 7073 /* We need to wait until all NVM ctrlrs are attached before we stop the 7074 * discovery service to make sure we don't detach a ctrlr that is still 7075 * being attached. 7076 */ 7077 if (ctx->attach_in_progress == 0) { 7078 stop_discovery(ctx, NULL, ctx->cb_ctx); 7079 return SPDK_POLLER_BUSY; 7080 } 7081 } 7082 7083 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7084 if (rc < 0) { 7085 spdk_poller_unregister(&ctx->poller); 7086 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7087 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7088 ctx->entry_ctx_in_use = NULL; 7089 7090 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7091 if (rc != 0) { 7092 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7093 ctx->ctrlr = NULL; 7094 } 7095 } 7096 } 7097 7098 return SPDK_POLLER_BUSY; 7099 } 7100 7101 static void 7102 start_discovery_poller(void *arg) 7103 { 7104 struct discovery_ctx *ctx = arg; 7105 7106 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7107 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7108 } 7109 7110 int 7111 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7112 const char *base_name, 7113 struct spdk_nvme_ctrlr_opts *drv_opts, 7114 struct nvme_ctrlr_opts *bdev_opts, 7115 uint64_t attach_timeout, 7116 bool from_mdns, 7117 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7118 { 7119 struct discovery_ctx *ctx; 7120 struct discovery_entry_ctx *discovery_entry_ctx; 7121 7122 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7123 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7124 if (strcmp(ctx->name, base_name) == 0) { 7125 return -EEXIST; 7126 } 7127 7128 if (ctx->entry_ctx_in_use != NULL) { 7129 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7130 return -EEXIST; 7131 } 7132 } 7133 7134 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7135 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7136 return -EEXIST; 7137 } 7138 } 7139 } 7140 7141 ctx = calloc(1, sizeof(*ctx)); 7142 if (ctx == NULL) { 7143 return -ENOMEM; 7144 } 7145 7146 ctx->name = strdup(base_name); 7147 if (ctx->name == NULL) { 7148 free_discovery_ctx(ctx); 7149 return -ENOMEM; 7150 } 7151 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
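/* Both option structs are copied by value, so the caller's copies do not need to
 * outlive this call. */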
7152 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7153 ctx->from_mdns_discovery_service = from_mdns; 7154 ctx->bdev_opts.from_discovery_service = true; 7155 ctx->calling_thread = spdk_get_thread(); 7156 ctx->start_cb_fn = cb_fn; 7157 ctx->cb_ctx = cb_ctx; 7158 ctx->initializing = true; 7159 if (ctx->start_cb_fn) { 7160 /* We can use this when dumping json to denote if this RPC parameter 7161 * was specified or not. 7162 */ 7163 ctx->wait_for_attach = true; 7164 } 7165 if (attach_timeout != 0) { 7166 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7167 spdk_get_ticks_hz() / 1000ull; 7168 } 7169 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7170 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7171 memcpy(&ctx->trid, trid, sizeof(*trid)); 7172 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7173 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7174 if (ctx->hostnqn == NULL) { 7175 free_discovery_ctx(ctx); 7176 return -ENOMEM; 7177 } 7178 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7179 if (discovery_entry_ctx == NULL) { 7180 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7181 free_discovery_ctx(ctx); 7182 return -ENOMEM; 7183 } 7184 7185 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7186 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7187 return 0; 7188 } 7189 7190 int 7191 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7192 { 7193 struct discovery_ctx *ctx; 7194 7195 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7196 if (strcmp(name, ctx->name) == 0) { 7197 if (ctx->stop) { 7198 return -EALREADY; 7199 } 7200 /* If we're still starting the discovery service and ->rc is non-zero, we're 7201 * going to stop it as soon as we can 7202 */ 7203 if (ctx->initializing && ctx->rc != 0) { 7204 return -EALREADY; 7205 } 7206 stop_discovery(ctx, cb_fn, cb_ctx); 7207 return 0; 7208 } 7209 } 7210 7211 return -ENOENT; 7212 } 7213 7214 static int 7215 bdev_nvme_library_init(void) 7216 { 7217 g_bdev_nvme_init_thread = spdk_get_thread(); 7218 7219 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7220 bdev_nvme_destroy_poll_group_cb, 7221 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7222 7223 return 0; 7224 } 7225 7226 static void 7227 bdev_nvme_fini_destruct_ctrlrs(void) 7228 { 7229 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7230 struct nvme_ctrlr *nvme_ctrlr; 7231 7232 pthread_mutex_lock(&g_bdev_nvme_mutex); 7233 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7234 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7235 pthread_mutex_lock(&nvme_ctrlr->mutex); 7236 if (nvme_ctrlr->destruct) { 7237 /* This controller's destruction was already started 7238 * before the application started shutting down 7239 */ 7240 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7241 continue; 7242 } 7243 nvme_ctrlr->destruct = true; 7244 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7245 7246 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7247 nvme_ctrlr); 7248 } 7249 } 7250 7251 g_bdev_nvme_module_finish = true; 7252 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7253 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7254 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7255 spdk_bdev_module_fini_done(); 7256 return; 7257 } 7258 7259 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7260 } 7261 7262 static void 7263 check_discovery_fini(void *arg) 7264 { 7265 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7266 bdev_nvme_fini_destruct_ctrlrs(); 
7267 } 7268 } 7269 7270 static void 7271 bdev_nvme_library_fini(void) 7272 { 7273 struct nvme_probe_skip_entry *entry, *entry_tmp; 7274 struct discovery_ctx *ctx; 7275 7276 spdk_poller_unregister(&g_hotplug_poller); 7277 free(g_hotplug_probe_ctx); 7278 g_hotplug_probe_ctx = NULL; 7279 7280 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7281 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7282 free(entry); 7283 } 7284 7285 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7286 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7287 bdev_nvme_fini_destruct_ctrlrs(); 7288 } else { 7289 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7290 stop_discovery(ctx, check_discovery_fini, NULL); 7291 } 7292 } 7293 } 7294 7295 static void 7296 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7297 { 7298 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7299 struct spdk_bdev *bdev = bdev_io->bdev; 7300 struct spdk_dif_ctx dif_ctx; 7301 struct spdk_dif_error err_blk = {}; 7302 int rc; 7303 struct spdk_dif_ctx_init_ext_opts dif_opts; 7304 7305 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7306 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7307 rc = spdk_dif_ctx_init(&dif_ctx, 7308 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7309 bdev->dif_is_head_of_md, bdev->dif_type, 7310 bdev_io->u.bdev.dif_check_flags, 7311 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7312 if (rc != 0) { 7313 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7314 return; 7315 } 7316 7317 if (bdev->md_interleave) { 7318 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7319 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7320 } else { 7321 struct iovec md_iov = { 7322 .iov_base = bdev_io->u.bdev.md_buf, 7323 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7324 }; 7325 7326 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7327 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7328 } 7329 7330 if (rc != 0) { 7331 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7332 err_blk.err_type, err_blk.err_offset); 7333 } else { 7334 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7335 } 7336 } 7337 7338 static void 7339 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7340 { 7341 struct nvme_bdev_io *bio = ref; 7342 7343 if (spdk_nvme_cpl_is_success(cpl)) { 7344 /* Run PI verification for read data buffer. */ 7345 bdev_nvme_verify_pi_error(bio); 7346 } 7347 7348 /* Return original completion status */ 7349 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7350 } 7351 7352 static void 7353 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7354 { 7355 struct nvme_bdev_io *bio = ref; 7356 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7357 int ret; 7358 7359 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7360 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7361 cpl->status.sct, cpl->status.sc); 7362 7363 /* Save completion status to use after verifying PI error. */ 7364 bio->cpl = *cpl; 7365 7366 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7367 /* Read without PI checking to verify PI error. 
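 * The failed completion status was saved in bio->cpl above; once the verification
 * read finishes, bdev_nvme_no_pi_readv_done() completes the I/O with that saved status.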
*/ 7368 ret = bdev_nvme_no_pi_readv(bio, 7369 bdev_io->u.bdev.iovs, 7370 bdev_io->u.bdev.iovcnt, 7371 bdev_io->u.bdev.md_buf, 7372 bdev_io->u.bdev.num_blocks, 7373 bdev_io->u.bdev.offset_blocks); 7374 if (ret == 0) { 7375 return; 7376 } 7377 } 7378 } 7379 7380 bdev_nvme_io_complete_nvme_status(bio, cpl); 7381 } 7382 7383 static void 7384 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7385 { 7386 struct nvme_bdev_io *bio = ref; 7387 7388 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7389 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7390 cpl->status.sct, cpl->status.sc); 7391 /* Run PI verification for write data buffer if PI error is detected. */ 7392 bdev_nvme_verify_pi_error(bio); 7393 } 7394 7395 bdev_nvme_io_complete_nvme_status(bio, cpl); 7396 } 7397 7398 static void 7399 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7400 { 7401 struct nvme_bdev_io *bio = ref; 7402 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7403 7404 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7405 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7406 */ 7407 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7408 7409 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7410 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7411 cpl->status.sct, cpl->status.sc); 7412 /* Run PI verification for zone append data buffer if PI error is detected. */ 7413 bdev_nvme_verify_pi_error(bio); 7414 } 7415 7416 bdev_nvme_io_complete_nvme_status(bio, cpl); 7417 } 7418 7419 static void 7420 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7421 { 7422 struct nvme_bdev_io *bio = ref; 7423 7424 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7425 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7426 cpl->status.sct, cpl->status.sc); 7427 /* Run PI verification for compare data buffer if PI error is detected. */ 7428 bdev_nvme_verify_pi_error(bio); 7429 } 7430 7431 bdev_nvme_io_complete_nvme_status(bio, cpl); 7432 } 7433 7434 static void 7435 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7436 { 7437 struct nvme_bdev_io *bio = ref; 7438 7439 /* Compare operation completion */ 7440 if (!bio->first_fused_completed) { 7441 /* Save compare result for write callback */ 7442 bio->cpl = *cpl; 7443 bio->first_fused_completed = true; 7444 return; 7445 } 7446 7447 /* Write operation completion */ 7448 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7449 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7450 * complete the IO with the compare operation's status. 
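 * For example, a miscompare detected by the compare half must be reported to the
 * caller even if the fused write half completes successfully.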
7451 */ 7452 if (!spdk_nvme_cpl_is_error(cpl)) { 7453 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7454 } 7455 7456 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7457 } else { 7458 bdev_nvme_io_complete_nvme_status(bio, cpl); 7459 } 7460 } 7461 7462 static void 7463 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7464 { 7465 struct nvme_bdev_io *bio = ref; 7466 7467 bdev_nvme_io_complete_nvme_status(bio, cpl); 7468 } 7469 7470 static int 7471 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7472 { 7473 switch (desc->zt) { 7474 case SPDK_NVME_ZONE_TYPE_SEQWR: 7475 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7476 break; 7477 default: 7478 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7479 return -EIO; 7480 } 7481 7482 switch (desc->zs) { 7483 case SPDK_NVME_ZONE_STATE_EMPTY: 7484 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7485 break; 7486 case SPDK_NVME_ZONE_STATE_IOPEN: 7487 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7488 break; 7489 case SPDK_NVME_ZONE_STATE_EOPEN: 7490 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7491 break; 7492 case SPDK_NVME_ZONE_STATE_CLOSED: 7493 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7494 break; 7495 case SPDK_NVME_ZONE_STATE_RONLY: 7496 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7497 break; 7498 case SPDK_NVME_ZONE_STATE_FULL: 7499 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7500 break; 7501 case SPDK_NVME_ZONE_STATE_OFFLINE: 7502 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7503 break; 7504 default: 7505 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7506 return -EIO; 7507 } 7508 7509 info->zone_id = desc->zslba; 7510 info->write_pointer = desc->wp; 7511 info->capacity = desc->zcap; 7512 7513 return 0; 7514 } 7515 7516 static void 7517 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7518 { 7519 struct nvme_bdev_io *bio = ref; 7520 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7521 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7522 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7523 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7524 uint64_t max_zones_per_buf, i; 7525 uint32_t zone_report_bufsize; 7526 struct spdk_nvme_ns *ns; 7527 struct spdk_nvme_qpair *qpair; 7528 int ret; 7529 7530 if (spdk_nvme_cpl_is_error(cpl)) { 7531 goto out_complete_io_nvme_cpl; 7532 } 7533 7534 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7535 ret = -ENXIO; 7536 goto out_complete_io_ret; 7537 } 7538 7539 ns = bio->io_path->nvme_ns->ns; 7540 qpair = bio->io_path->qpair->qpair; 7541 7542 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7543 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7544 sizeof(bio->zone_report_buf->descs[0]); 7545 7546 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7547 ret = -EINVAL; 7548 goto out_complete_io_ret; 7549 } 7550 7551 if (!bio->zone_report_buf->nr_zones) { 7552 ret = -EINVAL; 7553 goto out_complete_io_ret; 7554 } 7555 7556 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7557 ret = fill_zone_from_report(&info[bio->handled_zones], 7558 &bio->zone_report_buf->descs[i]); 7559 if (ret) { 7560 goto out_complete_io_ret; 7561 } 7562 bio->handled_zones++; 7563 } 7564 7565 if (bio->handled_zones < zones_to_copy) { 7566 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7567 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7568 
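/* Not all requested zones fit in a single report buffer. Reuse the buffer and issue
 * another Report Zones command starting at the first zone that has not been copied yet. */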
7569 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7570 ret = spdk_nvme_zns_report_zones(ns, qpair, 7571 bio->zone_report_buf, zone_report_bufsize, 7572 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7573 bdev_nvme_get_zone_info_done, bio); 7574 if (!ret) { 7575 return; 7576 } else { 7577 goto out_complete_io_ret; 7578 } 7579 } 7580 7581 out_complete_io_nvme_cpl: 7582 free(bio->zone_report_buf); 7583 bio->zone_report_buf = NULL; 7584 bdev_nvme_io_complete_nvme_status(bio, cpl); 7585 return; 7586 7587 out_complete_io_ret: 7588 free(bio->zone_report_buf); 7589 bio->zone_report_buf = NULL; 7590 bdev_nvme_io_complete(bio, ret); 7591 } 7592 7593 static void 7594 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7595 { 7596 struct nvme_bdev_io *bio = ref; 7597 7598 bdev_nvme_io_complete_nvme_status(bio, cpl); 7599 } 7600 7601 static void 7602 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7603 { 7604 struct nvme_bdev_io *bio = ctx; 7605 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7606 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7607 7608 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7609 7610 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7611 } 7612 7613 static void 7614 bdev_nvme_abort_complete(void *ctx) 7615 { 7616 struct nvme_bdev_io *bio = ctx; 7617 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7618 7619 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7620 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7621 } else { 7622 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7623 } 7624 } 7625 7626 static void 7627 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7628 { 7629 struct nvme_bdev_io *bio = ref; 7630 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7631 7632 bio->cpl = *cpl; 7633 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7634 } 7635 7636 static void 7637 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7638 { 7639 struct nvme_bdev_io *bio = ref; 7640 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7641 7642 bio->cpl = *cpl; 7643 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7644 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7645 } 7646 7647 static void 7648 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7649 { 7650 struct nvme_bdev_io *bio = ref; 7651 struct iovec *iov; 7652 7653 bio->iov_offset = sgl_offset; 7654 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7655 iov = &bio->iovs[bio->iovpos]; 7656 if (bio->iov_offset < iov->iov_len) { 7657 break; 7658 } 7659 7660 bio->iov_offset -= iov->iov_len; 7661 } 7662 } 7663 7664 static int 7665 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7666 { 7667 struct nvme_bdev_io *bio = ref; 7668 struct iovec *iov; 7669 7670 assert(bio->iovpos < bio->iovcnt); 7671 7672 iov = &bio->iovs[bio->iovpos]; 7673 7674 *address = iov->iov_base; 7675 *length = iov->iov_len; 7676 7677 if (bio->iov_offset) { 7678 assert(bio->iov_offset <= iov->iov_len); 7679 *address += bio->iov_offset; 7680 *length -= bio->iov_offset; 7681 } 7682 7683 bio->iov_offset += *length; 7684 if (bio->iov_offset == iov->iov_len) { 7685 bio->iovpos++; 7686 bio->iov_offset = 0; 7687 } 7688 7689 return 0; 7690 } 7691 7692 static void 7693 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7694 { 7695 struct nvme_bdev_io *bio = ref; 7696 struct iovec *iov; 7697 7698 bio->fused_iov_offset = sgl_offset; 
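/* Walk the fused (write) iovec array until the iovec containing sgl_offset is found;
 * this mirrors bdev_nvme_queued_reset_sgl() for the write half of a fused
 * compare-and-write command. */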
7699 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7700 iov = &bio->fused_iovs[bio->fused_iovpos]; 7701 if (bio->fused_iov_offset < iov->iov_len) { 7702 break; 7703 } 7704 7705 bio->fused_iov_offset -= iov->iov_len; 7706 } 7707 } 7708 7709 static int 7710 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7711 { 7712 struct nvme_bdev_io *bio = ref; 7713 struct iovec *iov; 7714 7715 assert(bio->fused_iovpos < bio->fused_iovcnt); 7716 7717 iov = &bio->fused_iovs[bio->fused_iovpos]; 7718 7719 *address = iov->iov_base; 7720 *length = iov->iov_len; 7721 7722 if (bio->fused_iov_offset) { 7723 assert(bio->fused_iov_offset <= iov->iov_len); 7724 *address += bio->fused_iov_offset; 7725 *length -= bio->fused_iov_offset; 7726 } 7727 7728 bio->fused_iov_offset += *length; 7729 if (bio->fused_iov_offset == iov->iov_len) { 7730 bio->fused_iovpos++; 7731 bio->fused_iov_offset = 0; 7732 } 7733 7734 return 0; 7735 } 7736 7737 static int 7738 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7739 void *md, uint64_t lba_count, uint64_t lba) 7740 { 7741 int rc; 7742 7743 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7744 lba_count, lba); 7745 7746 bio->iovs = iov; 7747 bio->iovcnt = iovcnt; 7748 bio->iovpos = 0; 7749 bio->iov_offset = 0; 7750 7751 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7752 bio->io_path->qpair->qpair, 7753 lba, lba_count, 7754 bdev_nvme_no_pi_readv_done, bio, 0, 7755 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7756 md, 0, 0); 7757 7758 if (rc != 0 && rc != -ENOMEM) { 7759 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7760 } 7761 return rc; 7762 } 7763 7764 static int 7765 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7766 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7767 struct spdk_memory_domain *domain, void *domain_ctx, 7768 struct spdk_accel_sequence *seq) 7769 { 7770 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7771 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7772 int rc; 7773 7774 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7775 lba_count, lba); 7776 7777 bio->iovs = iov; 7778 bio->iovcnt = iovcnt; 7779 bio->iovpos = 0; 7780 bio->iov_offset = 0; 7781 7782 if (domain != NULL || seq != NULL) { 7783 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7784 bio->ext_opts.memory_domain = domain; 7785 bio->ext_opts.memory_domain_ctx = domain_ctx; 7786 bio->ext_opts.io_flags = flags; 7787 bio->ext_opts.metadata = md; 7788 bio->ext_opts.accel_sequence = seq; 7789 7790 if (iovcnt == 1) { 7791 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7792 bio, &bio->ext_opts); 7793 } else { 7794 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7795 bdev_nvme_readv_done, bio, 7796 bdev_nvme_queued_reset_sgl, 7797 bdev_nvme_queued_next_sge, 7798 &bio->ext_opts); 7799 } 7800 } else if (iovcnt == 1) { 7801 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7802 md, lba, lba_count, bdev_nvme_readv_done, 7803 bio, flags, 0, 0); 7804 } else { 7805 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7806 bdev_nvme_readv_done, bio, flags, 7807 bdev_nvme_queued_reset_sgl, 7808 bdev_nvme_queued_next_sge, md, 0, 0); 7809 } 7810 7811 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7812 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7813 } 7814 
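/* -ENOMEM is intentionally not logged above: it only means the qpair has no free
 * request objects at the moment and the submission can be retried. */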
return rc; 7815 } 7816 7817 static int 7818 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7819 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7820 struct spdk_memory_domain *domain, void *domain_ctx, 7821 struct spdk_accel_sequence *seq, 7822 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 7823 { 7824 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7825 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7826 int rc; 7827 7828 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7829 lba_count, lba); 7830 7831 bio->iovs = iov; 7832 bio->iovcnt = iovcnt; 7833 bio->iovpos = 0; 7834 bio->iov_offset = 0; 7835 7836 if (domain != NULL || seq != NULL) { 7837 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7838 bio->ext_opts.memory_domain = domain; 7839 bio->ext_opts.memory_domain_ctx = domain_ctx; 7840 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 7841 bio->ext_opts.cdw13 = cdw13.raw; 7842 bio->ext_opts.metadata = md; 7843 bio->ext_opts.accel_sequence = seq; 7844 7845 if (iovcnt == 1) { 7846 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7847 bio, &bio->ext_opts); 7848 } else { 7849 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7850 bdev_nvme_writev_done, bio, 7851 bdev_nvme_queued_reset_sgl, 7852 bdev_nvme_queued_next_sge, 7853 &bio->ext_opts); 7854 } 7855 } else if (iovcnt == 1) { 7856 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7857 md, lba, lba_count, bdev_nvme_writev_done, 7858 bio, flags, 0, 0); 7859 } else { 7860 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7861 bdev_nvme_writev_done, bio, flags, 7862 bdev_nvme_queued_reset_sgl, 7863 bdev_nvme_queued_next_sge, md, 0, 0); 7864 } 7865 7866 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7867 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7868 } 7869 return rc; 7870 } 7871 7872 static int 7873 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7874 void *md, uint64_t lba_count, uint64_t zslba, 7875 uint32_t flags) 7876 { 7877 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7878 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7879 int rc; 7880 7881 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7882 lba_count, zslba); 7883 7884 bio->iovs = iov; 7885 bio->iovcnt = iovcnt; 7886 bio->iovpos = 0; 7887 bio->iov_offset = 0; 7888 7889 if (iovcnt == 1) { 7890 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7891 lba_count, 7892 bdev_nvme_zone_appendv_done, bio, 7893 flags, 7894 0, 0); 7895 } else { 7896 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7897 bdev_nvme_zone_appendv_done, bio, flags, 7898 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7899 md, 0, 0); 7900 } 7901 7902 if (rc != 0 && rc != -ENOMEM) { 7903 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7904 } 7905 return rc; 7906 } 7907 7908 static int 7909 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7910 void *md, uint64_t lba_count, uint64_t lba, 7911 uint32_t flags) 7912 { 7913 int rc; 7914 7915 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7916 lba_count, lba); 7917 7918 bio->iovs = iov; 7919 bio->iovcnt = iovcnt; 7920 bio->iovpos = 0; 7921 bio->iov_offset = 0; 7922 7923 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7924 bio->io_path->qpair->qpair, 7925 lba, lba_count, 7926 bdev_nvme_comparev_done, bio, flags, 7927 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7928 md, 0, 0); 7929 7930 if (rc != 0 && rc != -ENOMEM) { 7931 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7932 } 7933 return rc; 7934 } 7935 7936 static int 7937 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7938 struct iovec *write_iov, int write_iovcnt, 7939 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7940 { 7941 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7942 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7943 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7944 int rc; 7945 7946 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7947 lba_count, lba); 7948 7949 bio->iovs = cmp_iov; 7950 bio->iovcnt = cmp_iovcnt; 7951 bio->iovpos = 0; 7952 bio->iov_offset = 0; 7953 bio->fused_iovs = write_iov; 7954 bio->fused_iovcnt = write_iovcnt; 7955 bio->fused_iovpos = 0; 7956 bio->fused_iov_offset = 0; 7957 7958 if (bdev_io->num_retries == 0) { 7959 bio->first_fused_submitted = false; 7960 bio->first_fused_completed = false; 7961 } 7962 7963 if (!bio->first_fused_submitted) { 7964 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7965 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7966 7967 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7968 bdev_nvme_comparev_and_writev_done, bio, flags, 7969 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7970 if (rc == 0) { 7971 bio->first_fused_submitted = true; 7972 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7973 } else { 7974 if (rc != -ENOMEM) { 7975 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7976 } 7977 return rc; 7978 } 7979 } 7980 7981 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7982 7983 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7984 bdev_nvme_comparev_and_writev_done, bio, flags, 7985 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7986 if (rc != 0 && rc != -ENOMEM) { 7987 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7988 rc = 0; 7989 } 7990 7991 return rc; 7992 } 7993 7994 static int 7995 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7996 { 7997 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7998 struct spdk_nvme_dsm_range *range; 7999 uint64_t offset, remaining; 8000 uint64_t num_ranges_u64; 8001 uint16_t num_ranges; 8002 int rc; 8003 8004 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8005 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8006 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8007 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8008 return -EINVAL; 8009 } 8010 num_ranges = (uint16_t)num_ranges_u64; 8011 8012 offset = offset_blocks; 8013 remaining = num_blocks; 8014 range = &dsm_ranges[0]; 8015 8016 /* Fill max-size ranges until the remaining blocks fit into one range */ 8017 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8018 range->attributes.raw = 0; 8019 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8020 range->starting_lba = offset; 8021 8022 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8023 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8024 range++; 8025 } 8026 8027 /* Final range describes the remaining 
blocks */ 8028 range->attributes.raw = 0; 8029 range->length = remaining; 8030 range->starting_lba = offset; 8031 8032 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8033 bio->io_path->qpair->qpair, 8034 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8035 dsm_ranges, num_ranges, 8036 bdev_nvme_queued_done, bio); 8037 8038 return rc; 8039 } 8040 8041 static int 8042 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8043 { 8044 if (num_blocks > UINT16_MAX + 1) { 8045 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8046 return -EINVAL; 8047 } 8048 8049 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8050 bio->io_path->qpair->qpair, 8051 offset_blocks, num_blocks, 8052 bdev_nvme_queued_done, bio, 8053 0); 8054 } 8055 8056 static int 8057 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8058 struct spdk_bdev_zone_info *info) 8059 { 8060 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8061 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8062 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8063 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8064 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8065 8066 if (zone_id % zone_size != 0) { 8067 return -EINVAL; 8068 } 8069 8070 if (num_zones > total_zones || !num_zones) { 8071 return -EINVAL; 8072 } 8073 8074 assert(!bio->zone_report_buf); 8075 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8076 if (!bio->zone_report_buf) { 8077 return -ENOMEM; 8078 } 8079 8080 bio->handled_zones = 0; 8081 8082 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8083 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8084 bdev_nvme_get_zone_info_done, bio); 8085 } 8086 8087 static int 8088 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8089 enum spdk_bdev_zone_action action) 8090 { 8091 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8092 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8093 8094 switch (action) { 8095 case SPDK_BDEV_ZONE_CLOSE: 8096 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8097 bdev_nvme_zone_management_done, bio); 8098 case SPDK_BDEV_ZONE_FINISH: 8099 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8100 bdev_nvme_zone_management_done, bio); 8101 case SPDK_BDEV_ZONE_OPEN: 8102 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8103 bdev_nvme_zone_management_done, bio); 8104 case SPDK_BDEV_ZONE_RESET: 8105 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8106 bdev_nvme_zone_management_done, bio); 8107 case SPDK_BDEV_ZONE_OFFLINE: 8108 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8109 bdev_nvme_zone_management_done, bio); 8110 default: 8111 return -EINVAL; 8112 } 8113 } 8114 8115 static void 8116 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8117 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8118 { 8119 struct nvme_io_path *io_path; 8120 struct nvme_ctrlr *nvme_ctrlr; 8121 uint32_t max_xfer_size; 8122 int rc = -ENXIO; 8123 8124 /* Choose the first ctrlr which is not failed. */ 8125 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8126 nvme_ctrlr = io_path->qpair->ctrlr; 8127 8128 /* We should skip any unavailable nvme_ctrlr rather than checking 8129 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
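* Skipping here lets the loop move on and try the command on the next
* io_path in the channel's list instead of failing the admin passthrough
* on the first unavailable controller.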
8130 */ 8131 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8132 continue; 8133 } 8134 8135 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8136 8137 if (nbytes > max_xfer_size) { 8138 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8139 rc = -EINVAL; 8140 goto err; 8141 } 8142 8143 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8144 bdev_nvme_admin_passthru_done, bio); 8145 if (rc == 0) { 8146 return; 8147 } 8148 } 8149 8150 err: 8151 bdev_nvme_admin_complete(bio, rc); 8152 } 8153 8154 static int 8155 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8156 void *buf, size_t nbytes) 8157 { 8158 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8159 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8160 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8161 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8162 8163 if (nbytes > max_xfer_size) { 8164 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8165 return -EINVAL; 8166 } 8167 8168 /* 8169 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8170 * so fill it out automatically. 8171 */ 8172 cmd->nsid = spdk_nvme_ns_get_id(ns); 8173 8174 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8175 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8176 } 8177 8178 static int 8179 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8180 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8181 { 8182 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8183 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8184 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8185 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8186 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8187 8188 if (nbytes > max_xfer_size) { 8189 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8190 return -EINVAL; 8191 } 8192 8193 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8194 SPDK_ERRLOG("invalid meta data buffer size\n"); 8195 return -EINVAL; 8196 } 8197 8198 /* 8199 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8200 * so fill it out automatically. 
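* Any nsid the caller may have filled in is overwritten with the nsid of
* the namespace that backs this bdev.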
8201 */ 8202 cmd->nsid = spdk_nvme_ns_get_id(ns); 8203 8204 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8205 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8206 } 8207 8208 static int 8209 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8210 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8211 size_t nbytes, void *md_buf, size_t md_len) 8212 { 8213 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8214 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8215 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8216 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8217 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8218 8219 bio->iovs = iov; 8220 bio->iovcnt = iovcnt; 8221 bio->iovpos = 0; 8222 bio->iov_offset = 0; 8223 8224 if (nbytes > max_xfer_size) { 8225 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8226 return -EINVAL; 8227 } 8228 8229 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8230 SPDK_ERRLOG("invalid meta data buffer size\n"); 8231 return -EINVAL; 8232 } 8233 8234 /* 8235 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8236 * require a nsid, so fill it out automatically. 8237 */ 8238 cmd->nsid = spdk_nvme_ns_get_id(ns); 8239 8240 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8241 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8242 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8243 } 8244 8245 static void 8246 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8247 struct nvme_bdev_io *bio_to_abort) 8248 { 8249 struct nvme_io_path *io_path; 8250 int rc = 0; 8251 8252 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8253 if (rc == 0) { 8254 bdev_nvme_admin_complete(bio, 0); 8255 return; 8256 } 8257 8258 io_path = bio_to_abort->io_path; 8259 if (io_path != NULL) { 8260 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8261 io_path->qpair->qpair, 8262 bio_to_abort, 8263 bdev_nvme_abort_done, bio); 8264 } else { 8265 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8266 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8267 NULL, 8268 bio_to_abort, 8269 bdev_nvme_abort_done, bio); 8270 8271 if (rc != -ENOENT) { 8272 break; 8273 } 8274 } 8275 } 8276 8277 if (rc != 0) { 8278 /* If no command was found or there was any error, complete the abort 8279 * request with failure. 
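* At this point rc is -ENOENT when the command to abort was not found on
* any path, or the error returned by the abort submission itself.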
8280 */ 8281 bdev_nvme_admin_complete(bio, rc); 8282 } 8283 } 8284 8285 static int 8286 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8287 uint64_t num_blocks) 8288 { 8289 struct spdk_nvme_scc_source_range range = { 8290 .slba = src_offset_blocks, 8291 .nlb = num_blocks - 1 8292 }; 8293 8294 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8295 bio->io_path->qpair->qpair, 8296 &range, 1, dst_offset_blocks, 8297 bdev_nvme_queued_done, bio); 8298 } 8299 8300 static void 8301 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8302 { 8303 const char *action; 8304 uint32_t i; 8305 8306 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8307 action = "reset"; 8308 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8309 action = "abort"; 8310 } else { 8311 action = "none"; 8312 } 8313 8314 spdk_json_write_object_begin(w); 8315 8316 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8317 8318 spdk_json_write_named_object_begin(w, "params"); 8319 spdk_json_write_named_string(w, "action_on_timeout", action); 8320 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8321 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8322 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8323 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8324 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8325 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8326 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8327 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8328 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8329 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8330 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8331 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8332 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8333 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8334 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8335 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8336 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8337 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8338 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8339 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8340 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8341 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8342 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8343 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8344 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8345 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8346 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8347 for (i = 0; i < 32; ++i) { 8348 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8349 
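/* The bit index is the SPDK_NVMF_DHCHAP_HASH_* value itself, so each set
 * bit is dumped as its printable digest name; replaying the generated
 * config through bdev_nvme_set_options restores the same mask. With the
 * default mask this writes the three SHA-2 digest names.
 */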
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8350 } 8351 } 8352 spdk_json_write_array_end(w); 8353 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8354 for (i = 0; i < 32; ++i) { 8355 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8356 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8357 } 8358 } 8359 8360 spdk_json_write_array_end(w); 8361 spdk_json_write_object_end(w); 8362 8363 spdk_json_write_object_end(w); 8364 } 8365 8366 static void 8367 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8368 { 8369 struct spdk_nvme_transport_id trid; 8370 8371 spdk_json_write_object_begin(w); 8372 8373 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8374 8375 spdk_json_write_named_object_begin(w, "params"); 8376 spdk_json_write_named_string(w, "name", ctx->name); 8377 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8378 8379 trid = ctx->trid; 8380 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8381 nvme_bdev_dump_trid_json(&trid, w); 8382 8383 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8384 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8385 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8386 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8387 ctx->bdev_opts.fast_io_fail_timeout_sec); 8388 spdk_json_write_object_end(w); 8389 8390 spdk_json_write_object_end(w); 8391 } 8392 8393 #ifdef SPDK_CONFIG_NVME_CUSE 8394 static void 8395 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8396 struct nvme_ctrlr *nvme_ctrlr) 8397 { 8398 size_t cuse_name_size = 128; 8399 char cuse_name[cuse_name_size]; 8400 8401 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8402 cuse_name, &cuse_name_size) != 0) { 8403 return; 8404 } 8405 8406 spdk_json_write_object_begin(w); 8407 8408 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8409 8410 spdk_json_write_named_object_begin(w, "params"); 8411 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8412 spdk_json_write_object_end(w); 8413 8414 spdk_json_write_object_end(w); 8415 } 8416 #endif 8417 8418 static void 8419 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8420 struct nvme_ctrlr *nvme_ctrlr) 8421 { 8422 struct spdk_nvme_transport_id *trid; 8423 const struct spdk_nvme_ctrlr_opts *opts; 8424 8425 if (nvme_ctrlr->opts.from_discovery_service) { 8426 /* Do not emit an RPC for this - it will be implicitly 8427 * covered by a separate bdev_nvme_start_discovery or 8428 * bdev_nvme_start_mdns_discovery RPC. 
8429 */ 8430 return; 8431 } 8432 8433 trid = &nvme_ctrlr->active_path_id->trid; 8434 8435 spdk_json_write_object_begin(w); 8436 8437 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8438 8439 spdk_json_write_named_object_begin(w, "params"); 8440 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8441 nvme_bdev_dump_trid_json(trid, w); 8442 spdk_json_write_named_bool(w, "prchk_reftag", 8443 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8444 spdk_json_write_named_bool(w, "prchk_guard", 8445 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8446 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8447 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8448 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8449 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8450 if (nvme_ctrlr->psk != NULL) { 8451 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8452 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8453 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8454 } 8455 8456 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8457 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8458 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8459 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8460 if (opts->src_addr[0] != '\0') { 8461 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8462 } 8463 if (opts->src_svcid[0] != '\0') { 8464 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8465 } 8466 8467 spdk_json_write_object_end(w); 8468 8469 spdk_json_write_object_end(w); 8470 } 8471 8472 static void 8473 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8474 { 8475 spdk_json_write_object_begin(w); 8476 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8477 8478 spdk_json_write_named_object_begin(w, "params"); 8479 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8480 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8481 spdk_json_write_object_end(w); 8482 8483 spdk_json_write_object_end(w); 8484 } 8485 8486 static int 8487 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8488 { 8489 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8490 struct nvme_ctrlr *nvme_ctrlr; 8491 struct discovery_ctx *ctx; 8492 8493 bdev_nvme_opts_config_json(w); 8494 8495 pthread_mutex_lock(&g_bdev_nvme_mutex); 8496 8497 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8498 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8499 nvme_ctrlr_config_json(w, nvme_ctrlr); 8500 8501 #ifdef SPDK_CONFIG_NVME_CUSE 8502 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8503 #endif 8504 } 8505 } 8506 8507 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8508 if (!ctx->from_mdns_discovery_service) { 8509 bdev_nvme_discovery_config_json(w, ctx); 8510 } 8511 } 8512 8513 bdev_nvme_mdns_discovery_config_json(w); 8514 8515 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8516 * before enabling hotplug poller. 
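* On config replay the RPCs execute in the order they are written, so
* emitting bdev_nvme_set_hotplug last means the explicit attach RPCs above
* are issued before the hotplug poller is enabled.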
8517 */ 8518 bdev_nvme_hotplug_config_json(w); 8519 8520 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8521 return 0; 8522 } 8523 8524 struct spdk_nvme_ctrlr * 8525 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8526 { 8527 struct nvme_bdev *nbdev; 8528 struct nvme_ns *nvme_ns; 8529 8530 if (!bdev || bdev->module != &nvme_if) { 8531 return NULL; 8532 } 8533 8534 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8535 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8536 assert(nvme_ns != NULL); 8537 8538 return nvme_ns->ctrlr->ctrlr; 8539 } 8540 8541 static bool 8542 nvme_io_path_is_current(struct nvme_io_path *io_path) 8543 { 8544 const struct nvme_bdev_channel *nbdev_ch; 8545 bool current; 8546 8547 if (!nvme_io_path_is_available(io_path)) { 8548 return false; 8549 } 8550 8551 nbdev_ch = io_path->nbdev_ch; 8552 if (nbdev_ch == NULL) { 8553 current = false; 8554 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8555 struct nvme_io_path *optimized_io_path = NULL; 8556 8557 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8558 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8559 break; 8560 } 8561 } 8562 8563 /* A non-optimized path is only current if there are no optimized paths. */ 8564 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 8565 (optimized_io_path == NULL); 8566 } else { 8567 if (nbdev_ch->current_io_path) { 8568 current = (io_path == nbdev_ch->current_io_path); 8569 } else { 8570 struct nvme_io_path *first_path; 8571 8572 /* We arrived here as there are no optimized paths for active-passive 8573 * mode. Check if this io_path is the first one available on the list. 8574 */ 8575 current = false; 8576 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 8577 if (nvme_io_path_is_available(first_path)) { 8578 current = (io_path == first_path); 8579 break; 8580 } 8581 } 8582 } 8583 } 8584 8585 return current; 8586 } 8587 8588 void 8589 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8590 { 8591 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8592 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8593 const struct spdk_nvme_ctrlr_data *cdata; 8594 const struct spdk_nvme_transport_id *trid; 8595 const char *adrfam_str; 8596 8597 spdk_json_write_object_begin(w); 8598 8599 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8600 8601 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8602 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8603 8604 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8605 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 8606 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8607 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8608 8609 spdk_json_write_named_object_begin(w, "transport"); 8610 spdk_json_write_named_string(w, "trtype", trid->trstring); 8611 spdk_json_write_named_string(w, "traddr", trid->traddr); 8612 if (trid->trsvcid[0] != '\0') { 8613 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8614 } 8615 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8616 if (adrfam_str) { 8617 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8618 } 8619 spdk_json_write_object_end(w); 8620 8621 spdk_json_write_object_end(w); 8622 } 8623 8624 void 8625 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8626 { 8627 struct discovery_ctx *ctx; 8628 struct 
discovery_entry_ctx *entry_ctx; 8629 8630 spdk_json_write_array_begin(w); 8631 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8632 spdk_json_write_object_begin(w); 8633 spdk_json_write_named_string(w, "name", ctx->name); 8634 8635 spdk_json_write_named_object_begin(w, "trid"); 8636 nvme_bdev_dump_trid_json(&ctx->trid, w); 8637 spdk_json_write_object_end(w); 8638 8639 spdk_json_write_named_array_begin(w, "referrals"); 8640 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8641 spdk_json_write_object_begin(w); 8642 spdk_json_write_named_object_begin(w, "trid"); 8643 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8644 spdk_json_write_object_end(w); 8645 spdk_json_write_object_end(w); 8646 } 8647 spdk_json_write_array_end(w); 8648 8649 spdk_json_write_object_end(w); 8650 } 8651 spdk_json_write_array_end(w); 8652 } 8653 8654 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8655 8656 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8657 { 8658 struct spdk_trace_tpoint_opts opts[] = { 8659 { 8660 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8661 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8662 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8663 }, 8664 { 8665 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8666 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8667 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8668 } 8669 }; 8670 8671 8672 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8673 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8674 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8675 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8676 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8677 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8678 } 8679
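/* Usage sketch (illustrative only, not compiled in): how a caller outside
 * this module might combine the generic bdev API with bdev_nvme_get_ctrlr()
 * to reach the underlying NVMe controller. The bdev name "Nvme0n1" is a
 * hypothetical example and depends on how the controller was attached.
 *
 *	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Nvme0n1");
 *	struct spdk_nvme_ctrlr *ctrlr = bdev_nvme_get_ctrlr(bdev);
 *
 *	if (ctrlr != NULL) {
 *		const struct spdk_nvme_ctrlr_data *cdata = spdk_nvme_ctrlr_get_data(ctrlr);
 *
 *		SPDK_NOTICELOG("Nvme0n1 is backed by controller with cntlid %u\n", cdata->cntlid);
 *	}
 *
 * bdev_nvme_get_ctrlr() returns NULL for a NULL bdev or a bdev owned by a
 * different module, so the sketch is safe even if the name lookup fails.
 */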