/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** Array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in the iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in the fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in fused_iovs. */
	int fused_iovpos;

	/** Offset in current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Tracks whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Number of zones that have been copied to the spdk_bdev_zone_info struct so far */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time.
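	 * Used to compute per-I/O latency in bdev_nvme_update_io_path_stat()
	 * as spdk_get_ticks() - submit_tsc.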
*/ 99 uint64_t submit_tsc; 100 }; 101 102 struct nvme_probe_skip_entry { 103 struct spdk_nvme_transport_id trid; 104 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 105 }; 106 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 107 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 108 g_skipped_nvme_ctrlrs); 109 110 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 111 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 112 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 113 114 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 116 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 117 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 120 121 static struct spdk_bdev_nvme_opts g_opts = { 122 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 123 .timeout_us = 0, 124 .timeout_admin_us = 0, 125 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 126 .transport_retry_count = 4, 127 .arbitration_burst = 0, 128 .low_priority_weight = 0, 129 .medium_priority_weight = 0, 130 .high_priority_weight = 0, 131 .nvme_adminq_poll_period_us = 10000ULL, 132 .nvme_ioq_poll_period_us = 0, 133 .io_queue_requests = 0, 134 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 135 .bdev_retry_count = 3, 136 .transport_ack_timeout = 0, 137 .ctrlr_loss_timeout_sec = 0, 138 .reconnect_delay_sec = 0, 139 .fast_io_fail_timeout_sec = 0, 140 .disable_auto_failback = false, 141 .generate_uuids = false, 142 .transport_tos = 0, 143 .nvme_error_stat = false, 144 .io_path_stat = false, 145 .allow_accel_sequence = false, 146 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 147 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 148 }; 149 150 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 151 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 152 153 static int g_hot_insert_nvme_controller_index = 0; 154 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 155 static bool g_nvme_hotplug_enabled = false; 156 struct spdk_thread *g_bdev_nvme_init_thread; 157 static struct spdk_poller *g_hotplug_poller; 158 static struct spdk_poller *g_hotplug_probe_poller; 159 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 160 161 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 162 struct nvme_async_probe_ctx *ctx); 163 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 164 struct nvme_async_probe_ctx *ctx); 165 static int bdev_nvme_library_init(void); 166 static void bdev_nvme_library_fini(void); 167 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 168 struct spdk_bdev_io *bdev_io); 169 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 170 struct spdk_bdev_io *bdev_io); 171 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 172 void *md, uint64_t lba_count, uint64_t lba, 173 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 174 struct spdk_accel_sequence *seq); 175 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 176 void *md, uint64_t lba_count, uint64_t lba); 177 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 178 void *md, uint64_t lba_count, uint64_t lba, 179 uint32_t flags, struct spdk_memory_domain *domain, void 
*domain_ctx, 180 struct spdk_accel_sequence *seq, 181 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13); 182 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 183 void *md, uint64_t lba_count, 184 uint64_t zslba, uint32_t flags); 185 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 186 void *md, uint64_t lba_count, uint64_t lba, 187 uint32_t flags); 188 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 189 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 190 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 191 uint32_t flags); 192 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 193 uint32_t num_zones, struct spdk_bdev_zone_info *info); 194 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 195 enum spdk_bdev_zone_action action); 196 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 197 struct nvme_bdev_io *bio, 198 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 199 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 200 void *buf, size_t nbytes); 201 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 202 void *buf, size_t nbytes, void *md_buf, size_t md_len); 203 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 204 struct iovec *iov, int iovcnt, size_t nbytes, 205 void *md_buf, size_t md_len); 206 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 207 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 208 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 209 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 210 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 211 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 212 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 213 214 static struct nvme_ns *nvme_ns_alloc(void); 215 static void nvme_ns_free(struct nvme_ns *ns); 216 217 static int 218 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 219 { 220 return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; 221 } 222 223 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 224 225 struct spdk_nvme_qpair * 226 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 227 { 228 struct nvme_ctrlr_channel *ctrlr_ch; 229 230 assert(ctrlr_io_ch != NULL); 231 232 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 233 234 return ctrlr_ch->qpair->qpair; 235 } 236 237 static int 238 bdev_nvme_get_ctx_size(void) 239 { 240 return sizeof(struct nvme_bdev_io); 241 } 242 243 static struct spdk_bdev_module nvme_if = { 244 .name = "nvme", 245 .async_fini = true, 246 .module_init = bdev_nvme_library_init, 247 .module_fini = bdev_nvme_library_fini, 248 .config_json = bdev_nvme_config_json, 249 .get_ctx_size = bdev_nvme_get_ctx_size, 250 251 }; 252 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 253 254 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 255 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 256 bool g_bdev_nvme_module_finish; 257 258 struct nvme_bdev_ctrlr * 259 nvme_bdev_ctrlr_get_by_name(const char *name) 260 { 261 struct nvme_bdev_ctrlr *nbdev_ctrlr; 262 263 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 264 if (strcmp(name, nbdev_ctrlr->name) == 0) { 265 break; 266 } 267 } 268 269 return nbdev_ctrlr; 270 } 271 272 static struct nvme_ctrlr * 273 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 274 const struct spdk_nvme_transport_id *trid) 275 { 276 struct nvme_ctrlr *nvme_ctrlr; 277 278 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 279 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 280 break; 281 } 282 } 283 284 return nvme_ctrlr; 285 } 286 287 struct nvme_ctrlr * 288 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 289 uint16_t cntlid) 290 { 291 struct nvme_ctrlr *nvme_ctrlr; 292 const struct spdk_nvme_ctrlr_data *cdata; 293 294 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 295 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 296 if (cdata->cntlid == cntlid) { 297 break; 298 } 299 } 300 301 return nvme_ctrlr; 302 } 303 304 static struct nvme_bdev * 305 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 306 { 307 struct nvme_bdev *bdev; 308 309 pthread_mutex_lock(&g_bdev_nvme_mutex); 310 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 311 if (bdev->nsid == nsid) { 312 break; 313 } 314 } 315 pthread_mutex_unlock(&g_bdev_nvme_mutex); 316 317 return bdev; 318 } 319 320 struct nvme_ns * 321 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 322 { 323 struct nvme_ns ns; 324 325 assert(nsid > 0); 326 327 ns.id = nsid; 328 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 329 } 330 331 struct nvme_ns * 332 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 333 { 334 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 335 } 336 337 struct nvme_ns * 338 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 339 { 340 if (ns == NULL) { 341 return NULL; 342 } 343 344 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 345 } 346 347 static struct nvme_ctrlr * 348 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 349 { 350 struct nvme_bdev_ctrlr *nbdev_ctrlr; 351 struct nvme_ctrlr *nvme_ctrlr = NULL; 352 353 pthread_mutex_lock(&g_bdev_nvme_mutex); 354 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 355 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 356 if (nvme_ctrlr != NULL) { 357 break; 358 } 359 } 
360 pthread_mutex_unlock(&g_bdev_nvme_mutex); 361 362 return nvme_ctrlr; 363 } 364 365 struct nvme_ctrlr * 366 nvme_ctrlr_get_by_name(const char *name) 367 { 368 struct nvme_bdev_ctrlr *nbdev_ctrlr; 369 struct nvme_ctrlr *nvme_ctrlr = NULL; 370 371 if (name == NULL) { 372 return NULL; 373 } 374 375 pthread_mutex_lock(&g_bdev_nvme_mutex); 376 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 377 if (nbdev_ctrlr != NULL) { 378 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 379 } 380 pthread_mutex_unlock(&g_bdev_nvme_mutex); 381 382 return nvme_ctrlr; 383 } 384 385 void 386 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 387 { 388 struct nvme_bdev_ctrlr *nbdev_ctrlr; 389 390 pthread_mutex_lock(&g_bdev_nvme_mutex); 391 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 392 fn(nbdev_ctrlr, ctx); 393 } 394 pthread_mutex_unlock(&g_bdev_nvme_mutex); 395 } 396 397 void 398 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 399 { 400 const char *trtype_str; 401 const char *adrfam_str; 402 403 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 404 if (trtype_str) { 405 spdk_json_write_named_string(w, "trtype", trtype_str); 406 } 407 408 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 409 if (adrfam_str) { 410 spdk_json_write_named_string(w, "adrfam", adrfam_str); 411 } 412 413 if (trid->traddr[0] != '\0') { 414 spdk_json_write_named_string(w, "traddr", trid->traddr); 415 } 416 417 if (trid->trsvcid[0] != '\0') { 418 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 419 } 420 421 if (trid->subnqn[0] != '\0') { 422 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 423 } 424 } 425 426 static void 427 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 428 struct nvme_ctrlr *nvme_ctrlr) 429 { 430 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 431 pthread_mutex_lock(&g_bdev_nvme_mutex); 432 433 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 434 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 435 pthread_mutex_unlock(&g_bdev_nvme_mutex); 436 437 return; 438 } 439 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 440 441 pthread_mutex_unlock(&g_bdev_nvme_mutex); 442 443 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 444 445 free(nbdev_ctrlr->name); 446 free(nbdev_ctrlr); 447 } 448 449 static void 450 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 451 { 452 struct nvme_path_id *path_id, *tmp_path; 453 struct nvme_ns *ns, *tmp_ns; 454 455 free(nvme_ctrlr->copied_ana_desc); 456 spdk_free(nvme_ctrlr->ana_log_page); 457 458 if (nvme_ctrlr->opal_dev) { 459 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 460 nvme_ctrlr->opal_dev = NULL; 461 } 462 463 if (nvme_ctrlr->nbdev_ctrlr) { 464 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 465 } 466 467 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 468 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 469 nvme_ns_free(ns); 470 } 471 472 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 473 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 474 free(path_id); 475 } 476 477 pthread_mutex_destroy(&nvme_ctrlr->mutex); 478 spdk_keyring_put_key(nvme_ctrlr->psk); 479 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 480 spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key); 481 free(nvme_ctrlr); 482 483 pthread_mutex_lock(&g_bdev_nvme_mutex); 484 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 485 pthread_mutex_unlock(&g_bdev_nvme_mutex); 486 
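		/* This was the last controller and module finish was requested, so complete
		 * the deferred module teardown here.
		 */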
spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 487 spdk_bdev_module_fini_done(); 488 return; 489 } 490 pthread_mutex_unlock(&g_bdev_nvme_mutex); 491 } 492 493 static int 494 nvme_detach_poller(void *arg) 495 { 496 struct nvme_ctrlr *nvme_ctrlr = arg; 497 int rc; 498 499 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 500 if (rc != -EAGAIN) { 501 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 502 _nvme_ctrlr_delete(nvme_ctrlr); 503 } 504 505 return SPDK_POLLER_BUSY; 506 } 507 508 static void 509 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 510 { 511 int rc; 512 513 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 514 515 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 516 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 517 518 /* If we got here, the reset/detach poller cannot be active */ 519 assert(nvme_ctrlr->reset_detach_poller == NULL); 520 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 521 nvme_ctrlr, 1000); 522 if (nvme_ctrlr->reset_detach_poller == NULL) { 523 SPDK_ERRLOG("Failed to register detach poller\n"); 524 goto error; 525 } 526 527 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 528 if (rc != 0) { 529 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 530 goto error; 531 } 532 533 return; 534 error: 535 /* We don't have a good way to handle errors here, so just do what we can and delete the 536 * controller without detaching the underlying NVMe device. 537 */ 538 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 539 _nvme_ctrlr_delete(nvme_ctrlr); 540 } 541 542 static void 543 nvme_ctrlr_unregister_cb(void *io_device) 544 { 545 struct nvme_ctrlr *nvme_ctrlr = io_device; 546 547 nvme_ctrlr_delete(nvme_ctrlr); 548 } 549 550 static void 551 nvme_ctrlr_unregister(void *ctx) 552 { 553 struct nvme_ctrlr *nvme_ctrlr = ctx; 554 555 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 556 } 557 558 static bool 559 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 560 { 561 if (!nvme_ctrlr->destruct) { 562 return false; 563 } 564 565 if (nvme_ctrlr->ref > 0) { 566 return false; 567 } 568 569 if (nvme_ctrlr->resetting) { 570 return false; 571 } 572 573 if (nvme_ctrlr->ana_log_page_updating) { 574 return false; 575 } 576 577 if (nvme_ctrlr->io_path_cache_clearing) { 578 return false; 579 } 580 581 return true; 582 } 583 584 static void 585 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 586 { 587 pthread_mutex_lock(&nvme_ctrlr->mutex); 588 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 589 590 assert(nvme_ctrlr->ref > 0); 591 nvme_ctrlr->ref--; 592 593 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 594 pthread_mutex_unlock(&nvme_ctrlr->mutex); 595 return; 596 } 597 598 pthread_mutex_unlock(&nvme_ctrlr->mutex); 599 600 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 601 } 602 603 static void 604 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 605 { 606 nbdev_ch->current_io_path = NULL; 607 nbdev_ch->rr_counter = 0; 608 } 609 610 static struct nvme_io_path * 611 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 612 { 613 struct nvme_io_path *io_path; 614 615 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 616 if (io_path->nvme_ns == nvme_ns) { 617 break; 618 } 619 } 620 621 return io_path; 622 } 623 624 static struct nvme_io_path * 625 nvme_io_path_alloc(void) 626 { 
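	/* Allocates the io_path and, when g_opts.io_path_stat is set, its per-path statistics buffer. */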
627 struct nvme_io_path *io_path; 628 629 io_path = calloc(1, sizeof(*io_path)); 630 if (io_path == NULL) { 631 SPDK_ERRLOG("Failed to alloc io_path.\n"); 632 return NULL; 633 } 634 635 if (g_opts.io_path_stat) { 636 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 637 if (io_path->stat == NULL) { 638 free(io_path); 639 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 640 return NULL; 641 } 642 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 643 } 644 645 return io_path; 646 } 647 648 static void 649 nvme_io_path_free(struct nvme_io_path *io_path) 650 { 651 free(io_path->stat); 652 free(io_path); 653 } 654 655 static int 656 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 657 { 658 struct nvme_io_path *io_path; 659 struct spdk_io_channel *ch; 660 struct nvme_ctrlr_channel *ctrlr_ch; 661 struct nvme_qpair *nvme_qpair; 662 663 io_path = nvme_io_path_alloc(); 664 if (io_path == NULL) { 665 return -ENOMEM; 666 } 667 668 io_path->nvme_ns = nvme_ns; 669 670 ch = spdk_get_io_channel(nvme_ns->ctrlr); 671 if (ch == NULL) { 672 nvme_io_path_free(io_path); 673 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 674 return -ENOMEM; 675 } 676 677 ctrlr_ch = spdk_io_channel_get_ctx(ch); 678 679 nvme_qpair = ctrlr_ch->qpair; 680 assert(nvme_qpair != NULL); 681 682 io_path->qpair = nvme_qpair; 683 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 684 685 io_path->nbdev_ch = nbdev_ch; 686 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 687 688 bdev_nvme_clear_current_io_path(nbdev_ch); 689 690 return 0; 691 } 692 693 static void 694 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 695 struct nvme_io_path *io_path) 696 { 697 struct spdk_bdev_io *bdev_io; 698 struct nvme_bdev_io *bio; 699 700 TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) { 701 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 702 if (bio->io_path == io_path) { 703 bio->io_path = NULL; 704 } 705 } 706 } 707 708 static void 709 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 710 { 711 struct spdk_io_channel *ch; 712 struct nvme_qpair *nvme_qpair; 713 struct nvme_ctrlr_channel *ctrlr_ch; 714 struct nvme_bdev *nbdev; 715 716 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 717 718 /* Add the statistics to nvme_ns before this path is destroyed. */ 719 pthread_mutex_lock(&nbdev->mutex); 720 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 721 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 722 } 723 pthread_mutex_unlock(&nbdev->mutex); 724 725 bdev_nvme_clear_current_io_path(nbdev_ch); 726 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 727 728 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 729 io_path->nbdev_ch = NULL; 730 731 nvme_qpair = io_path->qpair; 732 assert(nvme_qpair != NULL); 733 734 ctrlr_ch = nvme_qpair->ctrlr_ch; 735 assert(ctrlr_ch != NULL); 736 737 ch = spdk_io_channel_from_ctx(ctrlr_ch); 738 spdk_put_io_channel(ch); 739 740 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 741 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 742 * io_path here but free the io_path when the associated qpair is freed. It is ensured 743 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
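	 * (The io_path stays on nvme_qpair->io_path_list until then; only the nbdev_ch
	 * linkage is torn down here.)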
744 */ 745 } 746 747 static void 748 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 749 { 750 struct nvme_io_path *io_path, *tmp_io_path; 751 752 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 753 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 754 } 755 } 756 757 static int 758 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 759 { 760 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 761 struct nvme_bdev *nbdev = io_device; 762 struct nvme_ns *nvme_ns; 763 int rc; 764 765 STAILQ_INIT(&nbdev_ch->io_path_list); 766 TAILQ_INIT(&nbdev_ch->retry_io_list); 767 768 pthread_mutex_lock(&nbdev->mutex); 769 770 nbdev_ch->mp_policy = nbdev->mp_policy; 771 nbdev_ch->mp_selector = nbdev->mp_selector; 772 nbdev_ch->rr_min_io = nbdev->rr_min_io; 773 774 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 775 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 776 if (rc != 0) { 777 pthread_mutex_unlock(&nbdev->mutex); 778 779 _bdev_nvme_delete_io_paths(nbdev_ch); 780 return rc; 781 } 782 } 783 pthread_mutex_unlock(&nbdev->mutex); 784 785 return 0; 786 } 787 788 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 789 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 790 */ 791 static inline void 792 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 793 const struct spdk_nvme_cpl *cpl) 794 { 795 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 796 (uintptr_t)bdev_io); 797 if (cpl) { 798 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 799 } else { 800 spdk_bdev_io_complete(bdev_io, status); 801 } 802 } 803 804 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 805 806 static void 807 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 808 { 809 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 810 811 bdev_nvme_abort_retry_ios(nbdev_ch); 812 _bdev_nvme_delete_io_paths(nbdev_ch); 813 } 814 815 static inline bool 816 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 817 { 818 switch (io_type) { 819 case SPDK_BDEV_IO_TYPE_RESET: 820 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 821 case SPDK_BDEV_IO_TYPE_ABORT: 822 return true; 823 default: 824 break; 825 } 826 827 return false; 828 } 829 830 static inline bool 831 nvme_ns_is_active(struct nvme_ns *nvme_ns) 832 { 833 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 834 return false; 835 } 836 837 if (spdk_unlikely(nvme_ns->ns == NULL)) { 838 return false; 839 } 840 841 return true; 842 } 843 844 static inline bool 845 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 846 { 847 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 848 return false; 849 } 850 851 switch (nvme_ns->ana_state) { 852 case SPDK_NVME_ANA_OPTIMIZED_STATE: 853 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 854 return true; 855 default: 856 break; 857 } 858 859 return false; 860 } 861 862 static inline bool 863 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 864 { 865 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 866 return false; 867 } 868 869 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 870 SPDK_NVME_QPAIR_FAILURE_NONE)) { 871 return false; 872 } 873 874 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 875 return false; 876 } 877 878 return true; 879 } 880 881 static inline bool 882 nvme_io_path_is_available(struct nvme_io_path *io_path) 883 { 884 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 
885 return false; 886 } 887 888 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 889 return false; 890 } 891 892 return true; 893 } 894 895 static inline bool 896 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 897 { 898 if (nvme_ctrlr->destruct) { 899 return true; 900 } 901 902 if (nvme_ctrlr->fast_io_fail_timedout) { 903 return true; 904 } 905 906 if (nvme_ctrlr->resetting) { 907 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 908 return false; 909 } else { 910 return true; 911 } 912 } 913 914 if (nvme_ctrlr->reconnect_is_delayed) { 915 return false; 916 } 917 918 if (nvme_ctrlr->disabled) { 919 return true; 920 } 921 922 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 923 return true; 924 } else { 925 return false; 926 } 927 } 928 929 static bool 930 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 931 { 932 if (nvme_ctrlr->destruct) { 933 return false; 934 } 935 936 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 937 return false; 938 } 939 940 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 941 return false; 942 } 943 944 if (nvme_ctrlr->disabled) { 945 return false; 946 } 947 948 return true; 949 } 950 951 /* Simulate circular linked list. */ 952 static inline struct nvme_io_path * 953 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 954 { 955 struct nvme_io_path *next_path; 956 957 if (prev_path != NULL) { 958 next_path = STAILQ_NEXT(prev_path, stailq); 959 if (next_path != NULL) { 960 return next_path; 961 } 962 } 963 964 return STAILQ_FIRST(&nbdev_ch->io_path_list); 965 } 966 967 static struct nvme_io_path * 968 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 969 { 970 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 971 972 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 973 974 io_path = start; 975 do { 976 if (spdk_likely(nvme_io_path_is_available(io_path))) { 977 switch (io_path->nvme_ns->ana_state) { 978 case SPDK_NVME_ANA_OPTIMIZED_STATE: 979 nbdev_ch->current_io_path = io_path; 980 return io_path; 981 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 982 if (non_optimized == NULL) { 983 non_optimized = io_path; 984 } 985 break; 986 default: 987 assert(false); 988 break; 989 } 990 } 991 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 992 } while (io_path != start); 993 994 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 995 /* We come here only if there is no optimized path. Cache even non_optimized 996 * path for load balance across multiple non_optimized paths. 997 */ 998 nbdev_ch->current_io_path = non_optimized; 999 } 1000 1001 return non_optimized; 1002 } 1003 1004 static struct nvme_io_path * 1005 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1006 { 1007 struct nvme_io_path *io_path; 1008 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1009 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1010 uint32_t num_outstanding_reqs; 1011 1012 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1013 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1014 /* The device is currently resetting. 
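			 * Skip it; the queue-depth selector only considers connected qpairs.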
*/ 1015 continue; 1016 } 1017 1018 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1019 continue; 1020 } 1021 1022 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1023 switch (io_path->nvme_ns->ana_state) { 1024 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1025 if (num_outstanding_reqs < opt_min_qd) { 1026 opt_min_qd = num_outstanding_reqs; 1027 optimized = io_path; 1028 } 1029 break; 1030 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1031 if (num_outstanding_reqs < non_opt_min_qd) { 1032 non_opt_min_qd = num_outstanding_reqs; 1033 non_optimized = io_path; 1034 } 1035 break; 1036 default: 1037 break; 1038 } 1039 } 1040 1041 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1042 if (optimized != NULL) { 1043 return optimized; 1044 } 1045 1046 return non_optimized; 1047 } 1048 1049 static inline struct nvme_io_path * 1050 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1051 { 1052 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1053 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1054 return nbdev_ch->current_io_path; 1055 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1056 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1057 return nbdev_ch->current_io_path; 1058 } 1059 nbdev_ch->rr_counter = 0; 1060 } 1061 } 1062 1063 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1064 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1065 return _bdev_nvme_find_io_path(nbdev_ch); 1066 } else { 1067 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1068 } 1069 } 1070 1071 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1072 * or false otherwise. 1073 * 1074 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1075 * is likely to be non-accessible now but may become accessible. 1076 * 1077 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1078 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1079 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1080 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
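 *
 * Callers use this to decide whether a failed I/O should be queued for retry or
 * completed with an error immediately.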
1081 */ 1082 static bool 1083 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1084 { 1085 struct nvme_io_path *io_path; 1086 1087 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1088 if (io_path->nvme_ns->ana_transition_timedout) { 1089 continue; 1090 } 1091 1092 if (nvme_qpair_is_connected(io_path->qpair) || 1093 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1094 return true; 1095 } 1096 } 1097 1098 return false; 1099 } 1100 1101 static void 1102 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1103 { 1104 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1105 struct spdk_io_channel *ch; 1106 1107 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1108 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1109 } else { 1110 ch = spdk_io_channel_from_ctx(nbdev_ch); 1111 bdev_nvme_submit_request(ch, bdev_io); 1112 } 1113 } 1114 1115 static int 1116 bdev_nvme_retry_ios(void *arg) 1117 { 1118 struct nvme_bdev_channel *nbdev_ch = arg; 1119 struct spdk_bdev_io *bdev_io, *tmp_bdev_io; 1120 struct nvme_bdev_io *bio; 1121 uint64_t now, delay_us; 1122 1123 now = spdk_get_ticks(); 1124 1125 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) { 1126 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1127 if (bio->retry_ticks > now) { 1128 break; 1129 } 1130 1131 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1132 1133 bdev_nvme_retry_io(nbdev_ch, bdev_io); 1134 } 1135 1136 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1137 1138 bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1139 if (bdev_io != NULL) { 1140 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1141 1142 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1143 1144 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1145 delay_us); 1146 } 1147 1148 return SPDK_POLLER_BUSY; 1149 } 1150 1151 static void 1152 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1153 struct nvme_bdev_io *bio, uint64_t delay_ms) 1154 { 1155 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1156 struct spdk_bdev_io *tmp_bdev_io; 1157 struct nvme_bdev_io *tmp_bio; 1158 1159 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1160 1161 TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) { 1162 tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx; 1163 1164 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1165 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io, 1166 module_link); 1167 return; 1168 } 1169 } 1170 1171 /* No earlier I/Os were found. This I/O must be the new head. 
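	 * Re-arm the retry poller below so that it fires when this I/O's retry_ticks expires.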
*/ 1172 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link); 1173 1174 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1175 1176 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1177 delay_ms * 1000ULL); 1178 } 1179 1180 static void 1181 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1182 { 1183 struct spdk_bdev_io *bdev_io, *tmp_io; 1184 1185 TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) { 1186 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link); 1187 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1188 } 1189 1190 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1191 } 1192 1193 static int 1194 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1195 struct nvme_bdev_io *bio_to_abort) 1196 { 1197 struct spdk_bdev_io *bdev_io_to_abort; 1198 1199 TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) { 1200 if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) { 1201 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link); 1202 __bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1203 return 0; 1204 } 1205 } 1206 1207 return -ENOENT; 1208 } 1209 1210 static void 1211 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1212 { 1213 struct nvme_bdev *nbdev; 1214 uint16_t sct, sc; 1215 1216 assert(spdk_nvme_cpl_is_error(cpl)); 1217 1218 nbdev = bdev_io->bdev->ctxt; 1219 1220 if (nbdev->err_stat == NULL) { 1221 return; 1222 } 1223 1224 sct = cpl->status.sct; 1225 sc = cpl->status.sc; 1226 1227 pthread_mutex_lock(&nbdev->mutex); 1228 1229 nbdev->err_stat->status_type[sct]++; 1230 switch (sct) { 1231 case SPDK_NVME_SCT_GENERIC: 1232 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1233 case SPDK_NVME_SCT_MEDIA_ERROR: 1234 case SPDK_NVME_SCT_PATH: 1235 nbdev->err_stat->status[sct][sc]++; 1236 break; 1237 default: 1238 break; 1239 } 1240 1241 pthread_mutex_unlock(&nbdev->mutex); 1242 } 1243 1244 static inline void 1245 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1246 { 1247 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1248 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1249 uint32_t blocklen = bdev_io->bdev->blocklen; 1250 struct spdk_bdev_io_stat *stat; 1251 uint64_t tsc_diff; 1252 1253 if (bio->io_path->stat == NULL) { 1254 return; 1255 } 1256 1257 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1258 stat = bio->io_path->stat; 1259 1260 switch (bdev_io->type) { 1261 case SPDK_BDEV_IO_TYPE_READ: 1262 stat->bytes_read += num_blocks * blocklen; 1263 stat->num_read_ops++; 1264 stat->read_latency_ticks += tsc_diff; 1265 if (stat->max_read_latency_ticks < tsc_diff) { 1266 stat->max_read_latency_ticks = tsc_diff; 1267 } 1268 if (stat->min_read_latency_ticks > tsc_diff) { 1269 stat->min_read_latency_ticks = tsc_diff; 1270 } 1271 break; 1272 case SPDK_BDEV_IO_TYPE_WRITE: 1273 stat->bytes_written += num_blocks * blocklen; 1274 stat->num_write_ops++; 1275 stat->write_latency_ticks += tsc_diff; 1276 if (stat->max_write_latency_ticks < tsc_diff) { 1277 stat->max_write_latency_ticks = tsc_diff; 1278 } 1279 if (stat->min_write_latency_ticks > tsc_diff) { 1280 stat->min_write_latency_ticks = tsc_diff; 1281 } 1282 break; 1283 case SPDK_BDEV_IO_TYPE_UNMAP: 1284 stat->bytes_unmapped += num_blocks * blocklen; 1285 stat->num_unmap_ops++; 1286 stat->unmap_latency_ticks += tsc_diff; 1287 if (stat->max_unmap_latency_ticks < tsc_diff) { 1288 
stat->max_unmap_latency_ticks = tsc_diff; 1289 } 1290 if (stat->min_unmap_latency_ticks > tsc_diff) { 1291 stat->min_unmap_latency_ticks = tsc_diff; 1292 } 1293 break; 1294 case SPDK_BDEV_IO_TYPE_ZCOPY: 1295 /* Track the data in the start phase only */ 1296 if (!bdev_io->u.bdev.zcopy.start) { 1297 break; 1298 } 1299 if (bdev_io->u.bdev.zcopy.populate) { 1300 stat->bytes_read += num_blocks * blocklen; 1301 stat->num_read_ops++; 1302 stat->read_latency_ticks += tsc_diff; 1303 if (stat->max_read_latency_ticks < tsc_diff) { 1304 stat->max_read_latency_ticks = tsc_diff; 1305 } 1306 if (stat->min_read_latency_ticks > tsc_diff) { 1307 stat->min_read_latency_ticks = tsc_diff; 1308 } 1309 } else { 1310 stat->bytes_written += num_blocks * blocklen; 1311 stat->num_write_ops++; 1312 stat->write_latency_ticks += tsc_diff; 1313 if (stat->max_write_latency_ticks < tsc_diff) { 1314 stat->max_write_latency_ticks = tsc_diff; 1315 } 1316 if (stat->min_write_latency_ticks > tsc_diff) { 1317 stat->min_write_latency_ticks = tsc_diff; 1318 } 1319 } 1320 break; 1321 case SPDK_BDEV_IO_TYPE_COPY: 1322 stat->bytes_copied += num_blocks * blocklen; 1323 stat->num_copy_ops++; 1324 stat->copy_latency_ticks += tsc_diff; 1325 if (stat->max_copy_latency_ticks < tsc_diff) { 1326 stat->max_copy_latency_ticks = tsc_diff; 1327 } 1328 if (stat->min_copy_latency_ticks > tsc_diff) { 1329 stat->min_copy_latency_ticks = tsc_diff; 1330 } 1331 break; 1332 default: 1333 break; 1334 } 1335 } 1336 1337 static bool 1338 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1339 const struct spdk_nvme_cpl *cpl, 1340 struct nvme_bdev_channel *nbdev_ch, 1341 uint64_t *_delay_ms) 1342 { 1343 struct nvme_io_path *io_path = bio->io_path; 1344 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1345 const struct spdk_nvme_ctrlr_data *cdata; 1346 1347 if (spdk_nvme_cpl_is_path_error(cpl) || 1348 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1349 !nvme_io_path_is_available(io_path) || 1350 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1351 bdev_nvme_clear_current_io_path(nbdev_ch); 1352 bio->io_path = NULL; 1353 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1354 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1355 io_path->nvme_ns->ana_state_updating = true; 1356 } 1357 } 1358 if (!any_io_path_may_become_available(nbdev_ch)) { 1359 return false; 1360 } 1361 *_delay_ms = 0; 1362 } else { 1363 bio->retry_count++; 1364 1365 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1366 1367 if (cpl->status.crd != 0) { 1368 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1369 } else { 1370 *_delay_ms = 0; 1371 } 1372 } 1373 1374 return true; 1375 } 1376 1377 static inline void 1378 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1379 const struct spdk_nvme_cpl *cpl) 1380 { 1381 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1382 struct nvme_bdev_channel *nbdev_ch; 1383 uint64_t delay_ms; 1384 1385 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1386 1387 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1388 bdev_nvme_update_io_path_stat(bio); 1389 goto complete; 1390 } 1391 1392 /* Update error counts before deciding if retry is needed. 1393 * Hence, error counts may be more than the number of I/O errors. 
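	 * (A command that fails once but later succeeds on retry is still counted.)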
1394 */ 1395 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1396 1397 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1398 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1399 goto complete; 1400 } 1401 1402 /* At this point we don't know whether the sequence was successfully executed or not, so we 1403 * cannot retry the IO */ 1404 if (bdev_io->u.bdev.accel_sequence != NULL) { 1405 goto complete; 1406 } 1407 1408 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1409 1410 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1411 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1412 return; 1413 } 1414 1415 complete: 1416 bio->retry_count = 0; 1417 bio->submit_tsc = 0; 1418 bdev_io->u.bdev.accel_sequence = NULL; 1419 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1420 } 1421 1422 static inline void 1423 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1424 { 1425 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1426 struct nvme_bdev_channel *nbdev_ch; 1427 enum spdk_bdev_io_status io_status; 1428 1429 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1430 1431 switch (rc) { 1432 case 0: 1433 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1434 break; 1435 case -ENOMEM: 1436 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1437 break; 1438 case -ENXIO: 1439 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1440 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1441 1442 bdev_nvme_clear_current_io_path(nbdev_ch); 1443 bio->io_path = NULL; 1444 1445 if (any_io_path_may_become_available(nbdev_ch)) { 1446 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1447 return; 1448 } 1449 } 1450 1451 /* fallthrough */ 1452 default: 1453 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1454 bdev_io->u.bdev.accel_sequence = NULL; 1455 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1456 break; 1457 } 1458 1459 bio->retry_count = 0; 1460 bio->submit_tsc = 0; 1461 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1462 } 1463 1464 static inline void 1465 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1466 { 1467 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1468 enum spdk_bdev_io_status io_status; 1469 1470 switch (rc) { 1471 case 0: 1472 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1473 break; 1474 case -ENOMEM: 1475 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1476 break; 1477 case -ENXIO: 1478 /* fallthrough */ 1479 default: 1480 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1481 break; 1482 } 1483 1484 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1485 } 1486 1487 static void 1488 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1489 { 1490 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1491 1492 pthread_mutex_lock(&nvme_ctrlr->mutex); 1493 1494 assert(nvme_ctrlr->io_path_cache_clearing == true); 1495 nvme_ctrlr->io_path_cache_clearing = false; 1496 1497 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1498 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1499 return; 1500 } 1501 1502 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1503 1504 nvme_ctrlr_unregister(nvme_ctrlr); 1505 } 1506 1507 static void 1508 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1509 { 1510 struct nvme_io_path *io_path; 1511 1512 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1513 if (io_path->nbdev_ch == NULL) { 1514 continue; 1515 } 1516 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1517 
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p failed to connect. Aborting the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr);
		}
	} else {
		/* In this case, the ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed.
delete nvme_qpair.\n", qpair); 1615 nvme_qpair_delete(nvme_qpair); 1616 } 1617 } 1618 1619 static void 1620 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1621 { 1622 struct nvme_qpair *nvme_qpair; 1623 1624 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1625 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1626 continue; 1627 } 1628 1629 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1630 SPDK_NVME_QPAIR_FAILURE_NONE) { 1631 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1632 } 1633 } 1634 } 1635 1636 static int 1637 bdev_nvme_poll(void *arg) 1638 { 1639 struct nvme_poll_group *group = arg; 1640 int64_t num_completions; 1641 1642 if (group->collect_spin_stat && group->start_ticks == 0) { 1643 group->start_ticks = spdk_get_ticks(); 1644 } 1645 1646 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1647 bdev_nvme_disconnected_qpair_cb); 1648 if (group->collect_spin_stat) { 1649 if (num_completions > 0) { 1650 if (group->end_ticks != 0) { 1651 group->spin_ticks += (group->end_ticks - group->start_ticks); 1652 group->end_ticks = 0; 1653 } 1654 group->start_ticks = 0; 1655 } else { 1656 group->end_ticks = spdk_get_ticks(); 1657 } 1658 } 1659 1660 if (spdk_unlikely(num_completions < 0)) { 1661 bdev_nvme_check_io_qpairs(group); 1662 } 1663 1664 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1665 } 1666 1667 static int bdev_nvme_poll_adminq(void *arg); 1668 1669 static void 1670 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1671 { 1672 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1673 1674 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1675 nvme_ctrlr, new_period_us); 1676 } 1677 1678 static int 1679 bdev_nvme_poll_adminq(void *arg) 1680 { 1681 int32_t rc; 1682 struct nvme_ctrlr *nvme_ctrlr = arg; 1683 nvme_ctrlr_disconnected_cb disconnected_cb; 1684 1685 assert(nvme_ctrlr != NULL); 1686 1687 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1688 if (rc < 0) { 1689 disconnected_cb = nvme_ctrlr->disconnected_cb; 1690 nvme_ctrlr->disconnected_cb = NULL; 1691 1692 if (disconnected_cb != NULL) { 1693 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1694 g_opts.nvme_adminq_poll_period_us); 1695 disconnected_cb(nvme_ctrlr); 1696 } else { 1697 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1698 } 1699 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1700 SPDK_NVME_QPAIR_FAILURE_NONE) { 1701 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1702 } 1703 1704 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1705 } 1706 1707 static void 1708 nvme_bdev_free(void *io_device) 1709 { 1710 struct nvme_bdev *nvme_disk = io_device; 1711 1712 pthread_mutex_destroy(&nvme_disk->mutex); 1713 free(nvme_disk->disk.name); 1714 free(nvme_disk->err_stat); 1715 free(nvme_disk); 1716 } 1717 1718 static int 1719 bdev_nvme_destruct(void *ctx) 1720 { 1721 struct nvme_bdev *nvme_disk = ctx; 1722 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1723 1724 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1725 1726 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1727 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1728 1729 nvme_ns->bdev = NULL; 1730 1731 assert(nvme_ns->id > 0); 1732 1733 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1734 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1735 1736 nvme_ctrlr_release(nvme_ns->ctrlr); 1737 nvme_ns_free(nvme_ns); 1738 } else { 1739 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1740 } 1741 } 1742 1743 pthread_mutex_lock(&g_bdev_nvme_mutex); 1744 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1745 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1746 1747 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1748 1749 return 0; 1750 } 1751 1752 static int 1753 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1754 { 1755 struct nvme_ctrlr *nvme_ctrlr; 1756 struct spdk_nvme_io_qpair_opts opts; 1757 struct spdk_nvme_qpair *qpair; 1758 int rc; 1759 1760 nvme_ctrlr = nvme_qpair->ctrlr; 1761 1762 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1763 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1764 opts.create_only = true; 1765 opts.async_mode = true; 1766 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1767 g_opts.io_queue_requests = opts.io_queue_requests; 1768 1769 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1770 if (qpair == NULL) { 1771 return -1; 1772 } 1773 1774 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1775 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1776 1777 assert(nvme_qpair->group != NULL); 1778 1779 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1780 if (rc != 0) { 1781 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1782 goto err; 1783 } 1784 1785 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1786 if (rc != 0) { 1787 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1788 goto err; 1789 } 1790 1791 nvme_qpair->qpair = qpair; 1792 1793 if (!g_opts.disable_auto_failback) { 1794 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1795 } 1796 1797 return 0; 1798 1799 err: 1800 spdk_nvme_ctrlr_free_io_qpair(qpair); 1801 1802 return rc; 1803 } 1804 1805 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1806 1807 static void 1808 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1809 { 1810 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1811 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1812 int rc = 0; 1813 struct spdk_bdev_io *bdev_io; 1814 struct nvme_bdev_io *bio; 1815 1816 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1817 rc = -1; 1818 } 1819 1820 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1821 bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1822 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link); 1823 1824 bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 
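		/* Complete this queued reset I/O with the outcome of the reset sequence that just finished. */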
		bdev_nvme_reset_io_continue(bio, rc);
	}

	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then makes the next trid, if one exists, the active trid within the controller.
 *
 * A true return value asks the caller to disconnect the current trid now so that
 * connecting the next trid can be attempted.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. A trid is considered failed while its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within the controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/* Shuffle the old trid to the end of the list and use the new one.
		 * This allows round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() was just called, or the next trid has not
		 * failed or been used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff has passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds.
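	 * For example, with reconnect_delay_sec = 10, if fewer than 10 seconds have passed
	 * since next_path->last_failed_tsc, we return false and the caller does not
	 * disconnect and reconnect immediately.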
*/ 1898 return false; 1899 } 1900 1901 static bool 1902 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1903 { 1904 int32_t elapsed; 1905 1906 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1907 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1908 return false; 1909 } 1910 1911 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1912 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1913 return true; 1914 } else { 1915 return false; 1916 } 1917 } 1918 1919 static bool 1920 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1921 { 1922 uint32_t elapsed; 1923 1924 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1925 return false; 1926 } 1927 1928 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1929 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1930 return true; 1931 } else { 1932 return false; 1933 } 1934 } 1935 1936 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1937 1938 static void 1939 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1940 { 1941 int rc; 1942 1943 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1944 if (rc != 0) { 1945 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1946 * fail the reset sequence immediately. 1947 */ 1948 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1949 return; 1950 } 1951 1952 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1953 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1954 */ 1955 assert(nvme_ctrlr->disconnected_cb == NULL); 1956 nvme_ctrlr->disconnected_cb = cb_fn; 1957 1958 /* During disconnection, reduce the period to poll adminq more often. */ 1959 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1960 } 1961 1962 enum bdev_nvme_op_after_reset { 1963 OP_NONE, 1964 OP_COMPLETE_PENDING_DESTRUCT, 1965 OP_DESTRUCT, 1966 OP_DELAYED_RECONNECT, 1967 OP_FAILOVER, 1968 }; 1969 1970 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1971 1972 static _bdev_nvme_op_after_reset 1973 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1974 { 1975 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1976 /* Complete pending destruct after reset completes. 
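		 * (The caller handles this case by invoking nvme_ctrlr_unregister(); see
		 * _bdev_nvme_reset_ctrlr_complete().)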
*/ 1977 return OP_COMPLETE_PENDING_DESTRUCT; 1978 } else if (nvme_ctrlr->pending_failover) { 1979 nvme_ctrlr->pending_failover = false; 1980 nvme_ctrlr->reset_start_tsc = 0; 1981 return OP_FAILOVER; 1982 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1983 nvme_ctrlr->reset_start_tsc = 0; 1984 return OP_NONE; 1985 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1986 return OP_DESTRUCT; 1987 } else { 1988 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1989 nvme_ctrlr->fast_io_fail_timedout = true; 1990 } 1991 return OP_DELAYED_RECONNECT; 1992 } 1993 } 1994 1995 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1996 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1997 1998 static int 1999 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2000 { 2001 struct nvme_ctrlr *nvme_ctrlr = ctx; 2002 2003 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2004 pthread_mutex_lock(&nvme_ctrlr->mutex); 2005 2006 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2007 2008 if (!nvme_ctrlr->reconnect_is_delayed) { 2009 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2010 return SPDK_POLLER_BUSY; 2011 } 2012 2013 nvme_ctrlr->reconnect_is_delayed = false; 2014 2015 if (nvme_ctrlr->destruct) { 2016 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2017 return SPDK_POLLER_BUSY; 2018 } 2019 2020 assert(nvme_ctrlr->resetting == false); 2021 nvme_ctrlr->resetting = true; 2022 2023 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2024 2025 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2026 2027 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2028 return SPDK_POLLER_BUSY; 2029 } 2030 2031 static void 2032 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2033 { 2034 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2035 2036 assert(nvme_ctrlr->reconnect_is_delayed == false); 2037 nvme_ctrlr->reconnect_is_delayed = true; 2038 2039 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2040 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2041 nvme_ctrlr, 2042 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2043 } 2044 2045 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2046 2047 static void 2048 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2049 { 2050 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2051 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2052 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2053 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2054 enum bdev_nvme_op_after_reset op_after_reset; 2055 2056 assert(nvme_ctrlr->thread == spdk_get_thread()); 2057 2058 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2059 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2060 2061 if (!success) { 2062 SPDK_ERRLOG("Resetting controller failed.\n"); 2063 } else { 2064 SPDK_NOTICELOG("Resetting controller successful.\n"); 2065 } 2066 2067 pthread_mutex_lock(&nvme_ctrlr->mutex); 2068 nvme_ctrlr->resetting = false; 2069 nvme_ctrlr->dont_retry = false; 2070 nvme_ctrlr->in_failover = false; 2071 2072 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2073 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2074 2075 /* Delay callbacks when the next operation is a failover. */ 2076 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2077 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2078 } 2079 2080 switch (op_after_reset) { 2081 case OP_COMPLETE_PENDING_DESTRUCT: 2082 nvme_ctrlr_unregister(nvme_ctrlr); 2083 break; 2084 case OP_DESTRUCT: 2085 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2086 remove_discovery_entry(nvme_ctrlr); 2087 break; 2088 case OP_DELAYED_RECONNECT: 2089 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2090 break; 2091 case OP_FAILOVER: 2092 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2093 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2094 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2095 break; 2096 default: 2097 break; 2098 } 2099 } 2100 2101 static void 2102 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2103 { 2104 pthread_mutex_lock(&nvme_ctrlr->mutex); 2105 if (!success) { 2106 /* Connecting the active trid failed. Set the next alternate trid to the 2107 * active trid if it exists. 2108 */ 2109 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2110 /* The next alternate trid exists and is ready to try. Try it now. */ 2111 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2112 2113 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2114 return; 2115 } 2116 2117 /* We came here if there is no alternate trid or if the next trid exists but 2118 * is not ready to try. We will try the active trid after reconnect_delay_sec 2119 * seconds if it is non-zero or at the next reset call otherwise. 2120 */ 2121 } else { 2122 /* Connecting the active trid succeeded. Clear the last failed time because it 2123 * means the trid is failed if its last failed time is non-zero. 2124 */ 2125 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2126 } 2127 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2128 2129 /* Make sure we clear any pending resets before returning. */ 2130 spdk_for_each_channel(nvme_ctrlr, 2131 bdev_nvme_complete_pending_resets, 2132 success ? NULL : (void *)0x1, 2133 _bdev_nvme_reset_ctrlr_complete); 2134 } 2135 2136 static void 2137 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2138 { 2139 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2140 2141 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2142 } 2143 2144 static void 2145 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2146 { 2147 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2148 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2149 struct nvme_qpair *nvme_qpair; 2150 2151 nvme_qpair = ctrlr_ch->qpair; 2152 assert(nvme_qpair != NULL); 2153 2154 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2155 2156 if (nvme_qpair->qpair != NULL) { 2157 if (nvme_qpair->ctrlr->dont_retry) { 2158 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2159 } 2160 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2161 2162 /* The current full reset sequence will move to the next 2163 * ctrlr_channel after the qpair is actually disconnected. 2164 */ 2165 assert(ctrlr_ch->reset_iter == NULL); 2166 ctrlr_ch->reset_iter = i; 2167 } else { 2168 spdk_for_each_channel_continue(i, 0); 2169 } 2170 } 2171 2172 static void 2173 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2174 { 2175 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2176 2177 if (status == 0) { 2178 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2179 } else { 2180 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 2181 spdk_for_each_channel(nvme_ctrlr, 2182 bdev_nvme_reset_destroy_qpair, 2183 NULL, 2184 bdev_nvme_reset_create_qpairs_failed); 2185 } 2186 } 2187 2188 static int 2189 bdev_nvme_reset_check_qpair_connected(void *ctx) 2190 { 2191 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2192 2193 if (ctrlr_ch->reset_iter == NULL) { 2194 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2195 assert(ctrlr_ch->connect_poller == NULL); 2196 assert(ctrlr_ch->qpair->qpair == NULL); 2197 return SPDK_POLLER_BUSY; 2198 } 2199 2200 assert(ctrlr_ch->qpair->qpair != NULL); 2201 2202 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2203 return SPDK_POLLER_BUSY; 2204 } 2205 2206 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2207 2208 /* The qpair completed connecting. Move to the next ctrlr_channel. */ 2209 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2210 ctrlr_ch->reset_iter = NULL; 2211 2212 if (!g_opts.disable_auto_failback) { 2213 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2214 } 2215 2216 return SPDK_POLLER_BUSY; 2217 } 2218 2219 static void 2220 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2221 { 2222 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2223 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2224 int rc; 2225 2226 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2227 if (rc == 0) { 2228 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2229 ctrlr_ch, 0); 2230 2231 /* The current full reset sequence will move to the next 2232 * ctrlr_channel after the qpair is actually connected. 2233 */ 2234 assert(ctrlr_ch->reset_iter == NULL); 2235 ctrlr_ch->reset_iter = i; 2236 } else { 2237 spdk_for_each_channel_continue(i, rc); 2238 } 2239 } 2240 2241 static void 2242 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2243 { 2244 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2245 struct nvme_ns *nvme_ns; 2246 2247 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2248 nvme_ns != NULL; 2249 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2250 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2251 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2252 /* NS can be added again. Just nullify nvme_ns->ns.
*/ 2253 nvme_ns->ns = NULL; 2254 } 2255 } 2256 } 2257 2258 2259 static int 2260 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2261 { 2262 struct nvme_ctrlr *nvme_ctrlr = arg; 2263 int rc = -ETIMEDOUT; 2264 2265 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2266 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2267 if (rc == -EAGAIN) { 2268 return SPDK_POLLER_BUSY; 2269 } 2270 } 2271 2272 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2273 if (rc == 0) { 2274 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2275 2276 /* Recreate all of the I/O queue pairs */ 2277 spdk_for_each_channel(nvme_ctrlr, 2278 bdev_nvme_reset_create_qpair, 2279 NULL, 2280 bdev_nvme_reset_create_qpairs_done); 2281 } else { 2282 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2283 } 2284 return SPDK_POLLER_BUSY; 2285 } 2286 2287 static void 2288 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2289 { 2290 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2291 2292 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2293 assert(nvme_ctrlr->reset_detach_poller == NULL); 2294 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2295 nvme_ctrlr, 0); 2296 } 2297 2298 static void 2299 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2300 { 2301 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2302 2303 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2304 assert(status == 0); 2305 2306 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2307 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2308 } else { 2309 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2310 } 2311 } 2312 2313 static void 2314 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2315 { 2316 spdk_for_each_channel(nvme_ctrlr, 2317 bdev_nvme_reset_destroy_qpair, 2318 NULL, 2319 bdev_nvme_reset_destroy_qpair_done); 2320 } 2321 2322 static void 2323 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2324 { 2325 struct nvme_ctrlr *nvme_ctrlr = ctx; 2326 2327 assert(nvme_ctrlr->resetting == true); 2328 assert(nvme_ctrlr->thread == spdk_get_thread()); 2329 2330 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2331 2332 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2333 2334 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2335 } 2336 2337 static void 2338 _bdev_nvme_reset_ctrlr(void *ctx) 2339 { 2340 struct nvme_ctrlr *nvme_ctrlr = ctx; 2341 2342 assert(nvme_ctrlr->resetting == true); 2343 assert(nvme_ctrlr->thread == spdk_get_thread()); 2344 2345 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2346 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2347 } else { 2348 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2349 } 2350 } 2351 2352 static int 2353 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2354 { 2355 spdk_msg_fn msg_fn; 2356 2357 pthread_mutex_lock(&nvme_ctrlr->mutex); 2358 if (nvme_ctrlr->destruct) { 2359 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2360 return -ENXIO; 2361 } 2362 2363 if (nvme_ctrlr->resetting) { 2364 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2365 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2366 return -EBUSY; 2367 } 2368 2369 if (nvme_ctrlr->disabled) { 2370 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2371 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2372 return -EALREADY; 2373 } 2374 2375 nvme_ctrlr->resetting = true; 2376 nvme_ctrlr->dont_retry = true; 2377 2378 if (nvme_ctrlr->reconnect_is_delayed) { 2379 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2380 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2381 nvme_ctrlr->reconnect_is_delayed = false; 2382 } else { 2383 msg_fn = _bdev_nvme_reset_ctrlr; 2384 assert(nvme_ctrlr->reset_start_tsc == 0); 2385 } 2386 2387 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2388 2389 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2390 2391 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2392 return 0; 2393 } 2394 2395 static int 2396 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2397 { 2398 pthread_mutex_lock(&nvme_ctrlr->mutex); 2399 if (nvme_ctrlr->destruct) { 2400 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2401 return -ENXIO; 2402 } 2403 2404 if (nvme_ctrlr->resetting) { 2405 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2406 return -EBUSY; 2407 } 2408 2409 if (!nvme_ctrlr->disabled) { 2410 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2411 return -EALREADY; 2412 } 2413 2414 nvme_ctrlr->disabled = false; 2415 nvme_ctrlr->resetting = true; 2416 2417 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2418 2419 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2420 2421 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2422 return 0; 2423 } 2424 2425 static void 2426 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2427 { 2428 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2429 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2430 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2431 enum bdev_nvme_op_after_reset op_after_disable; 2432 2433 assert(nvme_ctrlr->thread == spdk_get_thread()); 2434 2435 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2436 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2437 2438 pthread_mutex_lock(&nvme_ctrlr->mutex); 2439 2440 nvme_ctrlr->resetting = false; 2441 nvme_ctrlr->dont_retry = false; 2442 2443 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2444 2445 nvme_ctrlr->disabled = true; 2446 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2447 2448 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2449 2450 if (ctrlr_op_cb_fn) { 2451 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2452 } 2453 2454 switch (op_after_disable) { 2455 case OP_COMPLETE_PENDING_DESTRUCT: 2456 nvme_ctrlr_unregister(nvme_ctrlr); 2457 break; 2458 default: 2459 break; 2460 } 2461 2462 } 2463 2464 static void 2465 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2466 { 2467 /* Make sure we clear any pending resets before returning. 
*/ 2468 spdk_for_each_channel(nvme_ctrlr, 2469 bdev_nvme_complete_pending_resets, 2470 NULL, 2471 _bdev_nvme_disable_ctrlr_complete); 2472 } 2473 2474 static void 2475 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2476 { 2477 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2478 2479 assert(status == 0); 2480 2481 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2482 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2483 } else { 2484 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2485 } 2486 } 2487 2488 static void 2489 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2490 { 2491 spdk_for_each_channel(nvme_ctrlr, 2492 bdev_nvme_reset_destroy_qpair, 2493 NULL, 2494 bdev_nvme_disable_destroy_qpairs_done); 2495 } 2496 2497 static void 2498 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2499 { 2500 struct nvme_ctrlr *nvme_ctrlr = ctx; 2501 2502 assert(nvme_ctrlr->resetting == true); 2503 assert(nvme_ctrlr->thread == spdk_get_thread()); 2504 2505 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2506 2507 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2508 } 2509 2510 static void 2511 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2512 { 2513 struct nvme_ctrlr *nvme_ctrlr = ctx; 2514 2515 assert(nvme_ctrlr->resetting == true); 2516 assert(nvme_ctrlr->thread == spdk_get_thread()); 2517 2518 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2519 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2520 } else { 2521 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2522 } 2523 } 2524 2525 static int 2526 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2527 { 2528 spdk_msg_fn msg_fn; 2529 2530 pthread_mutex_lock(&nvme_ctrlr->mutex); 2531 if (nvme_ctrlr->destruct) { 2532 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2533 return -ENXIO; 2534 } 2535 2536 if (nvme_ctrlr->resetting) { 2537 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2538 return -EBUSY; 2539 } 2540 2541 if (nvme_ctrlr->disabled) { 2542 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2543 return -EALREADY; 2544 } 2545 2546 nvme_ctrlr->resetting = true; 2547 nvme_ctrlr->dont_retry = true; 2548 2549 if (nvme_ctrlr->reconnect_is_delayed) { 2550 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2551 nvme_ctrlr->reconnect_is_delayed = false; 2552 } else { 2553 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2554 } 2555 2556 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2557 2558 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2559 2560 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2561 return 0; 2562 } 2563 2564 static int 2565 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2566 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2567 { 2568 int rc; 2569 2570 switch (op) { 2571 case NVME_CTRLR_OP_RESET: 2572 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2573 break; 2574 case NVME_CTRLR_OP_ENABLE: 2575 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2576 break; 2577 case NVME_CTRLR_OP_DISABLE: 2578 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2579 break; 2580 default: 2581 rc = -EINVAL; 2582 break; 2583 } 2584 2585 if (rc == 0) { 2586 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2587 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2588 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2589 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2590 } 2591 return rc; 2592 } 2593 2594 struct nvme_ctrlr_op_rpc_ctx { 2595 struct nvme_ctrlr *nvme_ctrlr; 2596 struct spdk_thread *orig_thread; 2597 enum nvme_ctrlr_op op; 2598 int rc; 2599 
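/* User completion callback and argument; invoked on orig_thread once the
 * requested controller operation (reset/enable/disable) completes or fails.
 */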
bdev_nvme_ctrlr_op_cb cb_fn; 2600 void *cb_arg; 2601 }; 2602 2603 static void 2604 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2605 { 2606 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2607 2608 assert(ctx != NULL); 2609 assert(ctx->cb_fn != NULL); 2610 2611 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2612 2613 free(ctx); 2614 } 2615 2616 static void 2617 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2618 { 2619 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2620 2621 ctx->rc = rc; 2622 2623 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2624 } 2625 2626 void 2627 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2628 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2629 { 2630 struct nvme_ctrlr_op_rpc_ctx *ctx; 2631 int rc; 2632 2633 assert(cb_fn != NULL); 2634 2635 ctx = calloc(1, sizeof(*ctx)); 2636 if (ctx == NULL) { 2637 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2638 cb_fn(cb_arg, -ENOMEM); 2639 return; 2640 } 2641 2642 ctx->orig_thread = spdk_get_thread(); 2643 ctx->cb_fn = cb_fn; 2644 ctx->cb_arg = cb_arg; 2645 2646 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2647 if (rc == 0) { 2648 return; 2649 } else if (rc == -EALREADY) { 2650 rc = 0; 2651 } 2652 2653 nvme_ctrlr_op_rpc_complete(ctx, rc); 2654 } 2655 2656 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2657 2658 static void 2659 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2660 { 2661 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2662 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2663 int rc; 2664 2665 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2666 ctx->nvme_ctrlr = NULL; 2667 2668 if (ctx->rc != 0) { 2669 goto complete; 2670 } 2671 2672 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2673 if (next_nvme_ctrlr == NULL) { 2674 goto complete; 2675 } 2676 2677 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2678 if (rc == 0) { 2679 ctx->nvme_ctrlr = next_nvme_ctrlr; 2680 return; 2681 } else if (rc == -EALREADY) { 2682 ctx->nvme_ctrlr = next_nvme_ctrlr; 2683 rc = 0; 2684 } 2685 2686 ctx->rc = rc; 2687 2688 complete: 2689 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2690 free(ctx); 2691 } 2692 2693 static void 2694 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2695 { 2696 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2697 2698 ctx->rc = rc; 2699 2700 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2701 } 2702 2703 void 2704 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2705 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2706 { 2707 struct nvme_ctrlr_op_rpc_ctx *ctx; 2708 struct nvme_ctrlr *nvme_ctrlr; 2709 int rc; 2710 2711 assert(cb_fn != NULL); 2712 2713 ctx = calloc(1, sizeof(*ctx)); 2714 if (ctx == NULL) { 2715 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2716 cb_fn(cb_arg, -ENOMEM); 2717 return; 2718 } 2719 2720 ctx->orig_thread = spdk_get_thread(); 2721 ctx->op = op; 2722 ctx->cb_fn = cb_fn; 2723 ctx->cb_arg = cb_arg; 2724 2725 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2726 assert(nvme_ctrlr != NULL); 2727 2728 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2729 if (rc == 0) { 2730 ctx->nvme_ctrlr = nvme_ctrlr; 2731 return; 2732 } else if (rc == -EALREADY) { 2733 ctx->nvme_ctrlr = nvme_ctrlr; 2734 rc = 0; 2735 } 2736 2737 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2738 } 2739 2740 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2741 2742 static void 2743 
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2744 { 2745 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2746 enum spdk_bdev_io_status io_status; 2747 2748 if (bio->cpl.cdw0 == 0) { 2749 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2750 } else { 2751 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2752 } 2753 2754 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2755 } 2756 2757 static void 2758 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2759 { 2760 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2761 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2762 2763 bdev_nvme_abort_retry_ios(nbdev_ch); 2764 2765 spdk_for_each_channel_continue(i, 0); 2766 } 2767 2768 static void 2769 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2770 { 2771 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2772 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2773 2774 /* Abort all queued I/Os for retry. */ 2775 spdk_for_each_channel(nbdev, 2776 bdev_nvme_abort_bdev_channel, 2777 bio, 2778 _bdev_nvme_reset_io_complete); 2779 } 2780 2781 static void 2782 _bdev_nvme_reset_io_continue(void *ctx) 2783 { 2784 struct nvme_bdev_io *bio = ctx; 2785 struct nvme_io_path *prev_io_path, *next_io_path; 2786 int rc; 2787 2788 prev_io_path = bio->io_path; 2789 bio->io_path = NULL; 2790 2791 if (bio->cpl.cdw0 != 0) { 2792 goto complete; 2793 } 2794 2795 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2796 if (next_io_path == NULL) { 2797 goto complete; 2798 } 2799 2800 rc = _bdev_nvme_reset_io(next_io_path, bio); 2801 if (rc == 0) { 2802 return; 2803 } 2804 2805 bio->cpl.cdw0 = 1; 2806 2807 complete: 2808 bdev_nvme_reset_io_complete(bio); 2809 } 2810 2811 static void 2812 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2813 { 2814 struct nvme_bdev_io *bio = cb_arg; 2815 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2816 2817 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2818 2819 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2820 } 2821 2822 static int 2823 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2824 { 2825 struct nvme_ctrlr_channel *ctrlr_ch; 2826 struct spdk_bdev_io *bdev_io; 2827 int rc; 2828 2829 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2830 bdev_nvme_reset_io_continue, bio); 2831 if (rc != 0 && rc != -EBUSY) { 2832 return rc; 2833 } 2834 2835 assert(bio->io_path == NULL); 2836 bio->io_path = io_path; 2837 2838 if (rc == -EBUSY) { 2839 ctrlr_ch = io_path->qpair->ctrlr_ch; 2840 assert(ctrlr_ch != NULL); 2841 /* 2842 * Reset call is queued only if it is from the app framework. This is on purpose so that 2843 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2844 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2845 */ 2846 bdev_io = spdk_bdev_io_from_ctx(bio); 2847 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2848 } 2849 2850 return 0; 2851 } 2852 2853 static void 2854 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2855 { 2856 struct nvme_io_path *io_path; 2857 int rc; 2858 2859 bio->cpl.cdw0 = 0; 2860 2861 /* Reset all nvme_ctrlrs of a bdev controller sequentially. 
*/ 2862 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2863 assert(io_path != NULL); 2864 2865 rc = _bdev_nvme_reset_io(io_path, bio); 2866 if (rc != 0) { 2867 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2868 rc = (rc == -EALREADY) ? 0 : rc; 2869 2870 bdev_nvme_reset_io_continue(bio, rc); 2871 } 2872 } 2873 2874 static int 2875 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2876 { 2877 if (nvme_ctrlr->destruct) { 2878 /* Don't bother resetting if the controller is in the process of being destructed. */ 2879 return -ENXIO; 2880 } 2881 2882 if (nvme_ctrlr->resetting) { 2883 if (!nvme_ctrlr->in_failover) { 2884 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2885 2886 /* Defer failover until reset completes. */ 2887 nvme_ctrlr->pending_failover = true; 2888 return -EINPROGRESS; 2889 } else { 2890 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2891 return -EBUSY; 2892 } 2893 } 2894 2895 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2896 2897 if (nvme_ctrlr->reconnect_is_delayed) { 2898 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2899 2900 /* We rely on the next reconnect for the failover. */ 2901 return -EALREADY; 2902 } 2903 2904 if (nvme_ctrlr->disabled) { 2905 SPDK_NOTICELOG("Controller is disabled.\n"); 2906 2907 /* We rely on the enablement for the failover. */ 2908 return -EALREADY; 2909 } 2910 2911 nvme_ctrlr->resetting = true; 2912 nvme_ctrlr->in_failover = true; 2913 2914 assert(nvme_ctrlr->reset_start_tsc == 0); 2915 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2916 2917 return 0; 2918 } 2919 2920 static int 2921 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2922 { 2923 int rc; 2924 2925 pthread_mutex_lock(&nvme_ctrlr->mutex); 2926 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2927 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2928 2929 if (rc == 0) { 2930 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2931 } else if (rc == -EALREADY) { 2932 rc = 0; 2933 } 2934 2935 return rc; 2936 } 2937 2938 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2939 uint64_t num_blocks); 2940 2941 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2942 uint64_t num_blocks); 2943 2944 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2945 uint64_t src_offset_blocks, 2946 uint64_t num_blocks); 2947 2948 static void 2949 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2950 bool success) 2951 { 2952 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2953 int ret; 2954 2955 if (!success) { 2956 ret = -EINVAL; 2957 goto exit; 2958 } 2959 2960 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2961 ret = -ENXIO; 2962 goto exit; 2963 } 2964 2965 ret = bdev_nvme_readv(bio, 2966 bdev_io->u.bdev.iovs, 2967 bdev_io->u.bdev.iovcnt, 2968 bdev_io->u.bdev.md_buf, 2969 bdev_io->u.bdev.num_blocks, 2970 bdev_io->u.bdev.offset_blocks, 2971 bdev_io->u.bdev.dif_check_flags, 2972 bdev_io->u.bdev.memory_domain, 2973 bdev_io->u.bdev.memory_domain_ctx, 2974 bdev_io->u.bdev.accel_sequence); 2975 2976 exit: 2977 if (spdk_unlikely(ret != 0)) { 2978 bdev_nvme_io_complete(bio, ret); 2979 } 2980 } 2981 2982 static inline void 2983 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2984 { 2985 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io 
*)bdev_io->driver_ctx; 2986 struct spdk_bdev *bdev = bdev_io->bdev; 2987 struct nvme_bdev_io *nbdev_io_to_abort; 2988 int rc = 0; 2989 2990 switch (bdev_io->type) { 2991 case SPDK_BDEV_IO_TYPE_READ: 2992 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2993 2994 rc = bdev_nvme_readv(nbdev_io, 2995 bdev_io->u.bdev.iovs, 2996 bdev_io->u.bdev.iovcnt, 2997 bdev_io->u.bdev.md_buf, 2998 bdev_io->u.bdev.num_blocks, 2999 bdev_io->u.bdev.offset_blocks, 3000 bdev_io->u.bdev.dif_check_flags, 3001 bdev_io->u.bdev.memory_domain, 3002 bdev_io->u.bdev.memory_domain_ctx, 3003 bdev_io->u.bdev.accel_sequence); 3004 } else { 3005 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3006 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3007 rc = 0; 3008 } 3009 break; 3010 case SPDK_BDEV_IO_TYPE_WRITE: 3011 rc = bdev_nvme_writev(nbdev_io, 3012 bdev_io->u.bdev.iovs, 3013 bdev_io->u.bdev.iovcnt, 3014 bdev_io->u.bdev.md_buf, 3015 bdev_io->u.bdev.num_blocks, 3016 bdev_io->u.bdev.offset_blocks, 3017 bdev_io->u.bdev.dif_check_flags, 3018 bdev_io->u.bdev.memory_domain, 3019 bdev_io->u.bdev.memory_domain_ctx, 3020 bdev_io->u.bdev.accel_sequence, 3021 bdev_io->u.bdev.nvme_cdw12, 3022 bdev_io->u.bdev.nvme_cdw13); 3023 break; 3024 case SPDK_BDEV_IO_TYPE_COMPARE: 3025 rc = bdev_nvme_comparev(nbdev_io, 3026 bdev_io->u.bdev.iovs, 3027 bdev_io->u.bdev.iovcnt, 3028 bdev_io->u.bdev.md_buf, 3029 bdev_io->u.bdev.num_blocks, 3030 bdev_io->u.bdev.offset_blocks, 3031 bdev_io->u.bdev.dif_check_flags); 3032 break; 3033 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3034 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3035 bdev_io->u.bdev.iovs, 3036 bdev_io->u.bdev.iovcnt, 3037 bdev_io->u.bdev.fused_iovs, 3038 bdev_io->u.bdev.fused_iovcnt, 3039 bdev_io->u.bdev.md_buf, 3040 bdev_io->u.bdev.num_blocks, 3041 bdev_io->u.bdev.offset_blocks, 3042 bdev_io->u.bdev.dif_check_flags); 3043 break; 3044 case SPDK_BDEV_IO_TYPE_UNMAP: 3045 rc = bdev_nvme_unmap(nbdev_io, 3046 bdev_io->u.bdev.offset_blocks, 3047 bdev_io->u.bdev.num_blocks); 3048 break; 3049 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3050 rc = bdev_nvme_write_zeroes(nbdev_io, 3051 bdev_io->u.bdev.offset_blocks, 3052 bdev_io->u.bdev.num_blocks); 3053 break; 3054 case SPDK_BDEV_IO_TYPE_RESET: 3055 nbdev_io->io_path = NULL; 3056 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3057 return; 3058 3059 case SPDK_BDEV_IO_TYPE_FLUSH: 3060 bdev_nvme_io_complete(nbdev_io, 0); 3061 return; 3062 3063 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3064 rc = bdev_nvme_zone_appendv(nbdev_io, 3065 bdev_io->u.bdev.iovs, 3066 bdev_io->u.bdev.iovcnt, 3067 bdev_io->u.bdev.md_buf, 3068 bdev_io->u.bdev.num_blocks, 3069 bdev_io->u.bdev.offset_blocks, 3070 bdev_io->u.bdev.dif_check_flags); 3071 break; 3072 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3073 rc = bdev_nvme_get_zone_info(nbdev_io, 3074 bdev_io->u.zone_mgmt.zone_id, 3075 bdev_io->u.zone_mgmt.num_zones, 3076 bdev_io->u.zone_mgmt.buf); 3077 break; 3078 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3079 rc = bdev_nvme_zone_management(nbdev_io, 3080 bdev_io->u.zone_mgmt.zone_id, 3081 bdev_io->u.zone_mgmt.zone_action); 3082 break; 3083 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3084 nbdev_io->io_path = NULL; 3085 bdev_nvme_admin_passthru(nbdev_ch, 3086 nbdev_io, 3087 &bdev_io->u.nvme_passthru.cmd, 3088 bdev_io->u.nvme_passthru.buf, 3089 bdev_io->u.nvme_passthru.nbytes); 3090 return; 3091 3092 case SPDK_BDEV_IO_TYPE_NVME_IO: 3093 rc = bdev_nvme_io_passthru(nbdev_io, 3094 &bdev_io->u.nvme_passthru.cmd, 3095 bdev_io->u.nvme_passthru.buf, 3096 bdev_io->u.nvme_passthru.nbytes); 3097 break; 3098 case 
SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3099 rc = bdev_nvme_io_passthru_md(nbdev_io, 3100 &bdev_io->u.nvme_passthru.cmd, 3101 bdev_io->u.nvme_passthru.buf, 3102 bdev_io->u.nvme_passthru.nbytes, 3103 bdev_io->u.nvme_passthru.md_buf, 3104 bdev_io->u.nvme_passthru.md_len); 3105 break; 3106 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3107 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3108 &bdev_io->u.nvme_passthru.cmd, 3109 bdev_io->u.nvme_passthru.iovs, 3110 bdev_io->u.nvme_passthru.iovcnt, 3111 bdev_io->u.nvme_passthru.nbytes, 3112 bdev_io->u.nvme_passthru.md_buf, 3113 bdev_io->u.nvme_passthru.md_len); 3114 break; 3115 case SPDK_BDEV_IO_TYPE_ABORT: 3116 nbdev_io->io_path = NULL; 3117 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3118 bdev_nvme_abort(nbdev_ch, 3119 nbdev_io, 3120 nbdev_io_to_abort); 3121 return; 3122 3123 case SPDK_BDEV_IO_TYPE_COPY: 3124 rc = bdev_nvme_copy(nbdev_io, 3125 bdev_io->u.bdev.offset_blocks, 3126 bdev_io->u.bdev.copy.src_offset_blocks, 3127 bdev_io->u.bdev.num_blocks); 3128 break; 3129 default: 3130 rc = -EINVAL; 3131 break; 3132 } 3133 3134 if (spdk_unlikely(rc != 0)) { 3135 bdev_nvme_io_complete(nbdev_io, rc); 3136 } 3137 } 3138 3139 static void 3140 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3141 { 3142 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3143 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3144 3145 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3146 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3147 } else { 3148 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3149 * We need to update submit_tsc here. 3150 */ 3151 nbdev_io->submit_tsc = spdk_get_ticks(); 3152 } 3153 3154 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3155 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3156 if (spdk_unlikely(!nbdev_io->io_path)) { 3157 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3158 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3159 return; 3160 } 3161 3162 /* Admin commands do not use the optimal I/O path. 3163 * Simply fall through even if it is not found. 3164 */ 3165 } 3166 3167 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3168 } 3169 3170 static bool 3171 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3172 { 3173 struct nvme_bdev *nbdev = ctx; 3174 struct nvme_ns *nvme_ns; 3175 struct spdk_nvme_ns *ns; 3176 struct spdk_nvme_ctrlr *ctrlr; 3177 const struct spdk_nvme_ctrlr_data *cdata; 3178 3179 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3180 assert(nvme_ns != NULL); 3181 ns = nvme_ns->ns; 3182 if (ns == NULL) { 3183 return false; 3184 } 3185 3186 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3187 3188 switch (io_type) { 3189 case SPDK_BDEV_IO_TYPE_READ: 3190 case SPDK_BDEV_IO_TYPE_WRITE: 3191 case SPDK_BDEV_IO_TYPE_RESET: 3192 case SPDK_BDEV_IO_TYPE_FLUSH: 3193 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3194 case SPDK_BDEV_IO_TYPE_NVME_IO: 3195 case SPDK_BDEV_IO_TYPE_ABORT: 3196 return true; 3197 3198 case SPDK_BDEV_IO_TYPE_COMPARE: 3199 return spdk_nvme_ns_supports_compare(ns); 3200 3201 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3202 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3203 3204 case SPDK_BDEV_IO_TYPE_UNMAP: 3205 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3206 return cdata->oncs.dsm; 3207 3208 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3209 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3210 return cdata->oncs.write_zeroes; 3211 3212 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3213 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3214 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3215 return true; 3216 } 3217 return false; 3218 3219 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3220 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3221 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3222 3223 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3224 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3225 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3226 3227 case SPDK_BDEV_IO_TYPE_COPY: 3228 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3229 return cdata->oncs.copy; 3230 3231 default: 3232 return false; 3233 } 3234 } 3235 3236 static int 3237 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3238 { 3239 struct nvme_qpair *nvme_qpair; 3240 struct spdk_io_channel *pg_ch; 3241 int rc; 3242 3243 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3244 if (!nvme_qpair) { 3245 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3246 return -1; 3247 } 3248 3249 TAILQ_INIT(&nvme_qpair->io_path_list); 3250 3251 nvme_qpair->ctrlr = nvme_ctrlr; 3252 nvme_qpair->ctrlr_ch = ctrlr_ch; 3253 3254 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3255 if (!pg_ch) { 3256 free(nvme_qpair); 3257 return -1; 3258 } 3259 3260 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3261 3262 #ifdef SPDK_CONFIG_VTUNE 3263 nvme_qpair->group->collect_spin_stat = true; 3264 #else 3265 nvme_qpair->group->collect_spin_stat = false; 3266 #endif 3267 3268 if (!nvme_ctrlr->disabled) { 3269 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3270 * be created when it's enabled. 3271 */ 3272 rc = bdev_nvme_create_qpair(nvme_qpair); 3273 if (rc != 0) { 3274 /* nvme_ctrlr can't create IO qpair if connection is down. 3275 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3276 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3277 * submitted IO will be queued until IO qpair is successfully created. 3278 * 3279 * Hence, if both are satisfied, ignore the failure. 
3280 */ 3281 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3282 spdk_put_io_channel(pg_ch); 3283 free(nvme_qpair); 3284 return rc; 3285 } 3286 } 3287 } 3288 3289 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3290 3291 ctrlr_ch->qpair = nvme_qpair; 3292 3293 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3294 nvme_qpair->ctrlr->ref++; 3295 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3296 3297 return 0; 3298 } 3299 3300 static int 3301 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3302 { 3303 struct nvme_ctrlr *nvme_ctrlr = io_device; 3304 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3305 3306 TAILQ_INIT(&ctrlr_ch->pending_resets); 3307 3308 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3309 } 3310 3311 static void 3312 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3313 { 3314 struct nvme_io_path *io_path, *next; 3315 3316 assert(nvme_qpair->group != NULL); 3317 3318 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3319 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3320 nvme_io_path_free(io_path); 3321 } 3322 3323 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3324 3325 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3326 3327 nvme_ctrlr_release(nvme_qpair->ctrlr); 3328 3329 free(nvme_qpair); 3330 } 3331 3332 static void 3333 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3334 { 3335 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3336 struct nvme_qpair *nvme_qpair; 3337 3338 nvme_qpair = ctrlr_ch->qpair; 3339 assert(nvme_qpair != NULL); 3340 3341 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3342 3343 if (nvme_qpair->qpair != NULL) { 3344 if (ctrlr_ch->reset_iter == NULL) { 3345 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3346 } else { 3347 /* Skip current ctrlr_channel in a full reset sequence because 3348 * it is being deleted now. The qpair is already being disconnected. 3349 * We do not have to restart disconnecting it. 3350 */ 3351 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3352 } 3353 3354 /* We cannot release a reference to the poll group now. 3355 * The qpair may be disconnected asynchronously later. 3356 * We need to poll it until it is actually disconnected. 3357 * Just detach the qpair from the deleting ctrlr_channel. 
3358 */ 3359 nvme_qpair->ctrlr_ch = NULL; 3360 } else { 3361 assert(ctrlr_ch->reset_iter == NULL); 3362 3363 nvme_qpair_delete(nvme_qpair); 3364 } 3365 } 3366 3367 static inline struct spdk_io_channel * 3368 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3369 { 3370 if (spdk_unlikely(!group->accel_channel)) { 3371 group->accel_channel = spdk_accel_get_io_channel(); 3372 if (!group->accel_channel) { 3373 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3374 group); 3375 return NULL; 3376 } 3377 } 3378 3379 return group->accel_channel; 3380 } 3381 3382 static void 3383 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3384 uint32_t iov_cnt, uint32_t seed, 3385 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3386 { 3387 struct spdk_io_channel *accel_ch; 3388 struct nvme_poll_group *group = ctx; 3389 int rc; 3390 3391 assert(cb_fn != NULL); 3392 3393 accel_ch = bdev_nvme_get_accel_channel(group); 3394 if (spdk_unlikely(accel_ch == NULL)) { 3395 cb_fn(cb_arg, -ENOMEM); 3396 return; 3397 } 3398 3399 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3400 if (rc) { 3401 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3402 if (rc == -ENOMEM || rc == -EINVAL) { 3403 cb_fn(cb_arg, rc); 3404 } 3405 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3406 } 3407 } 3408 3409 static void 3410 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3411 { 3412 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3413 } 3414 3415 static void 3416 bdev_nvme_abort_sequence(void *seq) 3417 { 3418 spdk_accel_sequence_abort(seq); 3419 } 3420 3421 static void 3422 bdev_nvme_reverse_sequence(void *seq) 3423 { 3424 spdk_accel_sequence_reverse(seq); 3425 } 3426 3427 static int 3428 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3429 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3430 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3431 { 3432 struct spdk_io_channel *ch; 3433 struct nvme_poll_group *group = ctx; 3434 3435 ch = bdev_nvme_get_accel_channel(group); 3436 if (spdk_unlikely(ch == NULL)) { 3437 return -ENOMEM; 3438 } 3439 3440 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3441 domain, domain_ctx, seed, cb_fn, cb_arg); 3442 } 3443 3444 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3445 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3446 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3447 .append_crc32c = bdev_nvme_append_crc32c, 3448 .finish_sequence = bdev_nvme_finish_sequence, 3449 .reverse_sequence = bdev_nvme_reverse_sequence, 3450 .abort_sequence = bdev_nvme_abort_sequence, 3451 }; 3452 3453 static int 3454 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3455 { 3456 struct nvme_poll_group *group = ctx_buf; 3457 3458 TAILQ_INIT(&group->qpair_list); 3459 3460 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3461 if (group->group == NULL) { 3462 return -1; 3463 } 3464 3465 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3466 3467 if (group->poller == NULL) { 3468 spdk_nvme_poll_group_destroy(group->group); 3469 return -1; 3470 } 3471 3472 return 0; 3473 } 3474 3475 static void 3476 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3477 { 3478 struct 
nvme_poll_group *group = ctx_buf; 3479 3480 assert(TAILQ_EMPTY(&group->qpair_list)); 3481 3482 if (group->accel_channel) { 3483 spdk_put_io_channel(group->accel_channel); 3484 } 3485 3486 spdk_poller_unregister(&group->poller); 3487 if (spdk_nvme_poll_group_destroy(group->group)) { 3488 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3489 assert(false); 3490 } 3491 } 3492 3493 static struct spdk_io_channel * 3494 bdev_nvme_get_io_channel(void *ctx) 3495 { 3496 struct nvme_bdev *nvme_bdev = ctx; 3497 3498 return spdk_get_io_channel(nvme_bdev); 3499 } 3500 3501 static void * 3502 bdev_nvme_get_module_ctx(void *ctx) 3503 { 3504 struct nvme_bdev *nvme_bdev = ctx; 3505 struct nvme_ns *nvme_ns; 3506 3507 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3508 return NULL; 3509 } 3510 3511 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3512 if (!nvme_ns) { 3513 return NULL; 3514 } 3515 3516 return nvme_ns->ns; 3517 } 3518 3519 static const char * 3520 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3521 { 3522 switch (ana_state) { 3523 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3524 return "optimized"; 3525 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3526 return "non_optimized"; 3527 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3528 return "inaccessible"; 3529 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3530 return "persistent_loss"; 3531 case SPDK_NVME_ANA_CHANGE_STATE: 3532 return "change"; 3533 default: 3534 return NULL; 3535 } 3536 } 3537 3538 static int 3539 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3540 { 3541 struct spdk_memory_domain **_domains = NULL; 3542 struct nvme_bdev *nbdev = ctx; 3543 struct nvme_ns *nvme_ns; 3544 int i = 0, _array_size = array_size; 3545 int rc = 0; 3546 3547 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3548 if (domains && array_size >= i) { 3549 _domains = &domains[i]; 3550 } else { 3551 _domains = NULL; 3552 } 3553 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3554 if (rc > 0) { 3555 i += rc; 3556 if (_array_size >= rc) { 3557 _array_size -= rc; 3558 } else { 3559 _array_size = 0; 3560 } 3561 } else if (rc < 0) { 3562 return rc; 3563 } 3564 } 3565 3566 return i; 3567 } 3568 3569 static const char * 3570 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3571 { 3572 if (nvme_ctrlr->destruct) { 3573 return "deleting"; 3574 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3575 return "failed"; 3576 } else if (nvme_ctrlr->resetting) { 3577 return "resetting"; 3578 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3579 return "reconnect_is_delayed"; 3580 } else if (nvme_ctrlr->disabled) { 3581 return "disabled"; 3582 } else { 3583 return "enabled"; 3584 } 3585 } 3586 3587 void 3588 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3589 { 3590 struct spdk_nvme_transport_id *trid; 3591 const struct spdk_nvme_ctrlr_opts *opts; 3592 const struct spdk_nvme_ctrlr_data *cdata; 3593 struct nvme_path_id *path_id; 3594 3595 spdk_json_write_object_begin(w); 3596 3597 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3598 3599 #ifdef SPDK_CONFIG_NVME_CUSE 3600 size_t cuse_name_size = 128; 3601 char cuse_name[cuse_name_size]; 3602 3603 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3604 if (rc == 0) { 3605 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3606 } 3607 #endif 3608 trid = &nvme_ctrlr->active_path_id->trid; 3609 
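/* Dump the currently active transport ID first, then any alternate (failover)
 * trids registered for this controller.
 */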
spdk_json_write_named_object_begin(w, "trid"); 3610 nvme_bdev_dump_trid_json(trid, w); 3611 spdk_json_write_object_end(w); 3612 3613 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3614 if (path_id != NULL) { 3615 spdk_json_write_named_array_begin(w, "alternate_trids"); 3616 do { 3617 trid = &path_id->trid; 3618 spdk_json_write_object_begin(w); 3619 nvme_bdev_dump_trid_json(trid, w); 3620 spdk_json_write_object_end(w); 3621 3622 path_id = TAILQ_NEXT(path_id, link); 3623 } while (path_id != NULL); 3624 spdk_json_write_array_end(w); 3625 } 3626 3627 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3628 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3629 3630 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3631 spdk_json_write_named_object_begin(w, "host"); 3632 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3633 spdk_json_write_named_string(w, "addr", opts->src_addr); 3634 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3635 spdk_json_write_object_end(w); 3636 3637 spdk_json_write_object_end(w); 3638 } 3639 3640 static void 3641 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3642 struct nvme_ns *nvme_ns) 3643 { 3644 struct spdk_nvme_ns *ns; 3645 struct spdk_nvme_ctrlr *ctrlr; 3646 const struct spdk_nvme_ctrlr_data *cdata; 3647 const struct spdk_nvme_transport_id *trid; 3648 union spdk_nvme_vs_register vs; 3649 const struct spdk_nvme_ns_data *nsdata; 3650 char buf[128]; 3651 3652 ns = nvme_ns->ns; 3653 if (ns == NULL) { 3654 return; 3655 } 3656 3657 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3658 3659 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3660 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3661 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3662 3663 spdk_json_write_object_begin(w); 3664 3665 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3666 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3667 } 3668 3669 spdk_json_write_named_object_begin(w, "trid"); 3670 3671 nvme_bdev_dump_trid_json(trid, w); 3672 3673 spdk_json_write_object_end(w); 3674 3675 #ifdef SPDK_CONFIG_NVME_CUSE 3676 size_t cuse_name_size = 128; 3677 char cuse_name[cuse_name_size]; 3678 3679 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3680 cuse_name, &cuse_name_size); 3681 if (rc == 0) { 3682 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3683 } 3684 #endif 3685 3686 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3687 3688 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3689 3690 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3691 3692 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3693 spdk_str_trim(buf); 3694 spdk_json_write_named_string(w, "model_number", buf); 3695 3696 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3697 spdk_str_trim(buf); 3698 spdk_json_write_named_string(w, "serial_number", buf); 3699 3700 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3701 spdk_str_trim(buf); 3702 spdk_json_write_named_string(w, "firmware_revision", buf); 3703 3704 if (cdata->subnqn[0] != '\0') { 3705 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3706 } 3707 3708 spdk_json_write_named_object_begin(w, "oacs"); 3709 3710 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3711 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3712 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3713 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3714 3715 spdk_json_write_object_end(w); 3716 3717 
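/* CMIC: whether the NVM subsystem may contain multiple controllers and whether
 * it supports ANA reporting.
 */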
spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3718 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3719 3720 spdk_json_write_object_end(w); 3721 3722 spdk_json_write_named_object_begin(w, "vs"); 3723 3724 spdk_json_write_name(w, "nvme_version"); 3725 if (vs.bits.ter) { 3726 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3727 } else { 3728 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3729 } 3730 3731 spdk_json_write_object_end(w); 3732 3733 nsdata = spdk_nvme_ns_get_data(ns); 3734 3735 spdk_json_write_named_object_begin(w, "ns_data"); 3736 3737 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3738 3739 if (cdata->cmic.ana_reporting) { 3740 spdk_json_write_named_string(w, "ana_state", 3741 _nvme_ana_state_str(nvme_ns->ana_state)); 3742 } 3743 3744 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3745 3746 spdk_json_write_object_end(w); 3747 3748 if (cdata->oacs.security) { 3749 spdk_json_write_named_object_begin(w, "security"); 3750 3751 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3752 3753 spdk_json_write_object_end(w); 3754 } 3755 3756 spdk_json_write_object_end(w); 3757 } 3758 3759 static const char * 3760 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3761 { 3762 switch (nbdev->mp_policy) { 3763 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3764 return "active_passive"; 3765 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3766 return "active_active"; 3767 default: 3768 assert(false); 3769 return "invalid"; 3770 } 3771 } 3772 3773 static const char * 3774 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 3775 { 3776 switch (nbdev->mp_selector) { 3777 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 3778 return "round_robin"; 3779 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 3780 return "queue_depth"; 3781 default: 3782 assert(false); 3783 return "invalid"; 3784 } 3785 } 3786 3787 static int 3788 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3789 { 3790 struct nvme_bdev *nvme_bdev = ctx; 3791 struct nvme_ns *nvme_ns; 3792 3793 pthread_mutex_lock(&nvme_bdev->mutex); 3794 spdk_json_write_named_array_begin(w, "nvme"); 3795 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3796 nvme_namespace_info_json(w, nvme_ns); 3797 } 3798 spdk_json_write_array_end(w); 3799 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3800 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 3801 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 3802 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 3803 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 3804 } 3805 } 3806 pthread_mutex_unlock(&nvme_bdev->mutex); 3807 3808 return 0; 3809 } 3810 3811 static void 3812 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3813 { 3814 /* No config per bdev needed */ 3815 } 3816 3817 static uint64_t 3818 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3819 { 3820 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3821 struct nvme_io_path *io_path; 3822 struct nvme_poll_group *group; 3823 uint64_t spin_time = 0; 3824 3825 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3826 group = io_path->qpair->group; 3827 3828 if (!group || !group->collect_spin_stat) { 3829 continue; 3830 } 3831 3832 if (group->end_ticks != 0) { 3833 group->spin_ticks += (group->end_ticks - group->start_ticks); 
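/* The completed spin interval has been folded into spin_ticks; clear end_ticks
 * so it is not counted again on the next call.
 */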
3834 group->end_ticks = 0; 3835 } 3836 3837 spin_time += group->spin_ticks; 3838 group->start_ticks = 0; 3839 group->spin_ticks = 0; 3840 } 3841 3842 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3843 } 3844 3845 static void 3846 bdev_nvme_reset_device_stat(void *ctx) 3847 { 3848 struct nvme_bdev *nbdev = ctx; 3849 3850 if (nbdev->err_stat != NULL) { 3851 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3852 } 3853 } 3854 3855 /* The resulting JSON string should be a lowercase, underscore-delimited string. */ 3856 static void 3857 bdev_nvme_format_nvme_status(char *dst, const char *src) 3858 { 3859 char tmp[256]; 3860 3861 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3862 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3863 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3864 spdk_strlwr(dst); 3865 } 3866 3867 static void 3868 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3869 { 3870 struct nvme_bdev *nbdev = ctx; 3871 struct spdk_nvme_status status = {}; 3872 uint16_t sct, sc; 3873 char status_json[256]; 3874 const char *status_str; 3875 3876 if (nbdev->err_stat == NULL) { 3877 return; 3878 } 3879 3880 spdk_json_write_named_object_begin(w, "nvme_error"); 3881 3882 spdk_json_write_named_object_begin(w, "status_type"); 3883 for (sct = 0; sct < 8; sct++) { 3884 if (nbdev->err_stat->status_type[sct] == 0) { 3885 continue; 3886 } 3887 status.sct = sct; 3888 3889 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3890 assert(status_str != NULL); 3891 bdev_nvme_format_nvme_status(status_json, status_str); 3892 3893 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3894 } 3895 spdk_json_write_object_end(w); 3896 3897 spdk_json_write_named_object_begin(w, "status_code"); 3898 for (sct = 0; sct < 4; sct++) { 3899 status.sct = sct; 3900 for (sc = 0; sc < 256; sc++) { 3901 if (nbdev->err_stat->status[sct][sc] == 0) { 3902 continue; 3903 } 3904 status.sc = sc; 3905 3906 status_str = spdk_nvme_cpl_get_status_string(&status); 3907 assert(status_str != NULL); 3908 bdev_nvme_format_nvme_status(status_json, status_str); 3909 3910 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3911 } 3912 } 3913 spdk_json_write_object_end(w); 3914 3915 spdk_json_write_object_end(w); 3916 } 3917 3918 static bool 3919 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3920 { 3921 struct nvme_bdev *nbdev = ctx; 3922 struct spdk_nvme_ctrlr *ctrlr; 3923 3924 if (!g_opts.allow_accel_sequence) { 3925 return false; 3926 } 3927 3928 switch (type) { 3929 case SPDK_BDEV_IO_TYPE_WRITE: 3930 case SPDK_BDEV_IO_TYPE_READ: 3931 break; 3932 default: 3933 return false; 3934 } 3935 3936 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3937 assert(ctrlr != NULL); 3938 3939 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3940 } 3941 3942 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3943 .destruct = bdev_nvme_destruct, 3944 .submit_request = bdev_nvme_submit_request, 3945 .io_type_supported = bdev_nvme_io_type_supported, 3946 .get_io_channel = bdev_nvme_get_io_channel, 3947 .dump_info_json = bdev_nvme_dump_info_json, 3948 .write_config_json = bdev_nvme_write_config_json, 3949 .get_spin_time = bdev_nvme_get_spin_time, 3950 .get_module_ctx = bdev_nvme_get_module_ctx, 3951 .get_memory_domains = bdev_nvme_get_memory_domains, 3952 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3953 .reset_device_stat = bdev_nvme_reset_device_stat, 3954 .dump_device_stat_json =
bdev_nvme_dump_device_stat_json, 3955 }; 3956 3957 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3958 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3959 3960 static int 3961 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3962 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3963 { 3964 struct spdk_nvme_ana_group_descriptor *copied_desc; 3965 uint8_t *orig_desc; 3966 uint32_t i, desc_size, copy_len; 3967 int rc = 0; 3968 3969 if (nvme_ctrlr->ana_log_page == NULL) { 3970 return -EINVAL; 3971 } 3972 3973 copied_desc = nvme_ctrlr->copied_ana_desc; 3974 3975 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3976 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3977 3978 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3979 memcpy(copied_desc, orig_desc, copy_len); 3980 3981 rc = cb_fn(copied_desc, cb_arg); 3982 if (rc != 0) { 3983 break; 3984 } 3985 3986 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3987 copied_desc->num_of_nsid * sizeof(uint32_t); 3988 orig_desc += desc_size; 3989 copy_len -= desc_size; 3990 } 3991 3992 return rc; 3993 } 3994 3995 static int 3996 nvme_ns_ana_transition_timedout(void *ctx) 3997 { 3998 struct nvme_ns *nvme_ns = ctx; 3999 4000 spdk_poller_unregister(&nvme_ns->anatt_timer); 4001 nvme_ns->ana_transition_timedout = true; 4002 4003 return SPDK_POLLER_BUSY; 4004 } 4005 4006 static void 4007 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4008 const struct spdk_nvme_ana_group_descriptor *desc) 4009 { 4010 const struct spdk_nvme_ctrlr_data *cdata; 4011 4012 nvme_ns->ana_group_id = desc->ana_group_id; 4013 nvme_ns->ana_state = desc->ana_state; 4014 nvme_ns->ana_state_updating = false; 4015 4016 switch (nvme_ns->ana_state) { 4017 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4018 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4019 nvme_ns->ana_transition_timedout = false; 4020 spdk_poller_unregister(&nvme_ns->anatt_timer); 4021 break; 4022 4023 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4024 case SPDK_NVME_ANA_CHANGE_STATE: 4025 if (nvme_ns->anatt_timer != NULL) { 4026 break; 4027 } 4028 4029 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4030 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4031 nvme_ns, 4032 cdata->anatt * SPDK_SEC_TO_USEC); 4033 break; 4034 default: 4035 break; 4036 } 4037 } 4038 4039 static int 4040 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4041 { 4042 struct nvme_ns *nvme_ns = cb_arg; 4043 uint32_t i; 4044 4045 assert(nvme_ns->ns != NULL); 4046 4047 for (i = 0; i < desc->num_of_nsid; i++) { 4048 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4049 continue; 4050 } 4051 4052 _nvme_ns_set_ana_state(nvme_ns, desc); 4053 return 1; 4054 } 4055 4056 return 0; 4057 } 4058 4059 static int 4060 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4061 { 4062 int rc = 0; 4063 struct spdk_uuid new_uuid, namespace_uuid; 4064 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4065 /* This namespace UUID was generated using uuid_generate() method. 
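 * It serves as the fixed namespace UUID passed to spdk_uuid_generate_sha1() below, so the
 * same controller serial number and NSID pair always produces the same bdev UUID.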
*/ 4066 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4067 int size; 4068 4069 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4070 4071 spdk_uuid_set_null(&new_uuid); 4072 spdk_uuid_set_null(&namespace_uuid); 4073 4074 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4075 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4076 return -EINVAL; 4077 } 4078 4079 spdk_uuid_parse(&namespace_uuid, namespace_str); 4080 4081 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4082 if (rc == 0) { 4083 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4084 } 4085 4086 return rc; 4087 } 4088 4089 static int 4090 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4091 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4092 uint32_t prchk_flags, void *ctx) 4093 { 4094 const struct spdk_uuid *uuid; 4095 const uint8_t *nguid; 4096 const struct spdk_nvme_ctrlr_data *cdata; 4097 const struct spdk_nvme_ns_data *nsdata; 4098 const struct spdk_nvme_ctrlr_opts *opts; 4099 enum spdk_nvme_csi csi; 4100 uint32_t atomic_bs, phys_bs, bs; 4101 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4102 int rc; 4103 4104 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4105 csi = spdk_nvme_ns_get_csi(ns); 4106 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4107 4108 switch (csi) { 4109 case SPDK_NVME_CSI_NVM: 4110 disk->product_name = "NVMe disk"; 4111 break; 4112 case SPDK_NVME_CSI_ZNS: 4113 disk->product_name = "NVMe ZNS disk"; 4114 disk->zoned = true; 4115 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4116 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4117 spdk_nvme_ns_get_extended_sector_size(ns); 4118 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4119 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4120 break; 4121 default: 4122 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4123 return -ENOTSUP; 4124 } 4125 4126 nguid = spdk_nvme_ns_get_nguid(ns); 4127 if (!nguid) { 4128 uuid = spdk_nvme_ns_get_uuid(ns); 4129 if (uuid) { 4130 disk->uuid = *uuid; 4131 } else if (g_opts.generate_uuids) { 4132 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4133 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4134 if (rc < 0) { 4135 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4136 return rc; 4137 } 4138 } 4139 } else { 4140 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4141 } 4142 4143 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4144 if (!disk->name) { 4145 return -ENOMEM; 4146 } 4147 4148 disk->write_cache = 0; 4149 if (cdata->vwc.present) { 4150 /* Enable if the Volatile Write Cache exists */ 4151 disk->write_cache = 1; 4152 } 4153 if (cdata->oncs.write_zeroes) { 4154 disk->max_write_zeroes = UINT16_MAX + 1; 4155 } 4156 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4157 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4158 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4159 disk->ctratt.raw = cdata->ctratt.raw; 4160 /* NVMe driver will split one request into multiple requests 4161 * based on MDTS and stripe boundary, the bdev layer will use 4162 * max_segment_size and max_num_segments to split one big IO 4163 * into multiple requests, then small request can't run out 4164 * of NVMe internal requests data structure. 
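 * In other words, capping max_num_segments (at half of io_queue_requests below) keeps the
 * splitting in the bdev layer, so a single large I/O cannot exhaust the NVMe driver's
 * internal request objects.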
4165 */ 4166 if (opts && opts->io_queue_requests) { 4167 disk->max_num_segments = opts->io_queue_requests / 2; 4168 } 4169 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4170 /* The nvme driver will try to split I/O that have too many 4171 * SGEs, but it doesn't work if that last SGE doesn't end on 4172 * an aggregate total that is block aligned. The bdev layer has 4173 * a more robust splitting framework, so use that instead for 4174 * this case. (See issue #3269.) 4175 */ 4176 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4177 4178 if (disk->max_num_segments == 0) { 4179 disk->max_num_segments = max_sges; 4180 } else { 4181 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4182 } 4183 } 4184 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4185 4186 nsdata = spdk_nvme_ns_get_data(ns); 4187 bs = spdk_nvme_ns_get_sector_size(ns); 4188 atomic_bs = bs; 4189 phys_bs = bs; 4190 if (nsdata->nabo == 0) { 4191 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4192 atomic_bs = bs * (1 + nsdata->nawupf); 4193 } else { 4194 atomic_bs = bs * (1 + cdata->awupf); 4195 } 4196 } 4197 if (nsdata->nsfeat.optperf) { 4198 phys_bs = bs * (1 + nsdata->npwg); 4199 } 4200 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4201 4202 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4203 if (disk->md_len != 0) { 4204 disk->md_interleave = nsdata->flbas.extended; 4205 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4206 if (disk->dif_type != SPDK_DIF_DISABLE) { 4207 disk->dif_is_head_of_md = nsdata->dps.md_start; 4208 disk->dif_check_flags = prchk_flags; 4209 } 4210 } 4211 4212 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4213 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4214 disk->acwu = 0; 4215 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4216 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4217 } else { 4218 disk->acwu = cdata->acwu + 1; /* 0-based */ 4219 } 4220 4221 if (cdata->oncs.copy) { 4222 /* For now bdev interface allows only single segment copy */ 4223 disk->max_copy = nsdata->mssrl; 4224 } 4225 4226 disk->ctxt = ctx; 4227 disk->fn_table = &nvmelib_fn_table; 4228 disk->module = &nvme_if; 4229 4230 return 0; 4231 } 4232 4233 static struct nvme_bdev * 4234 nvme_bdev_alloc(void) 4235 { 4236 struct nvme_bdev *bdev; 4237 int rc; 4238 4239 bdev = calloc(1, sizeof(*bdev)); 4240 if (!bdev) { 4241 SPDK_ERRLOG("bdev calloc() failed\n"); 4242 return NULL; 4243 } 4244 4245 if (g_opts.nvme_error_stat) { 4246 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4247 if (!bdev->err_stat) { 4248 SPDK_ERRLOG("err_stat calloc() failed\n"); 4249 free(bdev); 4250 return NULL; 4251 } 4252 } 4253 4254 rc = pthread_mutex_init(&bdev->mutex, NULL); 4255 if (rc != 0) { 4256 free(bdev->err_stat); 4257 free(bdev); 4258 return NULL; 4259 } 4260 4261 bdev->ref = 1; 4262 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4263 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4264 bdev->rr_min_io = UINT32_MAX; 4265 TAILQ_INIT(&bdev->nvme_ns_list); 4266 4267 return bdev; 4268 } 4269 4270 static int 4271 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4272 { 4273 struct nvme_bdev *bdev; 4274 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4275 int rc; 4276 4277 bdev = nvme_bdev_alloc(); 4278 if (bdev == NULL) { 4279 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4280 return -ENOMEM; 4281 } 4282 4283 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4284 4285 rc = 
nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4286 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4287 if (rc != 0) { 4288 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4289 nvme_bdev_free(bdev); 4290 return rc; 4291 } 4292 4293 spdk_io_device_register(bdev, 4294 bdev_nvme_create_bdev_channel_cb, 4295 bdev_nvme_destroy_bdev_channel_cb, 4296 sizeof(struct nvme_bdev_channel), 4297 bdev->disk.name); 4298 4299 nvme_ns->bdev = bdev; 4300 bdev->nsid = nvme_ns->id; 4301 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4302 4303 bdev->nbdev_ctrlr = nbdev_ctrlr; 4304 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4305 4306 rc = spdk_bdev_register(&bdev->disk); 4307 if (rc != 0) { 4308 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4309 spdk_io_device_unregister(bdev, NULL); 4310 nvme_ns->bdev = NULL; 4311 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4312 nvme_bdev_free(bdev); 4313 return rc; 4314 } 4315 4316 return 0; 4317 } 4318 4319 static bool 4320 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4321 { 4322 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4323 const struct spdk_uuid *uuid1, *uuid2; 4324 4325 nsdata1 = spdk_nvme_ns_get_data(ns1); 4326 nsdata2 = spdk_nvme_ns_get_data(ns2); 4327 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4328 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4329 4330 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4331 nsdata1->eui64 == nsdata2->eui64 && 4332 ((uuid1 == NULL && uuid2 == NULL) || 4333 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4334 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4335 } 4336 4337 static bool 4338 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4339 struct spdk_nvme_ctrlr_opts *opts) 4340 { 4341 struct nvme_probe_skip_entry *entry; 4342 4343 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4344 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4345 return false; 4346 } 4347 } 4348 4349 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4350 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4351 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4352 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4353 opts->disable_read_ana_log_page = true; 4354 4355 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4356 4357 return true; 4358 } 4359 4360 static void 4361 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4362 { 4363 struct nvme_ctrlr *nvme_ctrlr = ctx; 4364 4365 if (spdk_nvme_cpl_is_error(cpl)) { 4366 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4367 cpl->status.sct); 4368 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4369 } else if (cpl->cdw0 & 0x1) { 4370 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4371 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4372 } 4373 } 4374 4375 static void 4376 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4377 struct spdk_nvme_qpair *qpair, uint16_t cid) 4378 { 4379 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4380 union spdk_nvme_csts_register csts; 4381 int rc; 4382 4383 assert(nvme_ctrlr->ctrlr == ctrlr); 4384 4385 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4386 4387 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4388 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) 
Otherwise we 4389 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4390 * completion recursively. 4391 */ 4392 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4393 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4394 if (csts.bits.cfs) { 4395 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4396 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4397 return; 4398 } 4399 } 4400 4401 switch (g_opts.action_on_timeout) { 4402 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4403 if (qpair) { 4404 /* Don't send abort to ctrlr when ctrlr is not available. */ 4405 pthread_mutex_lock(&nvme_ctrlr->mutex); 4406 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4407 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4408 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4409 return; 4410 } 4411 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4412 4413 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4414 nvme_abort_cpl, nvme_ctrlr); 4415 if (rc == 0) { 4416 return; 4417 } 4418 4419 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4420 } 4421 4422 /* FALLTHROUGH */ 4423 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4424 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4425 break; 4426 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4427 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4428 break; 4429 default: 4430 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4431 break; 4432 } 4433 } 4434 4435 static struct nvme_ns * 4436 nvme_ns_alloc(void) 4437 { 4438 struct nvme_ns *nvme_ns; 4439 4440 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4441 if (nvme_ns == NULL) { 4442 return NULL; 4443 } 4444 4445 if (g_opts.io_path_stat) { 4446 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4447 if (nvme_ns->stat == NULL) { 4448 free(nvme_ns); 4449 return NULL; 4450 } 4451 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4452 } 4453 4454 return nvme_ns; 4455 } 4456 4457 static void 4458 nvme_ns_free(struct nvme_ns *nvme_ns) 4459 { 4460 free(nvme_ns->stat); 4461 free(nvme_ns); 4462 } 4463 4464 static void 4465 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4466 { 4467 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4468 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4469 4470 if (rc == 0) { 4471 nvme_ns->probe_ctx = NULL; 4472 pthread_mutex_lock(&nvme_ctrlr->mutex); 4473 nvme_ctrlr->ref++; 4474 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4475 } else { 4476 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4477 nvme_ns_free(nvme_ns); 4478 } 4479 4480 if (ctx) { 4481 ctx->populates_in_progress--; 4482 if (ctx->populates_in_progress == 0) { 4483 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4484 } 4485 } 4486 } 4487 4488 static void 4489 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4490 { 4491 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4492 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4493 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4494 int rc; 4495 4496 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4497 if (rc != 0) { 4498 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4499 } 4500 4501 spdk_for_each_channel_continue(i, rc); 4502 } 4503 4504 static void 4505 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4506 { 4507 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4508 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4509 struct 
nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4510 struct nvme_io_path *io_path; 4511 4512 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4513 if (io_path != NULL) { 4514 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4515 } 4516 4517 spdk_for_each_channel_continue(i, 0); 4518 } 4519 4520 static void 4521 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4522 { 4523 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4524 4525 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4526 } 4527 4528 static void 4529 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4530 { 4531 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4532 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4533 4534 if (status == 0) { 4535 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4536 } else { 4537 /* Delete the added io_paths and fail populating the namespace. */ 4538 spdk_for_each_channel(bdev, 4539 bdev_nvme_delete_io_path, 4540 nvme_ns, 4541 bdev_nvme_add_io_path_failed); 4542 } 4543 } 4544 4545 static int 4546 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4547 { 4548 struct nvme_ns *tmp_ns; 4549 const struct spdk_nvme_ns_data *nsdata; 4550 4551 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4552 if (!nsdata->nmic.can_share) { 4553 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4554 return -EINVAL; 4555 } 4556 4557 pthread_mutex_lock(&bdev->mutex); 4558 4559 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4560 assert(tmp_ns != NULL); 4561 4562 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4563 pthread_mutex_unlock(&bdev->mutex); 4564 SPDK_ERRLOG("Namespaces are not identical.\n"); 4565 return -EINVAL; 4566 } 4567 4568 bdev->ref++; 4569 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4570 nvme_ns->bdev = bdev; 4571 4572 pthread_mutex_unlock(&bdev->mutex); 4573 4574 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
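 * The bdev may already have I/O channels open on other threads, so iterate every existing
 * nvme_bdev_channel and create an io_path for this namespace on each of them.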
*/ 4575 spdk_for_each_channel(bdev, 4576 bdev_nvme_add_io_path, 4577 nvme_ns, 4578 bdev_nvme_add_io_path_done); 4579 4580 return 0; 4581 } 4582 4583 static void 4584 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4585 { 4586 struct spdk_nvme_ns *ns; 4587 struct nvme_bdev *bdev; 4588 int rc = 0; 4589 4590 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4591 if (!ns) { 4592 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4593 rc = -EINVAL; 4594 goto done; 4595 } 4596 4597 nvme_ns->ns = ns; 4598 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4599 4600 if (nvme_ctrlr->ana_log_page != NULL) { 4601 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4602 } 4603 4604 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4605 if (bdev == NULL) { 4606 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4607 } else { 4608 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4609 if (rc == 0) { 4610 return; 4611 } 4612 } 4613 done: 4614 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4615 } 4616 4617 static void 4618 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4619 { 4620 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4621 4622 assert(nvme_ctrlr != NULL); 4623 4624 pthread_mutex_lock(&nvme_ctrlr->mutex); 4625 4626 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4627 4628 if (nvme_ns->bdev != NULL) { 4629 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4630 return; 4631 } 4632 4633 nvme_ns_free(nvme_ns); 4634 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4635 4636 nvme_ctrlr_release(nvme_ctrlr); 4637 } 4638 4639 static void 4640 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4641 { 4642 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4643 4644 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4645 } 4646 4647 static void 4648 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4649 { 4650 struct nvme_bdev *bdev; 4651 4652 spdk_poller_unregister(&nvme_ns->anatt_timer); 4653 4654 bdev = nvme_ns->bdev; 4655 if (bdev != NULL) { 4656 pthread_mutex_lock(&bdev->mutex); 4657 4658 assert(bdev->ref > 0); 4659 bdev->ref--; 4660 if (bdev->ref == 0) { 4661 pthread_mutex_unlock(&bdev->mutex); 4662 4663 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4664 } else { 4665 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4666 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4667 * and clear nvme_ns->bdev here. 4668 */ 4669 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4670 nvme_ns->bdev = NULL; 4671 4672 pthread_mutex_unlock(&bdev->mutex); 4673 4674 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4675 * we call depopulate_namespace_done() to avoid use-after-free. 4676 */ 4677 spdk_for_each_channel(bdev, 4678 bdev_nvme_delete_io_path, 4679 nvme_ns, 4680 bdev_nvme_delete_io_path_done); 4681 return; 4682 } 4683 } 4684 4685 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4686 } 4687 4688 static void 4689 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4690 struct nvme_async_probe_ctx *ctx) 4691 { 4692 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4693 struct nvme_ns *nvme_ns, *next; 4694 struct spdk_nvme_ns *ns; 4695 struct nvme_bdev *bdev; 4696 uint32_t nsid; 4697 int rc; 4698 uint64_t num_sectors; 4699 4700 if (ctx) { 4701 /* Initialize this count to 1 to handle the populate functions 4702 * calling nvme_ctrlr_populate_namespace_done() immediately. 
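 * The matching decrement after the loops below drops this initial reference; if the count
 * reaches zero there, nvme_ctrlr_populate_namespaces_done() is invoked from this context.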
4703 */ 4704 ctx->populates_in_progress = 1; 4705 } 4706 4707 /* First loop over our existing namespaces and see if they have been 4708 * removed. */ 4709 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4710 while (nvme_ns != NULL) { 4711 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4712 4713 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4714 /* NS is still there or added again. Its attributes may have changed. */ 4715 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4716 if (nvme_ns->ns != ns) { 4717 assert(nvme_ns->ns == NULL); 4718 nvme_ns->ns = ns; 4719 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4720 } 4721 4722 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4723 bdev = nvme_ns->bdev; 4724 assert(bdev != NULL); 4725 if (bdev->disk.blockcnt != num_sectors) { 4726 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4727 nvme_ns->id, 4728 bdev->disk.name, 4729 bdev->disk.blockcnt, 4730 num_sectors); 4731 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4732 if (rc != 0) { 4733 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4734 bdev->disk.name, rc); 4735 } 4736 } 4737 } else { 4738 /* Namespace was removed */ 4739 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4740 } 4741 4742 nvme_ns = next; 4743 } 4744 4745 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4746 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4747 while (nsid != 0) { 4748 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4749 4750 if (nvme_ns == NULL) { 4751 /* Found a new one */ 4752 nvme_ns = nvme_ns_alloc(); 4753 if (nvme_ns == NULL) { 4754 SPDK_ERRLOG("Failed to allocate namespace\n"); 4755 /* This just fails to attach the namespace. It may work on a future attempt. 4756 * Advance to the next active NSID before continuing so that this loop does not spin on the same namespace. */ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); continue; 4757 } 4758 4759 nvme_ns->id = nsid; 4760 nvme_ns->ctrlr = nvme_ctrlr; 4761 4762 nvme_ns->bdev = NULL; 4763 4764 if (ctx) { 4765 ctx->populates_in_progress++; 4766 } 4767 nvme_ns->probe_ctx = ctx; 4768 4769 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4770 4771 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4772 } 4773 4774 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4775 } 4776 4777 if (ctx) { 4778 /* Decrement this count now that the loop is over to account 4779 * for the one we started with. If the count is then 0, we 4780 * know any populate_namespace functions completed immediately, 4781 * so we'll kick the callback here.
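 * (That callback chain ends in populate_namespaces_cb(), which reports the names of the
 * created bdevs back to the original caller.)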
4782 */ 4783 ctx->populates_in_progress--; 4784 if (ctx->populates_in_progress == 0) { 4785 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4786 } 4787 } 4788 4789 } 4790 4791 static void 4792 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4793 { 4794 struct nvme_ns *nvme_ns, *tmp; 4795 4796 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4797 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4798 } 4799 } 4800 4801 static uint32_t 4802 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4803 { 4804 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4805 const struct spdk_nvme_ctrlr_data *cdata; 4806 uint32_t nsid, ns_count = 0; 4807 4808 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4809 4810 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4811 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4812 ns_count++; 4813 } 4814 4815 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4816 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4817 sizeof(uint32_t); 4818 } 4819 4820 static int 4821 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4822 void *cb_arg) 4823 { 4824 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4825 struct nvme_ns *nvme_ns; 4826 uint32_t i, nsid; 4827 4828 for (i = 0; i < desc->num_of_nsid; i++) { 4829 nsid = desc->nsid[i]; 4830 if (nsid == 0) { 4831 continue; 4832 } 4833 4834 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4835 4836 assert(nvme_ns != NULL); 4837 if (nvme_ns == NULL) { 4838 /* Target told us that an inactive namespace had an ANA change */ 4839 continue; 4840 } 4841 4842 _nvme_ns_set_ana_state(nvme_ns, desc); 4843 } 4844 4845 return 0; 4846 } 4847 4848 static void 4849 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4850 { 4851 struct nvme_ns *nvme_ns; 4852 4853 spdk_free(nvme_ctrlr->ana_log_page); 4854 nvme_ctrlr->ana_log_page = NULL; 4855 4856 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4857 nvme_ns != NULL; 4858 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4859 nvme_ns->ana_state_updating = false; 4860 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4861 } 4862 } 4863 4864 static void 4865 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4866 { 4867 struct nvme_ctrlr *nvme_ctrlr = ctx; 4868 4869 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4870 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4871 nvme_ctrlr); 4872 } else { 4873 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4874 } 4875 4876 pthread_mutex_lock(&nvme_ctrlr->mutex); 4877 4878 assert(nvme_ctrlr->ana_log_page_updating == true); 4879 nvme_ctrlr->ana_log_page_updating = false; 4880 4881 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4882 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4883 4884 nvme_ctrlr_unregister(nvme_ctrlr); 4885 } else { 4886 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4887 4888 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4889 } 4890 } 4891 4892 static int 4893 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4894 { 4895 uint32_t ana_log_page_size; 4896 int rc; 4897 4898 if (nvme_ctrlr->ana_log_page == NULL) { 4899 return -EINVAL; 4900 } 4901 4902 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4903 4904 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4905 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4906 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4907 
return -EINVAL; 4908 } 4909 4910 pthread_mutex_lock(&nvme_ctrlr->mutex); 4911 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4912 nvme_ctrlr->ana_log_page_updating) { 4913 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4914 return -EBUSY; 4915 } 4916 4917 nvme_ctrlr->ana_log_page_updating = true; 4918 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4919 4920 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4921 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4922 SPDK_NVME_GLOBAL_NS_TAG, 4923 nvme_ctrlr->ana_log_page, 4924 ana_log_page_size, 0, 4925 nvme_ctrlr_read_ana_log_page_done, 4926 nvme_ctrlr); 4927 if (rc != 0) { 4928 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4929 } 4930 4931 return rc; 4932 } 4933 4934 static void 4935 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4936 { 4937 } 4938 4939 struct bdev_nvme_set_preferred_path_ctx { 4940 struct spdk_bdev_desc *desc; 4941 struct nvme_ns *nvme_ns; 4942 bdev_nvme_set_preferred_path_cb cb_fn; 4943 void *cb_arg; 4944 }; 4945 4946 static void 4947 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4948 { 4949 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4950 4951 assert(ctx != NULL); 4952 assert(ctx->desc != NULL); 4953 assert(ctx->cb_fn != NULL); 4954 4955 spdk_bdev_close(ctx->desc); 4956 4957 ctx->cb_fn(ctx->cb_arg, status); 4958 4959 free(ctx); 4960 } 4961 4962 static void 4963 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4964 { 4965 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4966 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4967 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4968 struct nvme_io_path *io_path, *prev; 4969 4970 prev = NULL; 4971 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4972 if (io_path->nvme_ns == ctx->nvme_ns) { 4973 break; 4974 } 4975 prev = io_path; 4976 } 4977 4978 if (io_path != NULL) { 4979 if (prev != NULL) { 4980 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4981 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4982 } 4983 4984 /* We can set io_path to nbdev_ch->current_io_path directly here. 4985 * However, it needs to be conditional. To simplify the code, 4986 * just clear nbdev_ch->current_io_path and let find_io_path() 4987 * fill it. 4988 * 4989 * Automatic failback may be disabled. Hence even if the io_path is 4990 * already at the head, clear nbdev_ch->current_io_path. 4991 */ 4992 bdev_nvme_clear_current_io_path(nbdev_ch); 4993 } 4994 4995 spdk_for_each_channel_continue(i, 0); 4996 } 4997 4998 static struct nvme_ns * 4999 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5000 { 5001 struct nvme_ns *nvme_ns, *prev; 5002 const struct spdk_nvme_ctrlr_data *cdata; 5003 5004 prev = NULL; 5005 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5006 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5007 5008 if (cdata->cntlid == cntlid) { 5009 break; 5010 } 5011 prev = nvme_ns; 5012 } 5013 5014 if (nvme_ns != NULL && prev != NULL) { 5015 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5016 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5017 } 5018 5019 return nvme_ns; 5020 } 5021 5022 /* This function supports only multipath mode. There is only a single I/O path 5023 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5024 * head of the I/O path list for each NVMe bdev channel. 
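 * After the matched io_path is moved to the head and nbdev_ch->current_io_path is cleared,
 * find_io_path() starts its search from the new head, so that path becomes the preferred choice.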
5025 * 5026 * NVMe bdev channel may be acquired after completing this function. move the 5027 * matched namespace to the head of the namespace list for the NVMe bdev too. 5028 */ 5029 void 5030 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5031 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5032 { 5033 struct bdev_nvme_set_preferred_path_ctx *ctx; 5034 struct spdk_bdev *bdev; 5035 struct nvme_bdev *nbdev; 5036 int rc = 0; 5037 5038 assert(cb_fn != NULL); 5039 5040 ctx = calloc(1, sizeof(*ctx)); 5041 if (ctx == NULL) { 5042 SPDK_ERRLOG("Failed to alloc context.\n"); 5043 rc = -ENOMEM; 5044 goto err_alloc; 5045 } 5046 5047 ctx->cb_fn = cb_fn; 5048 ctx->cb_arg = cb_arg; 5049 5050 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5051 if (rc != 0) { 5052 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5053 goto err_open; 5054 } 5055 5056 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5057 5058 if (bdev->module != &nvme_if) { 5059 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5060 rc = -ENODEV; 5061 goto err_bdev; 5062 } 5063 5064 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5065 5066 pthread_mutex_lock(&nbdev->mutex); 5067 5068 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5069 if (ctx->nvme_ns == NULL) { 5070 pthread_mutex_unlock(&nbdev->mutex); 5071 5072 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5073 rc = -ENODEV; 5074 goto err_bdev; 5075 } 5076 5077 pthread_mutex_unlock(&nbdev->mutex); 5078 5079 spdk_for_each_channel(nbdev, 5080 _bdev_nvme_set_preferred_path, 5081 ctx, 5082 bdev_nvme_set_preferred_path_done); 5083 return; 5084 5085 err_bdev: 5086 spdk_bdev_close(ctx->desc); 5087 err_open: 5088 free(ctx); 5089 err_alloc: 5090 cb_fn(cb_arg, rc); 5091 } 5092 5093 struct bdev_nvme_set_multipath_policy_ctx { 5094 struct spdk_bdev_desc *desc; 5095 bdev_nvme_set_multipath_policy_cb cb_fn; 5096 void *cb_arg; 5097 }; 5098 5099 static void 5100 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5101 { 5102 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5103 5104 assert(ctx != NULL); 5105 assert(ctx->desc != NULL); 5106 assert(ctx->cb_fn != NULL); 5107 5108 spdk_bdev_close(ctx->desc); 5109 5110 ctx->cb_fn(ctx->cb_arg, status); 5111 5112 free(ctx); 5113 } 5114 5115 static void 5116 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5117 { 5118 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5119 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5120 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5121 5122 nbdev_ch->mp_policy = nbdev->mp_policy; 5123 nbdev_ch->mp_selector = nbdev->mp_selector; 5124 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5125 bdev_nvme_clear_current_io_path(nbdev_ch); 5126 5127 spdk_for_each_channel_continue(i, 0); 5128 } 5129 5130 void 5131 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5132 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5133 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5134 { 5135 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5136 struct spdk_bdev *bdev; 5137 struct nvme_bdev *nbdev; 5138 int rc; 5139 5140 assert(cb_fn != NULL); 5141 5142 switch (policy) { 5143 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5144 break; 5145 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5146 switch (selector) { 5147 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5148 if (rr_min_io 
== UINT32_MAX) { 5149 rr_min_io = 1; 5150 } else if (rr_min_io == 0) { 5151 rc = -EINVAL; 5152 goto exit; 5153 } 5154 break; 5155 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5156 break; 5157 default: 5158 rc = -EINVAL; 5159 goto exit; 5160 } 5161 break; 5162 default: 5163 rc = -EINVAL; 5164 goto exit; 5165 } 5166 5167 ctx = calloc(1, sizeof(*ctx)); 5168 if (ctx == NULL) { 5169 SPDK_ERRLOG("Failed to alloc context.\n"); 5170 rc = -ENOMEM; 5171 goto exit; 5172 } 5173 5174 ctx->cb_fn = cb_fn; 5175 ctx->cb_arg = cb_arg; 5176 5177 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5178 if (rc != 0) { 5179 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5180 rc = -ENODEV; 5181 goto err_open; 5182 } 5183 5184 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5185 if (bdev->module != &nvme_if) { 5186 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5187 rc = -ENODEV; 5188 goto err_module; 5189 } 5190 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5191 5192 pthread_mutex_lock(&nbdev->mutex); 5193 nbdev->mp_policy = policy; 5194 nbdev->mp_selector = selector; 5195 nbdev->rr_min_io = rr_min_io; 5196 pthread_mutex_unlock(&nbdev->mutex); 5197 5198 spdk_for_each_channel(nbdev, 5199 _bdev_nvme_set_multipath_policy, 5200 ctx, 5201 bdev_nvme_set_multipath_policy_done); 5202 return; 5203 5204 err_module: 5205 spdk_bdev_close(ctx->desc); 5206 err_open: 5207 free(ctx); 5208 exit: 5209 cb_fn(cb_arg, rc); 5210 } 5211 5212 static void 5213 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5214 { 5215 struct nvme_ctrlr *nvme_ctrlr = arg; 5216 union spdk_nvme_async_event_completion event; 5217 5218 if (spdk_nvme_cpl_is_error(cpl)) { 5219 SPDK_WARNLOG("AER request execute failed\n"); 5220 return; 5221 } 5222 5223 event.raw = cpl->cdw0; 5224 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5225 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5226 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5227 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5228 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5229 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5230 } 5231 } 5232 5233 static void 5234 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5235 { 5236 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5237 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5238 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5239 free(ctx); 5240 } 5241 5242 static void 5243 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5244 { 5245 if (ctx->cb_fn) { 5246 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5247 } 5248 5249 ctx->namespaces_populated = true; 5250 if (ctx->probe_done) { 5251 /* The probe was already completed, so we need to free the context 5252 * here. This can happen for cases like OCSSD, where we need to 5253 * send additional commands to the SSD after attach. 
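 * If the probe has not completed yet, bdev_nvme_async_poll() frees the context instead,
 * once it sees namespaces_populated set.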
5254 */ 5255 free_nvme_async_probe_ctx(ctx); 5256 } 5257 } 5258 5259 static void 5260 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5261 struct nvme_async_probe_ctx *ctx) 5262 { 5263 spdk_io_device_register(nvme_ctrlr, 5264 bdev_nvme_create_ctrlr_channel_cb, 5265 bdev_nvme_destroy_ctrlr_channel_cb, 5266 sizeof(struct nvme_ctrlr_channel), 5267 nvme_ctrlr->nbdev_ctrlr->name); 5268 5269 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5270 } 5271 5272 static void 5273 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5274 { 5275 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5276 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5277 5278 nvme_ctrlr->probe_ctx = NULL; 5279 5280 if (spdk_nvme_cpl_is_error(cpl)) { 5281 nvme_ctrlr_delete(nvme_ctrlr); 5282 5283 if (ctx != NULL) { 5284 ctx->reported_bdevs = 0; 5285 populate_namespaces_cb(ctx, -1); 5286 } 5287 return; 5288 } 5289 5290 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5291 } 5292 5293 static int 5294 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5295 struct nvme_async_probe_ctx *ctx) 5296 { 5297 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5298 const struct spdk_nvme_ctrlr_data *cdata; 5299 uint32_t ana_log_page_size; 5300 5301 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5302 5303 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5304 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5305 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5306 sizeof(uint32_t); 5307 5308 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5309 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5310 if (nvme_ctrlr->ana_log_page == NULL) { 5311 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5312 return -ENXIO; 5313 } 5314 5315 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5316 * Hence copy each descriptor to a temporary area when parsing it. 5317 * 5318 * Allocate a buffer whose size is as large as ANA log page buffer because 5319 * we do not know the size of a descriptor until actually reading it. 5320 */ 5321 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5322 if (nvme_ctrlr->copied_ana_desc == NULL) { 5323 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5324 return -ENOMEM; 5325 } 5326 5327 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5328 5329 nvme_ctrlr->probe_ctx = ctx; 5330 5331 /* Then, set the read size only to include the current active namespaces. */ 5332 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5333 5334 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5335 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5336 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5337 return -EINVAL; 5338 } 5339 5340 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5341 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5342 SPDK_NVME_GLOBAL_NS_TAG, 5343 nvme_ctrlr->ana_log_page, 5344 ana_log_page_size, 0, 5345 nvme_ctrlr_init_ana_log_page_done, 5346 nvme_ctrlr); 5347 } 5348 5349 /* hostnqn and subnqn were already verified before attaching a controller. 5350 * Hence check only the multipath capability and cntlid here. 
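 * Every controller grouped under one nvme_bdev_ctrlr must report the CMIC multi-controller
 * capability (cmic.multi_ctrlr) and have a CNTLID distinct from the others; otherwise the
 * new controller is rejected.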
*/ 5352 static bool 5353 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5354 { 5355 struct nvme_ctrlr *tmp; 5356 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5357 5358 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5359 5360 if (!cdata->cmic.multi_ctrlr) { 5361 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5362 return false; 5363 } 5364 5365 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5366 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5367 5368 if (!tmp_cdata->cmic.multi_ctrlr) { 5369 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5370 return false; 5371 } 5372 if (cdata->cntlid == tmp_cdata->cntlid) { 5373 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 5374 return false; 5375 } 5376 } 5377 5378 return true; 5379 } 5380 5381 static int 5382 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5383 { 5384 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5385 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5386 int rc = 0; 5387 5388 pthread_mutex_lock(&g_bdev_nvme_mutex); 5389 5390 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5391 if (nbdev_ctrlr != NULL) { 5392 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5393 rc = -EINVAL; 5394 goto exit; 5395 } 5396 } else { 5397 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5398 if (nbdev_ctrlr == NULL) { 5399 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5400 rc = -ENOMEM; 5401 goto exit; 5402 } 5403 nbdev_ctrlr->name = strdup(name); 5404 if (nbdev_ctrlr->name == NULL) { 5405 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5406 free(nbdev_ctrlr); rc = -ENOMEM; 5407 goto exit; 5408 } 5409 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5410 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5411 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5412 } 5413 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5414 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5415 exit: 5416 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5417 return rc; 5418 } 5419 5420 static int 5421 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5422 const char *name, 5423 const struct spdk_nvme_transport_id *trid, 5424 struct nvme_async_probe_ctx *ctx) 5425 { 5426 struct nvme_ctrlr *nvme_ctrlr; 5427 struct nvme_path_id *path_id; 5428 const struct spdk_nvme_ctrlr_data *cdata; 5429 int rc; 5430 5431 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5432 if (nvme_ctrlr == NULL) { 5433 SPDK_ERRLOG("Failed to allocate device struct\n"); 5434 return -ENOMEM; 5435 } 5436 5437 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5438 if (rc != 0) { 5439 free(nvme_ctrlr); 5440 return rc; 5441 } 5442 5443 TAILQ_INIT(&nvme_ctrlr->trids); 5444 RB_INIT(&nvme_ctrlr->namespaces); 5445 5446 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5447 if (ctx != NULL) { 5448 if (ctx->drv_opts.tls_psk != NULL) { 5449 nvme_ctrlr->psk = spdk_keyring_get_key( 5450 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5451 if (nvme_ctrlr->psk == NULL) { 5452 /* Could only happen if the key was removed in the meantime */ 5453 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5454 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5455 rc = -ENOKEY; 5456 goto err; 5457 } 5458 } 5459 5460 if (ctx->drv_opts.dhchap_key != NULL) { 5461 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5462 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5463 if (nvme_ctrlr->dhchap_key == NULL) { 5464 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5465
spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5466 rc = -ENOKEY; 5467 goto err; 5468 } 5469 } 5470 5471 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5472 nvme_ctrlr->dhchap_ctrlr_key = 5473 spdk_keyring_get_key( 5474 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5475 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5476 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5477 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5478 rc = -ENOKEY; 5479 goto err; 5480 } 5481 } 5482 } 5483 5484 path_id = calloc(1, sizeof(*path_id)); 5485 if (path_id == NULL) { 5486 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5487 rc = -ENOMEM; 5488 goto err; 5489 } 5490 5491 path_id->trid = *trid; 5492 if (ctx != NULL) { 5493 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5494 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5495 } 5496 nvme_ctrlr->active_path_id = path_id; 5497 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5498 5499 nvme_ctrlr->thread = spdk_get_thread(); 5500 nvme_ctrlr->ctrlr = ctrlr; 5501 nvme_ctrlr->ref = 1; 5502 5503 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5504 SPDK_ERRLOG("OCSSDs are not supported"); 5505 rc = -ENOTSUP; 5506 goto err; 5507 } 5508 5509 if (ctx != NULL) { 5510 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5511 } else { 5512 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5513 } 5514 5515 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5516 g_opts.nvme_adminq_poll_period_us); 5517 5518 if (g_opts.timeout_us > 0) { 5519 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5520 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5521 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5522 g_opts.timeout_us : g_opts.timeout_admin_us; 5523 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5524 adm_timeout_us, timeout_cb, nvme_ctrlr); 5525 } 5526 5527 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5528 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5529 5530 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5531 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5532 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5533 } 5534 5535 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5536 if (rc != 0) { 5537 goto err; 5538 } 5539 5540 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5541 5542 if (cdata->cmic.ana_reporting) { 5543 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5544 if (rc == 0) { 5545 return 0; 5546 } 5547 } else { 5548 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5549 return 0; 5550 } 5551 5552 err: 5553 nvme_ctrlr_delete(nvme_ctrlr); 5554 return rc; 5555 } 5556 5557 void 5558 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5559 { 5560 opts->prchk_flags = 0; 5561 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5562 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5563 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5564 } 5565 5566 static void 5567 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5568 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5569 { 5570 char *name; 5571 5572 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5573 if (!name) { 5574 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5575 return; 5576 } 5577 5578 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5579 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5580 } else { 5581 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5582 } 5583 5584 free(name); 5585 } 5586 5587 static void 5588 _nvme_ctrlr_destruct(void *ctx) 5589 { 5590 struct nvme_ctrlr *nvme_ctrlr = ctx; 5591 5592 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5593 nvme_ctrlr_release(nvme_ctrlr); 5594 } 5595 5596 static int 5597 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5598 { 5599 struct nvme_probe_skip_entry *entry; 5600 5601 /* The controller's destruction was already started */ 5602 if (nvme_ctrlr->destruct) { 5603 return -EALREADY; 5604 } 5605 5606 if (!hotplug && 5607 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5608 entry = calloc(1, sizeof(*entry)); 5609 if (!entry) { 5610 return -ENOMEM; 5611 } 5612 entry->trid = nvme_ctrlr->active_path_id->trid; 5613 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5614 } 5615 5616 nvme_ctrlr->destruct = true; 5617 return 0; 5618 } 5619 5620 static int 5621 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5622 { 5623 int rc; 5624 5625 pthread_mutex_lock(&nvme_ctrlr->mutex); 5626 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5627 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5628 5629 if (rc == 0) { 5630 _nvme_ctrlr_destruct(nvme_ctrlr); 5631 } else if (rc == -EALREADY) { 5632 rc = 0; 5633 } 5634 5635 return rc; 5636 } 5637 5638 static void 5639 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5640 { 5641 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5642 5643 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5644 } 5645 5646 static int 5647 bdev_nvme_hotplug_probe(void *arg) 5648 { 5649 if (g_hotplug_probe_ctx == NULL) { 5650 spdk_poller_unregister(&g_hotplug_probe_poller); 5651 return 
SPDK_POLLER_IDLE; 5652 } 5653 5654 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5655 g_hotplug_probe_ctx = NULL; 5656 spdk_poller_unregister(&g_hotplug_probe_poller); 5657 } 5658 5659 return SPDK_POLLER_BUSY; 5660 } 5661 5662 static int 5663 bdev_nvme_hotplug(void *arg) 5664 { 5665 struct spdk_nvme_transport_id trid_pcie; 5666 5667 if (g_hotplug_probe_ctx) { 5668 return SPDK_POLLER_BUSY; 5669 } 5670 5671 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5672 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5673 5674 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5675 hotplug_probe_cb, attach_cb, NULL); 5676 5677 if (g_hotplug_probe_ctx) { 5678 assert(g_hotplug_probe_poller == NULL); 5679 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5680 } 5681 5682 return SPDK_POLLER_BUSY; 5683 } 5684 5685 void 5686 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5687 { 5688 *opts = g_opts; 5689 } 5690 5691 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5692 uint32_t reconnect_delay_sec, 5693 uint32_t fast_io_fail_timeout_sec); 5694 5695 static int 5696 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5697 { 5698 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5699 /* Can't set timeout_admin_us without also setting timeout_us */ 5700 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5701 return -EINVAL; 5702 } 5703 5704 if (opts->bdev_retry_count < -1) { 5705 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5706 return -EINVAL; 5707 } 5708 5709 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5710 opts->reconnect_delay_sec, 5711 opts->fast_io_fail_timeout_sec)) { 5712 return -EINVAL; 5713 } 5714 5715 return 0; 5716 } 5717 5718 int 5719 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5720 { 5721 int ret; 5722 5723 ret = bdev_nvme_validate_opts(opts); 5724 if (ret) { 5725 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5726 return ret; 5727 } 5728 5729 if (g_bdev_nvme_init_thread != NULL) { 5730 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5731 return -EPERM; 5732 } 5733 } 5734 5735 if (opts->rdma_srq_size != 0 || 5736 opts->rdma_max_cq_size != 0 || 5737 opts->rdma_cm_event_timeout_ms != 0) { 5738 struct spdk_nvme_transport_opts drv_opts; 5739 5740 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5741 if (opts->rdma_srq_size != 0) { 5742 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5743 } 5744 if (opts->rdma_max_cq_size != 0) { 5745 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5746 } 5747 if (opts->rdma_cm_event_timeout_ms != 0) { 5748 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5749 } 5750 5751 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5752 if (ret) { 5753 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5754 return ret; 5755 } 5756 } 5757 5758 g_opts = *opts; 5759 5760 return 0; 5761 } 5762 5763 struct set_nvme_hotplug_ctx { 5764 uint64_t period_us; 5765 bool enabled; 5766 spdk_msg_fn fn; 5767 void *fn_ctx; 5768 }; 5769 5770 static void 5771 set_nvme_hotplug_period_cb(void *_ctx) 5772 { 5773 struct set_nvme_hotplug_ctx *ctx = _ctx; 5774 5775 spdk_poller_unregister(&g_hotplug_poller); 5776 if (ctx->enabled) { 5777 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5778 } 5779 5780 g_nvme_hotplug_poll_period_us = ctx->period_us; 5781 
g_nvme_hotplug_enabled = ctx->enabled; 5782 if (ctx->fn) { 5783 ctx->fn(ctx->fn_ctx); 5784 } 5785 5786 free(ctx); 5787 } 5788 5789 int 5790 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5791 { 5792 struct set_nvme_hotplug_ctx *ctx; 5793 5794 if (enabled == true && !spdk_process_is_primary()) { 5795 return -EPERM; 5796 } 5797 5798 ctx = calloc(1, sizeof(*ctx)); 5799 if (ctx == NULL) { 5800 return -ENOMEM; 5801 } 5802 5803 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5804 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5805 ctx->enabled = enabled; 5806 ctx->fn = cb; 5807 ctx->fn_ctx = cb_ctx; 5808 5809 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5810 return 0; 5811 } 5812 5813 static void 5814 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5815 struct nvme_async_probe_ctx *ctx) 5816 { 5817 struct nvme_ns *nvme_ns; 5818 struct nvme_bdev *nvme_bdev; 5819 size_t j; 5820 5821 assert(nvme_ctrlr != NULL); 5822 5823 if (ctx->names == NULL) { 5824 ctx->reported_bdevs = 0; 5825 populate_namespaces_cb(ctx, 0); 5826 return; 5827 } 5828 5829 /* 5830 * Report the new bdevs that were created in this call. 5831 * There can be more than one bdev per NVMe controller. 5832 */ 5833 j = 0; 5834 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5835 while (nvme_ns != NULL) { 5836 nvme_bdev = nvme_ns->bdev; 5837 if (j < ctx->max_bdevs) { 5838 ctx->names[j] = nvme_bdev->disk.name; 5839 j++; 5840 } else { 5841 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5842 ctx->max_bdevs); 5843 ctx->reported_bdevs = 0; 5844 populate_namespaces_cb(ctx, -ERANGE); 5845 return; 5846 } 5847 5848 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5849 } 5850 5851 ctx->reported_bdevs = j; 5852 populate_namespaces_cb(ctx, 0); 5853 } 5854 5855 static int 5856 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5857 struct spdk_nvme_ctrlr *new_ctrlr, 5858 struct spdk_nvme_transport_id *trid) 5859 { 5860 struct nvme_path_id *tmp_trid; 5861 5862 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5863 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5864 return -ENOTSUP; 5865 } 5866 5867 /* Currently we only support failover to the same transport type. */ 5868 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5869 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5870 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5871 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5872 return -EINVAL; 5873 } 5874 5875 5876 /* Currently we only support failover to the same NQN. */ 5877 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5878 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5879 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5880 return -EINVAL; 5881 } 5882 5883 /* Skip all the other checks if we've already registered this path. 
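 * Such a duplicate is simply reported as -EALREADY below.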
*/ 5884 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5885 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5886 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5887 trid->subnqn); 5888 return -EALREADY; 5889 } 5890 } 5891 5892 return 0; 5893 } 5894 5895 static int 5896 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5897 struct spdk_nvme_ctrlr *new_ctrlr) 5898 { 5899 struct nvme_ns *nvme_ns; 5900 struct spdk_nvme_ns *new_ns; 5901 5902 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5903 while (nvme_ns != NULL) { 5904 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5905 assert(new_ns != NULL); 5906 5907 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5908 return -EINVAL; 5909 } 5910 5911 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5912 } 5913 5914 return 0; 5915 } 5916 5917 static int 5918 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5919 struct spdk_nvme_transport_id *trid) 5920 { 5921 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5922 5923 new_trid = calloc(1, sizeof(*new_trid)); 5924 if (new_trid == NULL) { 5925 return -ENOMEM; 5926 } 5927 new_trid->trid = *trid; 5928 5929 active_id = nvme_ctrlr->active_path_id; 5930 assert(active_id != NULL); 5931 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5932 5933 /* Skip the active trid so that it is not replaced until it has failed. */ 5934 tmp_trid = TAILQ_NEXT(active_id, link); 5935 if (tmp_trid == NULL) { 5936 goto add_tail; 5937 } 5938 5939 /* A trid is considered failed if its last failed time (last_failed_tsc) is non-zero. 5940 * Insert the new alternate trid before any failed trid. 5941 */ 5942 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5943 if (tmp_trid->last_failed_tsc != 0) { 5944 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5945 return 0; 5946 } 5947 } 5948 5949 add_tail: 5950 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5951 return 0; 5952 } 5953 5954 /* This is the case where a secondary path is added to an existing 5955 * nvme_ctrlr for failover. After checking if it can access the same 5956 * namespaces as the primary path, it is disconnected until failover occurs.
5957 */ 5958 static int 5959 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5960 struct spdk_nvme_ctrlr *new_ctrlr, 5961 struct spdk_nvme_transport_id *trid) 5962 { 5963 int rc; 5964 5965 assert(nvme_ctrlr != NULL); 5966 5967 pthread_mutex_lock(&nvme_ctrlr->mutex); 5968 5969 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5970 if (rc != 0) { 5971 goto exit; 5972 } 5973 5974 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5975 if (rc != 0) { 5976 goto exit; 5977 } 5978 5979 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5980 5981 exit: 5982 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5983 5984 spdk_nvme_detach(new_ctrlr); 5985 5986 return rc; 5987 } 5988 5989 static void 5990 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5991 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5992 { 5993 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5994 struct nvme_async_probe_ctx *ctx; 5995 int rc; 5996 5997 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5998 ctx->ctrlr_attached = true; 5999 6000 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6001 if (rc != 0) { 6002 ctx->reported_bdevs = 0; 6003 populate_namespaces_cb(ctx, rc); 6004 } 6005 } 6006 6007 static void 6008 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6009 struct spdk_nvme_ctrlr *ctrlr, 6010 const struct spdk_nvme_ctrlr_opts *opts) 6011 { 6012 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6013 struct nvme_ctrlr *nvme_ctrlr; 6014 struct nvme_async_probe_ctx *ctx; 6015 int rc; 6016 6017 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6018 ctx->ctrlr_attached = true; 6019 6020 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6021 if (nvme_ctrlr) { 6022 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6023 } else { 6024 rc = -ENODEV; 6025 } 6026 6027 ctx->reported_bdevs = 0; 6028 populate_namespaces_cb(ctx, rc); 6029 } 6030 6031 static int 6032 bdev_nvme_async_poll(void *arg) 6033 { 6034 struct nvme_async_probe_ctx *ctx = arg; 6035 int rc; 6036 6037 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6038 if (spdk_unlikely(rc != -EAGAIN)) { 6039 ctx->probe_done = true; 6040 spdk_poller_unregister(&ctx->poller); 6041 if (!ctx->ctrlr_attached) { 6042 /* The probe is done, but no controller was attached. 6043 * That means we had a failure, so report -EIO back to 6044 * the caller (usually the RPC). populate_namespaces_cb() 6045 * will take care of freeing the nvme_async_probe_ctx. 6046 */ 6047 ctx->reported_bdevs = 0; 6048 populate_namespaces_cb(ctx, -EIO); 6049 } else if (ctx->namespaces_populated) { 6050 /* The namespaces for the attached controller were all 6051 * populated and the response was already sent to the 6052 * caller (usually the RPC). So free the context here. 
6053 */ 6054 free_nvme_async_probe_ctx(ctx); 6055 } 6056 } 6057 6058 return SPDK_POLLER_BUSY; 6059 } 6060 6061 static bool 6062 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6063 uint32_t reconnect_delay_sec, 6064 uint32_t fast_io_fail_timeout_sec) 6065 { 6066 if (ctrlr_loss_timeout_sec < -1) { 6067 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6068 return false; 6069 } else if (ctrlr_loss_timeout_sec == -1) { 6070 if (reconnect_delay_sec == 0) { 6071 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6072 return false; 6073 } else if (fast_io_fail_timeout_sec != 0 && 6074 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6075 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6076 return false; 6077 } 6078 } else if (ctrlr_loss_timeout_sec != 0) { 6079 if (reconnect_delay_sec == 0) { 6080 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6081 return false; 6082 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6083 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6084 return false; 6085 } else if (fast_io_fail_timeout_sec != 0) { 6086 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6087 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6088 return false; 6089 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6090 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6091 return false; 6092 } 6093 } 6094 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6095 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6096 return false; 6097 } 6098 6099 return true; 6100 } 6101 6102 static int 6103 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 6104 { 6105 FILE *psk_file; 6106 struct stat statbuf; 6107 int rc; 6108 #define TCP_PSK_INVALID_PERMISSIONS 0177 6109 6110 if (stat(fname, &statbuf) != 0) { 6111 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 6112 return -EACCES; 6113 } 6114 6115 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 6116 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 6117 return -EPERM; 6118 } 6119 if ((size_t)statbuf.st_size >= bufsz) { 6120 SPDK_ERRLOG("Invalid PSK: too long\n"); 6121 return -EINVAL; 6122 } 6123 psk_file = fopen(fname, "r"); 6124 if (psk_file == NULL) { 6125 SPDK_ERRLOG("Could not open PSK file\n"); 6126 return -EINVAL; 6127 } 6128 6129 memset(buf, 0, bufsz); 6130 rc = fread(buf, 1, statbuf.st_size, psk_file); 6131 if (rc != statbuf.st_size) { 6132 SPDK_ERRLOG("Failed to read PSK\n"); 6133 fclose(psk_file); 6134 return -EINVAL; 6135 } 6136 6137 fclose(psk_file); 6138 return 0; 6139 } 6140 6141 int 6142 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6143 const char *base_name, 6144 const char **names, 6145 uint32_t count, 6146 spdk_bdev_create_nvme_fn cb_fn, 6147 void *cb_ctx, 6148 struct spdk_nvme_ctrlr_opts *drv_opts, 6149 struct nvme_ctrlr_opts *bdev_opts, 6150 bool multipath) 6151 { 6152 struct nvme_probe_skip_entry *entry, *tmp; 6153 struct nvme_async_probe_ctx *ctx; 6154 spdk_nvme_attach_cb attach_cb; 6155 int rc, len; 6156 6157 /* TODO expand this check to include both the host and target TRIDs. 6158 * Only if both are the same should we fail.
6159 */ 6160 if (nvme_ctrlr_get(trid) != NULL) { 6161 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6162 return -EEXIST; 6163 } 6164 6165 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6166 6167 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6168 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6169 return -EINVAL; 6170 } 6171 6172 if (bdev_opts != NULL && 6173 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6174 bdev_opts->reconnect_delay_sec, 6175 bdev_opts->fast_io_fail_timeout_sec)) { 6176 return -EINVAL; 6177 } 6178 6179 ctx = calloc(1, sizeof(*ctx)); 6180 if (!ctx) { 6181 return -ENOMEM; 6182 } 6183 ctx->base_name = base_name; 6184 ctx->names = names; 6185 ctx->max_bdevs = count; 6186 ctx->cb_fn = cb_fn; 6187 ctx->cb_ctx = cb_ctx; 6188 ctx->trid = *trid; 6189 6190 if (bdev_opts) { 6191 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6192 } else { 6193 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6194 } 6195 6196 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6197 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6198 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6199 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6200 free(entry); 6201 break; 6202 } 6203 } 6204 } 6205 6206 if (drv_opts) { 6207 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6208 } else { 6209 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6210 } 6211 6212 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6213 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6214 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6215 ctx->drv_opts.disable_read_ana_log_page = true; 6216 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6217 6218 if (ctx->bdev_opts.psk[0] != '\0') { 6219 /* Try to use the keyring first */ 6220 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6221 if (ctx->drv_opts.tls_psk == NULL) { 6222 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6223 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6224 if (rc != 0) { 6225 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6226 free_nvme_async_probe_ctx(ctx); 6227 return rc; 6228 } 6229 } 6230 } 6231 6232 if (ctx->bdev_opts.dhchap_key != NULL) { 6233 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6234 if (ctx->drv_opts.dhchap_key == NULL) { 6235 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6236 ctx->bdev_opts.dhchap_key); 6237 free_nvme_async_probe_ctx(ctx); 6238 return -ENOKEY; 6239 } 6240 6241 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6242 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6243 } 6244 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6245 ctx->drv_opts.dhchap_ctrlr_key = 6246 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6247 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6248 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6249 ctx->bdev_opts.dhchap_ctrlr_key); 6250 free_nvme_async_probe_ctx(ctx); 6251 return -ENOKEY; 6252 } 6253 } 6254 6255 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6256 attach_cb = connect_attach_cb; 6257 } else { 6258 attach_cb = connect_set_failover_cb; 6259 } 6260 6261 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6262 if (ctx->probe_ctx == NULL) { 6263 SPDK_ERRLOG("No controller was found 
with provided trid (traddr: %s)\n", trid->traddr); 6264 free_nvme_async_probe_ctx(ctx); 6265 return -ENODEV; 6266 } 6267 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6268 6269 return 0; 6270 } 6271 6272 struct bdev_nvme_delete_ctx { 6273 char *name; 6274 struct nvme_path_id path_id; 6275 bdev_nvme_delete_done_fn delete_done; 6276 void *delete_done_ctx; 6277 uint64_t timeout_ticks; 6278 struct spdk_poller *poller; 6279 }; 6280 6281 static void 6282 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6283 { 6284 if (ctx != NULL) { 6285 free(ctx->name); 6286 free(ctx); 6287 } 6288 } 6289 6290 static bool 6291 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6292 { 6293 if (path_id->trid.trtype != 0) { 6294 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6295 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6296 return false; 6297 } 6298 } else { 6299 if (path_id->trid.trtype != p->trid.trtype) { 6300 return false; 6301 } 6302 } 6303 } 6304 6305 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6306 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6307 return false; 6308 } 6309 } 6310 6311 if (path_id->trid.adrfam != 0) { 6312 if (path_id->trid.adrfam != p->trid.adrfam) { 6313 return false; 6314 } 6315 } 6316 6317 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6318 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6319 return false; 6320 } 6321 } 6322 6323 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6324 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6325 return false; 6326 } 6327 } 6328 6329 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6330 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6331 return false; 6332 } 6333 } 6334 6335 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6336 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6337 return false; 6338 } 6339 } 6340 6341 return true; 6342 } 6343 6344 static bool 6345 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6346 { 6347 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6348 struct nvme_ctrlr *ctrlr; 6349 struct nvme_path_id *p; 6350 6351 pthread_mutex_lock(&g_bdev_nvme_mutex); 6352 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6353 if (!nbdev_ctrlr) { 6354 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6355 return false; 6356 } 6357 6358 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6359 pthread_mutex_lock(&ctrlr->mutex); 6360 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6361 if (nvme_path_id_compare(p, path_id)) { 6362 pthread_mutex_unlock(&ctrlr->mutex); 6363 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6364 return true; 6365 } 6366 } 6367 pthread_mutex_unlock(&ctrlr->mutex); 6368 } 6369 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6370 6371 return false; 6372 } 6373 6374 static int 6375 bdev_nvme_delete_complete_poll(void *arg) 6376 { 6377 struct bdev_nvme_delete_ctx *ctx = arg; 6378 int rc = 0; 6379 6380 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6381 if (ctx->timeout_ticks > spdk_get_ticks()) { 6382 return SPDK_POLLER_BUSY; 6383 } 6384 6385 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6386 rc = -ETIMEDOUT; 6387 } 6388 6389 spdk_poller_unregister(&ctx->poller); 6390 6391 ctx->delete_done(ctx->delete_done_ctx, rc); 6392 free_bdev_nvme_delete_ctx(ctx); 6393 6394 
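/* The delete result (0 on success, -ETIMEDOUT if the path lingered) has been
 * delivered via delete_done and the context freed; the poller was unregistered
 * above, so this is its final invocation.
 */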
return SPDK_POLLER_BUSY; 6395 } 6396 6397 static int 6398 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6399 { 6400 struct nvme_path_id *p, *t; 6401 spdk_msg_fn msg_fn; 6402 int rc = -ENXIO; 6403 6404 pthread_mutex_lock(&nvme_ctrlr->mutex); 6405 6406 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6407 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6408 break; 6409 } 6410 6411 if (!nvme_path_id_compare(p, path_id)) { 6412 continue; 6413 } 6414 6415 /* We are not using the specified path. */ 6416 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6417 free(p); 6418 rc = 0; 6419 } 6420 6421 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6422 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6423 return rc; 6424 } 6425 6426 /* If we made it here, then this path is a match! Now we need to remove it. */ 6427 6428 /* This is the active path in use right now. The active path is always the first in the list. */ 6429 assert(p == nvme_ctrlr->active_path_id); 6430 6431 if (!TAILQ_NEXT(p, link)) { 6432 /* The current path is the only path. */ 6433 msg_fn = _nvme_ctrlr_destruct; 6434 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6435 } else { 6436 /* There is an alternative path. */ 6437 msg_fn = _bdev_nvme_reset_ctrlr; 6438 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6439 } 6440 6441 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6442 6443 if (rc == 0) { 6444 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6445 } else if (rc == -EALREADY) { 6446 rc = 0; 6447 } 6448 6449 return rc; 6450 } 6451 6452 int 6453 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6454 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6455 { 6456 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6457 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6458 struct bdev_nvme_delete_ctx *ctx = NULL; 6459 int rc = -ENXIO, _rc; 6460 6461 if (name == NULL || path_id == NULL) { 6462 rc = -EINVAL; 6463 goto exit; 6464 } 6465 6466 pthread_mutex_lock(&g_bdev_nvme_mutex); 6467 6468 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6469 if (nbdev_ctrlr == NULL) { 6470 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6471 6472 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6473 rc = -ENODEV; 6474 goto exit; 6475 } 6476 6477 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6478 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6479 if (_rc < 0 && _rc != -ENXIO) { 6480 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6481 rc = _rc; 6482 goto exit; 6483 } else if (_rc == 0) { 6484 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6485 * was deleted successfully. To remember the successful deletion, 6486 * overwrite rc only if _rc is zero. 
6487 */ 6488 rc = 0; 6489 } 6490 } 6491 6492 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6493 6494 if (rc != 0 || delete_done == NULL) { 6495 goto exit; 6496 } 6497 6498 ctx = calloc(1, sizeof(*ctx)); 6499 if (ctx == NULL) { 6500 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6501 rc = -ENOMEM; 6502 goto exit; 6503 } 6504 6505 ctx->name = strdup(name); 6506 if (ctx->name == NULL) { 6507 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6508 rc = -ENOMEM; 6509 goto exit; 6510 } 6511 6512 ctx->delete_done = delete_done; 6513 ctx->delete_done_ctx = delete_done_ctx; 6514 ctx->path_id = *path_id; 6515 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6516 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6517 if (ctx->poller == NULL) { 6518 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6519 rc = -ENOMEM; 6520 goto exit; 6521 } 6522 6523 exit: 6524 if (rc != 0) { 6525 free_bdev_nvme_delete_ctx(ctx); 6526 } 6527 6528 return rc; 6529 } 6530 6531 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6532 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6533 6534 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6535 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6536 6537 struct discovery_entry_ctx { 6538 char name[128]; 6539 struct spdk_nvme_transport_id trid; 6540 struct spdk_nvme_ctrlr_opts drv_opts; 6541 struct spdk_nvmf_discovery_log_page_entry entry; 6542 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6543 struct discovery_ctx *ctx; 6544 }; 6545 6546 struct discovery_ctx { 6547 char *name; 6548 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6549 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6550 void *cb_ctx; 6551 struct spdk_nvme_probe_ctx *probe_ctx; 6552 struct spdk_nvme_detach_ctx *detach_ctx; 6553 struct spdk_nvme_ctrlr *ctrlr; 6554 struct spdk_nvme_transport_id trid; 6555 struct discovery_entry_ctx *entry_ctx_in_use; 6556 struct spdk_poller *poller; 6557 struct spdk_nvme_ctrlr_opts drv_opts; 6558 struct nvme_ctrlr_opts bdev_opts; 6559 struct spdk_nvmf_discovery_log_page *log_page; 6560 TAILQ_ENTRY(discovery_ctx) tailq; 6561 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6562 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6563 int rc; 6564 bool wait_for_attach; 6565 uint64_t timeout_ticks; 6566 /* Denotes that the discovery service is being started. We're waiting 6567 * for the initial connection to the discovery controller to be 6568 * established and attach discovered NVM ctrlrs. 6569 */ 6570 bool initializing; 6571 /* Denotes if a discovery is currently in progress for this context. 6572 * That includes connecting to newly discovered subsystems. Used to 6573 * ensure we do not start a new discovery until an existing one is 6574 * complete. 6575 */ 6576 bool in_progress; 6577 6578 /* Denotes if another discovery is needed after the one in progress 6579 * completes. Set when we receive an AER completion while a discovery 6580 * is already in progress. 6581 */ 6582 bool pending; 6583 6584 /* Signal to the discovery context poller that it should stop the 6585 * discovery service, including detaching from the current discovery 6586 * controller. 6587 */ 6588 bool stop; 6589 6590 struct spdk_thread *calling_thread; 6591 uint32_t index; 6592 uint32_t attach_in_progress; 6593 char *hostnqn; 6594 6595 /* Denotes if the discovery service was started by the mdns discovery. 
6596 */ 6597 bool from_mdns_discovery_service; 6598 }; 6599 6600 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6601 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6602 6603 static void get_discovery_log_page(struct discovery_ctx *ctx); 6604 6605 static void 6606 free_discovery_ctx(struct discovery_ctx *ctx) 6607 { 6608 free(ctx->log_page); 6609 free(ctx->hostnqn); 6610 free(ctx->name); 6611 free(ctx); 6612 } 6613 6614 static void 6615 discovery_complete(struct discovery_ctx *ctx) 6616 { 6617 ctx->initializing = false; 6618 ctx->in_progress = false; 6619 if (ctx->pending) { 6620 ctx->pending = false; 6621 get_discovery_log_page(ctx); 6622 } 6623 } 6624 6625 static void 6626 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6627 struct spdk_nvmf_discovery_log_page_entry *entry) 6628 { 6629 char *space; 6630 6631 trid->trtype = entry->trtype; 6632 trid->adrfam = entry->adrfam; 6633 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6634 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6635 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6636 * before call to this function trid->subnqn is zeroed out, we need 6637 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6638 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6639 */ 6640 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6641 6642 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6643 * But the log page entries typically pad them with spaces, not zeroes. 6644 * So add a NULL terminator to each of these fields at the appropriate 6645 * location. 6646 */ 6647 space = strchr(trid->traddr, ' '); 6648 if (space) { 6649 *space = 0; 6650 } 6651 space = strchr(trid->trsvcid, ' '); 6652 if (space) { 6653 *space = 0; 6654 } 6655 space = strchr(trid->subnqn, ' '); 6656 if (space) { 6657 *space = 0; 6658 } 6659 } 6660 6661 static void 6662 _stop_discovery(void *_ctx) 6663 { 6664 struct discovery_ctx *ctx = _ctx; 6665 6666 if (ctx->attach_in_progress > 0) { 6667 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6668 return; 6669 } 6670 6671 ctx->stop = true; 6672 6673 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6674 struct discovery_entry_ctx *entry_ctx; 6675 struct nvme_path_id path = {}; 6676 6677 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6678 path.trid = entry_ctx->trid; 6679 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6680 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6681 free(entry_ctx); 6682 } 6683 6684 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6685 struct discovery_entry_ctx *entry_ctx; 6686 6687 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6688 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6689 free(entry_ctx); 6690 } 6691 6692 free(ctx->entry_ctx_in_use); 6693 ctx->entry_ctx_in_use = NULL; 6694 } 6695 6696 static void 6697 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6698 { 6699 ctx->stop_cb_fn = cb_fn; 6700 ctx->cb_ctx = cb_ctx; 6701 6702 if (ctx->attach_in_progress > 0) { 6703 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6704 ctx->attach_in_progress); 6705 } 6706 6707 _stop_discovery(ctx); 6708 } 6709 6710 static void 6711 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6712 { 6713 struct discovery_ctx *d_ctx; 6714 struct nvme_path_id *path_id; 6715 struct spdk_nvme_transport_id 
trid = {}; 6716 struct discovery_entry_ctx *entry_ctx, *tmp; 6717 6718 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6719 6720 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6721 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6722 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6723 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6724 continue; 6725 } 6726 6727 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6728 free(entry_ctx); 6729 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6730 trid.subnqn, trid.traddr, trid.trsvcid); 6731 6732 /* Fail discovery ctrlr to force reattach attempt */ 6733 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6734 } 6735 } 6736 } 6737 6738 static void 6739 discovery_remove_controllers(struct discovery_ctx *ctx) 6740 { 6741 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6742 struct discovery_entry_ctx *entry_ctx, *tmp; 6743 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6744 struct spdk_nvme_transport_id old_trid = {}; 6745 uint64_t numrec, i; 6746 bool found; 6747 6748 numrec = from_le64(&log_page->numrec); 6749 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6750 found = false; 6751 old_entry = &entry_ctx->entry; 6752 build_trid_from_log_page_entry(&old_trid, old_entry); 6753 for (i = 0; i < numrec; i++) { 6754 new_entry = &log_page->entries[i]; 6755 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6756 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6757 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6758 found = true; 6759 break; 6760 } 6761 } 6762 if (!found) { 6763 struct nvme_path_id path = {}; 6764 6765 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6766 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6767 6768 path.trid = entry_ctx->trid; 6769 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6770 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6771 free(entry_ctx); 6772 } 6773 } 6774 free(log_page); 6775 ctx->log_page = NULL; 6776 discovery_complete(ctx); 6777 } 6778 6779 static void 6780 complete_discovery_start(struct discovery_ctx *ctx, int status) 6781 { 6782 ctx->timeout_ticks = 0; 6783 ctx->rc = status; 6784 if (ctx->start_cb_fn) { 6785 ctx->start_cb_fn(ctx->cb_ctx, status); 6786 ctx->start_cb_fn = NULL; 6787 ctx->cb_ctx = NULL; 6788 } 6789 } 6790 6791 static void 6792 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6793 { 6794 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6795 struct discovery_ctx *ctx = entry_ctx->ctx; 6796 6797 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6798 ctx->attach_in_progress--; 6799 if (ctx->attach_in_progress == 0) { 6800 complete_discovery_start(ctx, ctx->rc); 6801 if (ctx->initializing && ctx->rc != 0) { 6802 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6803 stop_discovery(ctx, NULL, ctx->cb_ctx); 6804 } else { 6805 discovery_remove_controllers(ctx); 6806 } 6807 } 6808 } 6809 6810 static struct discovery_entry_ctx * 6811 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6812 { 6813 struct discovery_entry_ctx *new_ctx; 6814 6815 new_ctx = calloc(1, sizeof(*new_ctx)); 6816 if (new_ctx == NULL) { 6817 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6818 return NULL; 6819 } 6820 6821 new_ctx->ctx = ctx; 6822 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6823 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
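/* Start from the default controller options and inherit the hostnqn from the
 * parent discovery context so that controllers attached through this entry use
 * the same host identity.
 */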
6824 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6825 return new_ctx; 6826 } 6827 6828 static void 6829 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6830 struct spdk_nvmf_discovery_log_page *log_page) 6831 { 6832 struct discovery_ctx *ctx = cb_arg; 6833 struct discovery_entry_ctx *entry_ctx, *tmp; 6834 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6835 uint64_t numrec, i; 6836 bool found; 6837 6838 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6839 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6840 return; 6841 } 6842 6843 ctx->log_page = log_page; 6844 assert(ctx->attach_in_progress == 0); 6845 numrec = from_le64(&log_page->numrec); 6846 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6847 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6848 free(entry_ctx); 6849 } 6850 for (i = 0; i < numrec; i++) { 6851 found = false; 6852 new_entry = &log_page->entries[i]; 6853 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6854 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6855 struct discovery_entry_ctx *new_ctx; 6856 struct spdk_nvme_transport_id trid = {}; 6857 6858 build_trid_from_log_page_entry(&trid, new_entry); 6859 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6860 if (new_ctx == NULL) { 6861 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6862 break; 6863 } 6864 6865 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6866 continue; 6867 } 6868 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6869 old_entry = &entry_ctx->entry; 6870 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6871 found = true; 6872 break; 6873 } 6874 } 6875 if (!found) { 6876 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6877 struct discovery_ctx *d_ctx; 6878 6879 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6880 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6881 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6882 sizeof(new_entry->subnqn))) { 6883 break; 6884 } 6885 } 6886 if (subnqn_ctx) { 6887 break; 6888 } 6889 } 6890 6891 new_ctx = calloc(1, sizeof(*new_ctx)); 6892 if (new_ctx == NULL) { 6893 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6894 break; 6895 } 6896 6897 new_ctx->ctx = ctx; 6898 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6899 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6900 if (subnqn_ctx) { 6901 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6902 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6903 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6904 new_ctx->name); 6905 } else { 6906 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6907 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6908 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6909 new_ctx->name); 6910 } 6911 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6912 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6913 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6914 discovery_attach_controller_done, new_ctx, 6915 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6916 if (rc == 0) { 6917 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6918 ctx->attach_in_progress++; 6919 } else { 6920 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6921 } 6922 } 6923 } 6924 6925 if (ctx->attach_in_progress == 0) { 6926 discovery_remove_controllers(ctx); 6927 } 6928 } 6929 6930 static void 6931 get_discovery_log_page(struct discovery_ctx *ctx) 6932 { 6933 int rc; 6934 6935 assert(ctx->in_progress == false); 6936 ctx->in_progress = true; 6937 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6938 if (rc != 0) { 6939 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6940 } 6941 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6942 } 6943 6944 static void 6945 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6946 { 6947 struct discovery_ctx *ctx = arg; 6948 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6949 6950 if (spdk_nvme_cpl_is_error(cpl)) { 6951 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6952 return; 6953 } 6954 6955 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6956 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6957 return; 6958 } 6959 6960 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6961 if (ctx->in_progress) { 6962 ctx->pending = true; 6963 return; 6964 } 6965 6966 get_discovery_log_page(ctx); 6967 } 6968 6969 static void 6970 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6971 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6972 { 6973 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6974 struct discovery_ctx *ctx; 6975 6976 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6977 6978 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6979 ctx->probe_ctx = NULL; 6980 ctx->ctrlr = ctrlr; 6981 6982 if (ctx->rc != 0) { 6983 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6984 ctx->rc); 6985 return; 6986 } 6987 6988 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6989 } 6990 6991 static int 6992 discovery_poller(void *arg) 6993 { 6994 struct discovery_ctx *ctx = arg; 6995 struct spdk_nvme_transport_id *trid; 6996 int rc; 6997 6998 if (ctx->detach_ctx) { 6999 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7000 if (rc != -EAGAIN) { 7001 ctx->detach_ctx = NULL; 7002 ctx->ctrlr = NULL; 7003 } 7004 } else if (ctx->stop) { 7005 if (ctx->ctrlr != NULL) { 7006 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7007 if (rc == 0) { 7008 return SPDK_POLLER_BUSY; 7009 } 7010 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7011 } 7012 spdk_poller_unregister(&ctx->poller); 7013 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7014 assert(ctx->start_cb_fn == NULL); 7015 if (ctx->stop_cb_fn != NULL) { 7016 ctx->stop_cb_fn(ctx->cb_ctx); 7017 } 7018 free_discovery_ctx(ctx); 7019 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7020 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7021 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7022 assert(ctx->initializing); 7023 spdk_poller_unregister(&ctx->poller); 7024 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7025 complete_discovery_start(ctx, -ETIMEDOUT); 7026 stop_discovery(ctx, NULL, NULL); 7027 free_discovery_ctx(ctx); 7028 return SPDK_POLLER_BUSY; 7029 } 7030 7031 assert(ctx->entry_ctx_in_use == NULL); 7032 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7033 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7034 trid = &ctx->entry_ctx_in_use->trid; 7035 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7036 if 
(ctx->probe_ctx) { 7037 spdk_poller_unregister(&ctx->poller); 7038 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7039 } else { 7040 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7041 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7042 ctx->entry_ctx_in_use = NULL; 7043 } 7044 } else if (ctx->probe_ctx) { 7045 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7046 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7047 complete_discovery_start(ctx, -ETIMEDOUT); 7048 return SPDK_POLLER_BUSY; 7049 } 7050 7051 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7052 if (rc != -EAGAIN) { 7053 if (ctx->rc != 0) { 7054 assert(ctx->initializing); 7055 stop_discovery(ctx, NULL, ctx->cb_ctx); 7056 } else { 7057 assert(rc == 0); 7058 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7059 ctx->rc = rc; 7060 get_discovery_log_page(ctx); 7061 } 7062 } 7063 } else { 7064 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7065 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7066 complete_discovery_start(ctx, -ETIMEDOUT); 7067 /* We need to wait until all NVM ctrlrs are attached before we stop the 7068 * discovery service to make sure we don't detach a ctrlr that is still 7069 * being attached. 7070 */ 7071 if (ctx->attach_in_progress == 0) { 7072 stop_discovery(ctx, NULL, ctx->cb_ctx); 7073 return SPDK_POLLER_BUSY; 7074 } 7075 } 7076 7077 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7078 if (rc < 0) { 7079 spdk_poller_unregister(&ctx->poller); 7080 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7081 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7082 ctx->entry_ctx_in_use = NULL; 7083 7084 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7085 if (rc != 0) { 7086 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7087 ctx->ctrlr = NULL; 7088 } 7089 } 7090 } 7091 7092 return SPDK_POLLER_BUSY; 7093 } 7094 7095 static void 7096 start_discovery_poller(void *arg) 7097 { 7098 struct discovery_ctx *ctx = arg; 7099 7100 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7101 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7102 } 7103 7104 int 7105 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7106 const char *base_name, 7107 struct spdk_nvme_ctrlr_opts *drv_opts, 7108 struct nvme_ctrlr_opts *bdev_opts, 7109 uint64_t attach_timeout, 7110 bool from_mdns, 7111 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7112 { 7113 struct discovery_ctx *ctx; 7114 struct discovery_entry_ctx *discovery_entry_ctx; 7115 7116 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7117 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7118 if (strcmp(ctx->name, base_name) == 0) { 7119 return -EEXIST; 7120 } 7121 7122 if (ctx->entry_ctx_in_use != NULL) { 7123 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7124 return -EEXIST; 7125 } 7126 } 7127 7128 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7129 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7130 return -EEXIST; 7131 } 7132 } 7133 } 7134 7135 ctx = calloc(1, sizeof(*ctx)); 7136 if (ctx == NULL) { 7137 return -ENOMEM; 7138 } 7139 7140 ctx->name = strdup(base_name); 7141 if (ctx->name == NULL) { 7142 free_discovery_ctx(ctx); 7143 return -ENOMEM; 7144 } 7145 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
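/* Keep private copies of the caller's options: drv_opts is used for the
 * connection to the discovery controller (its hostnqn is also propagated to
 * discovered subsystems), while bdev_opts below is applied to every NVM
 * controller created from the discovery log page.
 */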
7146 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7147 ctx->from_mdns_discovery_service = from_mdns; 7148 ctx->bdev_opts.from_discovery_service = true; 7149 ctx->calling_thread = spdk_get_thread(); 7150 ctx->start_cb_fn = cb_fn; 7151 ctx->cb_ctx = cb_ctx; 7152 ctx->initializing = true; 7153 if (ctx->start_cb_fn) { 7154 /* We can use this when dumping json to denote if this RPC parameter 7155 * was specified or not. 7156 */ 7157 ctx->wait_for_attach = true; 7158 } 7159 if (attach_timeout != 0) { 7160 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7161 spdk_get_ticks_hz() / 1000ull; 7162 } 7163 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7164 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7165 memcpy(&ctx->trid, trid, sizeof(*trid)); 7166 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7167 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7168 if (ctx->hostnqn == NULL) { 7169 free_discovery_ctx(ctx); 7170 return -ENOMEM; 7171 } 7172 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7173 if (discovery_entry_ctx == NULL) { 7174 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7175 free_discovery_ctx(ctx); 7176 return -ENOMEM; 7177 } 7178 7179 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7180 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7181 return 0; 7182 } 7183 7184 int 7185 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7186 { 7187 struct discovery_ctx *ctx; 7188 7189 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7190 if (strcmp(name, ctx->name) == 0) { 7191 if (ctx->stop) { 7192 return -EALREADY; 7193 } 7194 /* If we're still starting the discovery service and ->rc is non-zero, we're 7195 * going to stop it as soon as we can 7196 */ 7197 if (ctx->initializing && ctx->rc != 0) { 7198 return -EALREADY; 7199 } 7200 stop_discovery(ctx, cb_fn, cb_ctx); 7201 return 0; 7202 } 7203 } 7204 7205 return -ENOENT; 7206 } 7207 7208 static int 7209 bdev_nvme_library_init(void) 7210 { 7211 g_bdev_nvme_init_thread = spdk_get_thread(); 7212 7213 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7214 bdev_nvme_destroy_poll_group_cb, 7215 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7216 7217 return 0; 7218 } 7219 7220 static void 7221 bdev_nvme_fini_destruct_ctrlrs(void) 7222 { 7223 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7224 struct nvme_ctrlr *nvme_ctrlr; 7225 7226 pthread_mutex_lock(&g_bdev_nvme_mutex); 7227 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7228 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7229 pthread_mutex_lock(&nvme_ctrlr->mutex); 7230 if (nvme_ctrlr->destruct) { 7231 /* This controller's destruction was already started 7232 * before the application started shutting down 7233 */ 7234 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7235 continue; 7236 } 7237 nvme_ctrlr->destruct = true; 7238 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7239 7240 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7241 nvme_ctrlr); 7242 } 7243 } 7244 7245 g_bdev_nvme_module_finish = true; 7246 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7247 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7248 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7249 spdk_bdev_module_fini_done(); 7250 return; 7251 } 7252 7253 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7254 } 7255 7256 static void 7257 check_discovery_fini(void *arg) 7258 { 7259 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7260 bdev_nvme_fini_destruct_ctrlrs(); 
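/* Nothing more to do here once the last discovery context has been freed;
 * bdev_nvme_fini_destruct_ctrlrs() above takes over the rest of the module
 * shutdown sequence.
 */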
7261 } 7262 } 7263 7264 static void 7265 bdev_nvme_library_fini(void) 7266 { 7267 struct nvme_probe_skip_entry *entry, *entry_tmp; 7268 struct discovery_ctx *ctx; 7269 7270 spdk_poller_unregister(&g_hotplug_poller); 7271 free(g_hotplug_probe_ctx); 7272 g_hotplug_probe_ctx = NULL; 7273 7274 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7275 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7276 free(entry); 7277 } 7278 7279 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7280 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7281 bdev_nvme_fini_destruct_ctrlrs(); 7282 } else { 7283 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7284 stop_discovery(ctx, check_discovery_fini, NULL); 7285 } 7286 } 7287 } 7288 7289 static void 7290 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7291 { 7292 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7293 struct spdk_bdev *bdev = bdev_io->bdev; 7294 struct spdk_dif_ctx dif_ctx; 7295 struct spdk_dif_error err_blk = {}; 7296 int rc; 7297 struct spdk_dif_ctx_init_ext_opts dif_opts; 7298 7299 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7300 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7301 rc = spdk_dif_ctx_init(&dif_ctx, 7302 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7303 bdev->dif_is_head_of_md, bdev->dif_type, 7304 bdev_io->u.bdev.dif_check_flags, 7305 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7306 if (rc != 0) { 7307 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7308 return; 7309 } 7310 7311 if (bdev->md_interleave) { 7312 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7313 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7314 } else { 7315 struct iovec md_iov = { 7316 .iov_base = bdev_io->u.bdev.md_buf, 7317 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7318 }; 7319 7320 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7321 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7322 } 7323 7324 if (rc != 0) { 7325 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7326 err_blk.err_type, err_blk.err_offset); 7327 } else { 7328 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7329 } 7330 } 7331 7332 static void 7333 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7334 { 7335 struct nvme_bdev_io *bio = ref; 7336 7337 if (spdk_nvme_cpl_is_success(cpl)) { 7338 /* Run PI verification for read data buffer. */ 7339 bdev_nvme_verify_pi_error(bio); 7340 } 7341 7342 /* Return original completion status */ 7343 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7344 } 7345 7346 static void 7347 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7348 { 7349 struct nvme_bdev_io *bio = ref; 7350 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7351 int ret; 7352 7353 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7354 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7355 cpl->status.sct, cpl->status.sc); 7356 7357 /* Save completion status to use after verifying PI error. */ 7358 bio->cpl = *cpl; 7359 7360 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7361 /* Read without PI checking to verify PI error. 
*/ 7362 ret = bdev_nvme_no_pi_readv(bio, 7363 bdev_io->u.bdev.iovs, 7364 bdev_io->u.bdev.iovcnt, 7365 bdev_io->u.bdev.md_buf, 7366 bdev_io->u.bdev.num_blocks, 7367 bdev_io->u.bdev.offset_blocks); 7368 if (ret == 0) { 7369 return; 7370 } 7371 } 7372 } 7373 7374 bdev_nvme_io_complete_nvme_status(bio, cpl); 7375 } 7376 7377 static void 7378 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7379 { 7380 struct nvme_bdev_io *bio = ref; 7381 7382 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7383 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7384 cpl->status.sct, cpl->status.sc); 7385 /* Run PI verification for write data buffer if PI error is detected. */ 7386 bdev_nvme_verify_pi_error(bio); 7387 } 7388 7389 bdev_nvme_io_complete_nvme_status(bio, cpl); 7390 } 7391 7392 static void 7393 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7394 { 7395 struct nvme_bdev_io *bio = ref; 7396 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7397 7398 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7399 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7400 */ 7401 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7402 7403 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7404 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7405 cpl->status.sct, cpl->status.sc); 7406 /* Run PI verification for zone append data buffer if PI error is detected. */ 7407 bdev_nvme_verify_pi_error(bio); 7408 } 7409 7410 bdev_nvme_io_complete_nvme_status(bio, cpl); 7411 } 7412 7413 static void 7414 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7415 { 7416 struct nvme_bdev_io *bio = ref; 7417 7418 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7419 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7420 cpl->status.sct, cpl->status.sc); 7421 /* Run PI verification for compare data buffer if PI error is detected. */ 7422 bdev_nvme_verify_pi_error(bio); 7423 } 7424 7425 bdev_nvme_io_complete_nvme_status(bio, cpl); 7426 } 7427 7428 static void 7429 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7430 { 7431 struct nvme_bdev_io *bio = ref; 7432 7433 /* Compare operation completion */ 7434 if (!bio->first_fused_completed) { 7435 /* Save compare result for write callback */ 7436 bio->cpl = *cpl; 7437 bio->first_fused_completed = true; 7438 return; 7439 } 7440 7441 /* Write operation completion */ 7442 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7443 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7444 * complete the IO with the compare operation's status. 
7445 */ 7446 if (!spdk_nvme_cpl_is_error(cpl)) { 7447 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7448 } 7449 7450 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7451 } else { 7452 bdev_nvme_io_complete_nvme_status(bio, cpl); 7453 } 7454 } 7455 7456 static void 7457 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7458 { 7459 struct nvme_bdev_io *bio = ref; 7460 7461 bdev_nvme_io_complete_nvme_status(bio, cpl); 7462 } 7463 7464 static int 7465 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7466 { 7467 switch (desc->zt) { 7468 case SPDK_NVME_ZONE_TYPE_SEQWR: 7469 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7470 break; 7471 default: 7472 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7473 return -EIO; 7474 } 7475 7476 switch (desc->zs) { 7477 case SPDK_NVME_ZONE_STATE_EMPTY: 7478 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7479 break; 7480 case SPDK_NVME_ZONE_STATE_IOPEN: 7481 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7482 break; 7483 case SPDK_NVME_ZONE_STATE_EOPEN: 7484 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7485 break; 7486 case SPDK_NVME_ZONE_STATE_CLOSED: 7487 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7488 break; 7489 case SPDK_NVME_ZONE_STATE_RONLY: 7490 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7491 break; 7492 case SPDK_NVME_ZONE_STATE_FULL: 7493 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7494 break; 7495 case SPDK_NVME_ZONE_STATE_OFFLINE: 7496 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7497 break; 7498 default: 7499 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7500 return -EIO; 7501 } 7502 7503 info->zone_id = desc->zslba; 7504 info->write_pointer = desc->wp; 7505 info->capacity = desc->zcap; 7506 7507 return 0; 7508 } 7509 7510 static void 7511 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7512 { 7513 struct nvme_bdev_io *bio = ref; 7514 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7515 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7516 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7517 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7518 uint64_t max_zones_per_buf, i; 7519 uint32_t zone_report_bufsize; 7520 struct spdk_nvme_ns *ns; 7521 struct spdk_nvme_qpair *qpair; 7522 int ret; 7523 7524 if (spdk_nvme_cpl_is_error(cpl)) { 7525 goto out_complete_io_nvme_cpl; 7526 } 7527 7528 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7529 ret = -ENXIO; 7530 goto out_complete_io_ret; 7531 } 7532 7533 ns = bio->io_path->nvme_ns->ns; 7534 qpair = bio->io_path->qpair->qpair; 7535 7536 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7537 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7538 sizeof(bio->zone_report_buf->descs[0]); 7539 7540 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7541 ret = -EINVAL; 7542 goto out_complete_io_ret; 7543 } 7544 7545 if (!bio->zone_report_buf->nr_zones) { 7546 ret = -EINVAL; 7547 goto out_complete_io_ret; 7548 } 7549 7550 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7551 ret = fill_zone_from_report(&info[bio->handled_zones], 7552 &bio->zone_report_buf->descs[i]); 7553 if (ret) { 7554 goto out_complete_io_ret; 7555 } 7556 bio->handled_zones++; 7557 } 7558 7559 if (bio->handled_zones < zones_to_copy) { 7560 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7561 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7562 
7563 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7564 ret = spdk_nvme_zns_report_zones(ns, qpair, 7565 bio->zone_report_buf, zone_report_bufsize, 7566 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7567 bdev_nvme_get_zone_info_done, bio); 7568 if (!ret) { 7569 return; 7570 } else { 7571 goto out_complete_io_ret; 7572 } 7573 } 7574 7575 out_complete_io_nvme_cpl: 7576 free(bio->zone_report_buf); 7577 bio->zone_report_buf = NULL; 7578 bdev_nvme_io_complete_nvme_status(bio, cpl); 7579 return; 7580 7581 out_complete_io_ret: 7582 free(bio->zone_report_buf); 7583 bio->zone_report_buf = NULL; 7584 bdev_nvme_io_complete(bio, ret); 7585 } 7586 7587 static void 7588 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7589 { 7590 struct nvme_bdev_io *bio = ref; 7591 7592 bdev_nvme_io_complete_nvme_status(bio, cpl); 7593 } 7594 7595 static void 7596 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7597 { 7598 struct nvme_bdev_io *bio = ctx; 7599 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7600 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7601 7602 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7603 7604 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7605 } 7606 7607 static void 7608 bdev_nvme_abort_complete(void *ctx) 7609 { 7610 struct nvme_bdev_io *bio = ctx; 7611 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7612 7613 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7614 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7615 } else { 7616 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7617 } 7618 } 7619 7620 static void 7621 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7622 { 7623 struct nvme_bdev_io *bio = ref; 7624 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7625 7626 bio->cpl = *cpl; 7627 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7628 } 7629 7630 static void 7631 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7632 { 7633 struct nvme_bdev_io *bio = ref; 7634 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7635 7636 bio->cpl = *cpl; 7637 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7638 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7639 } 7640 7641 static void 7642 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7643 { 7644 struct nvme_bdev_io *bio = ref; 7645 struct iovec *iov; 7646 7647 bio->iov_offset = sgl_offset; 7648 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7649 iov = &bio->iovs[bio->iovpos]; 7650 if (bio->iov_offset < iov->iov_len) { 7651 break; 7652 } 7653 7654 bio->iov_offset -= iov->iov_len; 7655 } 7656 } 7657 7658 static int 7659 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7660 { 7661 struct nvme_bdev_io *bio = ref; 7662 struct iovec *iov; 7663 7664 assert(bio->iovpos < bio->iovcnt); 7665 7666 iov = &bio->iovs[bio->iovpos]; 7667 7668 *address = iov->iov_base; 7669 *length = iov->iov_len; 7670 7671 if (bio->iov_offset) { 7672 assert(bio->iov_offset <= iov->iov_len); 7673 *address += bio->iov_offset; 7674 *length -= bio->iov_offset; 7675 } 7676 7677 bio->iov_offset += *length; 7678 if (bio->iov_offset == iov->iov_len) { 7679 bio->iovpos++; 7680 bio->iov_offset = 0; 7681 } 7682 7683 return 0; 7684 } 7685 7686 static void 7687 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7688 { 7689 struct nvme_bdev_io *bio = ref; 7690 struct iovec *iov; 7691 7692 bio->fused_iov_offset = sgl_offset; 
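/* Walk the fused-command iovec array until the iovec containing sgl_offset is
 * found; fused_iov_offset is left as the remaining offset within that iovec.
 */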
7693 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7694 iov = &bio->fused_iovs[bio->fused_iovpos]; 7695 if (bio->fused_iov_offset < iov->iov_len) { 7696 break; 7697 } 7698 7699 bio->fused_iov_offset -= iov->iov_len; 7700 } 7701 } 7702 7703 static int 7704 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7705 { 7706 struct nvme_bdev_io *bio = ref; 7707 struct iovec *iov; 7708 7709 assert(bio->fused_iovpos < bio->fused_iovcnt); 7710 7711 iov = &bio->fused_iovs[bio->fused_iovpos]; 7712 7713 *address = iov->iov_base; 7714 *length = iov->iov_len; 7715 7716 if (bio->fused_iov_offset) { 7717 assert(bio->fused_iov_offset <= iov->iov_len); 7718 *address += bio->fused_iov_offset; 7719 *length -= bio->fused_iov_offset; 7720 } 7721 7722 bio->fused_iov_offset += *length; 7723 if (bio->fused_iov_offset == iov->iov_len) { 7724 bio->fused_iovpos++; 7725 bio->fused_iov_offset = 0; 7726 } 7727 7728 return 0; 7729 } 7730 7731 static int 7732 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7733 void *md, uint64_t lba_count, uint64_t lba) 7734 { 7735 int rc; 7736 7737 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7738 lba_count, lba); 7739 7740 bio->iovs = iov; 7741 bio->iovcnt = iovcnt; 7742 bio->iovpos = 0; 7743 bio->iov_offset = 0; 7744 7745 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7746 bio->io_path->qpair->qpair, 7747 lba, lba_count, 7748 bdev_nvme_no_pi_readv_done, bio, 0, 7749 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7750 md, 0, 0); 7751 7752 if (rc != 0 && rc != -ENOMEM) { 7753 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7754 } 7755 return rc; 7756 } 7757 7758 static int 7759 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7760 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7761 struct spdk_memory_domain *domain, void *domain_ctx, 7762 struct spdk_accel_sequence *seq) 7763 { 7764 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7765 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7766 int rc; 7767 7768 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7769 lba_count, lba); 7770 7771 bio->iovs = iov; 7772 bio->iovcnt = iovcnt; 7773 bio->iovpos = 0; 7774 bio->iov_offset = 0; 7775 7776 if (domain != NULL || seq != NULL) { 7777 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7778 bio->ext_opts.memory_domain = domain; 7779 bio->ext_opts.memory_domain_ctx = domain_ctx; 7780 bio->ext_opts.io_flags = flags; 7781 bio->ext_opts.metadata = md; 7782 bio->ext_opts.accel_sequence = seq; 7783 7784 if (iovcnt == 1) { 7785 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7786 bio, &bio->ext_opts); 7787 } else { 7788 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7789 bdev_nvme_readv_done, bio, 7790 bdev_nvme_queued_reset_sgl, 7791 bdev_nvme_queued_next_sge, 7792 &bio->ext_opts); 7793 } 7794 } else if (iovcnt == 1) { 7795 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7796 md, lba, lba_count, bdev_nvme_readv_done, 7797 bio, flags, 0, 0); 7798 } else { 7799 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7800 bdev_nvme_readv_done, bio, flags, 7801 bdev_nvme_queued_reset_sgl, 7802 bdev_nvme_queued_next_sge, md, 0, 0); 7803 } 7804 7805 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7806 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7807 } 7808 
return rc; 7809 } 7810 7811 static int 7812 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7813 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7814 struct spdk_memory_domain *domain, void *domain_ctx, 7815 struct spdk_accel_sequence *seq, 7816 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 7817 { 7818 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7819 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7820 int rc; 7821 7822 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7823 lba_count, lba); 7824 7825 bio->iovs = iov; 7826 bio->iovcnt = iovcnt; 7827 bio->iovpos = 0; 7828 bio->iov_offset = 0; 7829 7830 if (domain != NULL || seq != NULL) { 7831 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7832 bio->ext_opts.memory_domain = domain; 7833 bio->ext_opts.memory_domain_ctx = domain_ctx; 7834 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 7835 bio->ext_opts.cdw13 = cdw13.raw; 7836 bio->ext_opts.metadata = md; 7837 bio->ext_opts.accel_sequence = seq; 7838 7839 if (iovcnt == 1) { 7840 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7841 bio, &bio->ext_opts); 7842 } else { 7843 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7844 bdev_nvme_writev_done, bio, 7845 bdev_nvme_queued_reset_sgl, 7846 bdev_nvme_queued_next_sge, 7847 &bio->ext_opts); 7848 } 7849 } else if (iovcnt == 1) { 7850 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7851 md, lba, lba_count, bdev_nvme_writev_done, 7852 bio, flags, 0, 0); 7853 } else { 7854 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7855 bdev_nvme_writev_done, bio, flags, 7856 bdev_nvme_queued_reset_sgl, 7857 bdev_nvme_queued_next_sge, md, 0, 0); 7858 } 7859 7860 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7861 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7862 } 7863 return rc; 7864 } 7865 7866 static int 7867 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7868 void *md, uint64_t lba_count, uint64_t zslba, 7869 uint32_t flags) 7870 { 7871 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7872 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7873 int rc; 7874 7875 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7876 lba_count, zslba); 7877 7878 bio->iovs = iov; 7879 bio->iovcnt = iovcnt; 7880 bio->iovpos = 0; 7881 bio->iov_offset = 0; 7882 7883 if (iovcnt == 1) { 7884 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7885 lba_count, 7886 bdev_nvme_zone_appendv_done, bio, 7887 flags, 7888 0, 0); 7889 } else { 7890 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7891 bdev_nvme_zone_appendv_done, bio, flags, 7892 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7893 md, 0, 0); 7894 } 7895 7896 if (rc != 0 && rc != -ENOMEM) { 7897 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7898 } 7899 return rc; 7900 } 7901 7902 static int 7903 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7904 void *md, uint64_t lba_count, uint64_t lba, 7905 uint32_t flags) 7906 { 7907 int rc; 7908 7909 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7910 lba_count, lba); 7911 7912 bio->iovs = iov; 7913 bio->iovcnt = iovcnt; 7914 bio->iovpos = 0; 7915 bio->iov_offset = 0; 7916 7917 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7918 bio->io_path->qpair->qpair, 7919 lba, lba_count, 7920 bdev_nvme_comparev_done, bio, flags, 7921 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7922 md, 0, 0); 7923 7924 if (rc != 0 && rc != -ENOMEM) { 7925 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7926 } 7927 return rc; 7928 } 7929 7930 static int 7931 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7932 struct iovec *write_iov, int write_iovcnt, 7933 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7934 { 7935 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7936 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7937 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7938 int rc; 7939 7940 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7941 lba_count, lba); 7942 7943 bio->iovs = cmp_iov; 7944 bio->iovcnt = cmp_iovcnt; 7945 bio->iovpos = 0; 7946 bio->iov_offset = 0; 7947 bio->fused_iovs = write_iov; 7948 bio->fused_iovcnt = write_iovcnt; 7949 bio->fused_iovpos = 0; 7950 bio->fused_iov_offset = 0; 7951 7952 if (bdev_io->num_retries == 0) { 7953 bio->first_fused_submitted = false; 7954 bio->first_fused_completed = false; 7955 } 7956 7957 if (!bio->first_fused_submitted) { 7958 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7959 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7960 7961 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7962 bdev_nvme_comparev_and_writev_done, bio, flags, 7963 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7964 if (rc == 0) { 7965 bio->first_fused_submitted = true; 7966 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7967 } else { 7968 if (rc != -ENOMEM) { 7969 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7970 } 7971 return rc; 7972 } 7973 } 7974 7975 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7976 7977 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7978 bdev_nvme_comparev_and_writev_done, bio, flags, 7979 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7980 if (rc != 0 && rc != -ENOMEM) { 7981 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7982 rc = 0; 7983 } 7984 7985 return rc; 7986 } 7987 7988 static int 7989 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7990 { 7991 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7992 struct spdk_nvme_dsm_range *range; 7993 uint64_t offset, remaining; 7994 uint64_t num_ranges_u64; 7995 uint16_t num_ranges; 7996 int rc; 7997 7998 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7999 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8000 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8001 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8002 return -EINVAL; 8003 } 8004 num_ranges = (uint16_t)num_ranges_u64; 8005 8006 offset = offset_blocks; 8007 remaining = num_blocks; 8008 range = &dsm_ranges[0]; 8009 8010 /* Fill max-size ranges until the remaining blocks fit into one range */ 8011 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8012 range->attributes.raw = 0; 8013 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8014 range->starting_lba = offset; 8015 8016 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8017 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8018 range++; 8019 } 8020 8021 /* Final range describes the remaining 
blocks */ 8022 range->attributes.raw = 0; 8023 range->length = remaining; 8024 range->starting_lba = offset; 8025 8026 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8027 bio->io_path->qpair->qpair, 8028 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8029 dsm_ranges, num_ranges, 8030 bdev_nvme_queued_done, bio); 8031 8032 return rc; 8033 } 8034 8035 static int 8036 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8037 { 8038 if (num_blocks > UINT16_MAX + 1) { 8039 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8040 return -EINVAL; 8041 } 8042 8043 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8044 bio->io_path->qpair->qpair, 8045 offset_blocks, num_blocks, 8046 bdev_nvme_queued_done, bio, 8047 0); 8048 } 8049 8050 static int 8051 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8052 struct spdk_bdev_zone_info *info) 8053 { 8054 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8055 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8056 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8057 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8058 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8059 8060 if (zone_id % zone_size != 0) { 8061 return -EINVAL; 8062 } 8063 8064 if (num_zones > total_zones || !num_zones) { 8065 return -EINVAL; 8066 } 8067 8068 assert(!bio->zone_report_buf); 8069 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8070 if (!bio->zone_report_buf) { 8071 return -ENOMEM; 8072 } 8073 8074 bio->handled_zones = 0; 8075 8076 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8077 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8078 bdev_nvme_get_zone_info_done, bio); 8079 } 8080 8081 static int 8082 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8083 enum spdk_bdev_zone_action action) 8084 { 8085 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8086 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8087 8088 switch (action) { 8089 case SPDK_BDEV_ZONE_CLOSE: 8090 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8091 bdev_nvme_zone_management_done, bio); 8092 case SPDK_BDEV_ZONE_FINISH: 8093 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8094 bdev_nvme_zone_management_done, bio); 8095 case SPDK_BDEV_ZONE_OPEN: 8096 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8097 bdev_nvme_zone_management_done, bio); 8098 case SPDK_BDEV_ZONE_RESET: 8099 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8100 bdev_nvme_zone_management_done, bio); 8101 case SPDK_BDEV_ZONE_OFFLINE: 8102 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8103 bdev_nvme_zone_management_done, bio); 8104 default: 8105 return -EINVAL; 8106 } 8107 } 8108 8109 static void 8110 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8111 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8112 { 8113 struct nvme_io_path *io_path; 8114 struct nvme_ctrlr *nvme_ctrlr; 8115 uint32_t max_xfer_size; 8116 int rc = -ENXIO; 8117 8118 /* Choose the first ctrlr which is not failed. */ 8119 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8120 nvme_ctrlr = io_path->qpair->ctrlr; 8121 8122 /* We should skip any unavailable nvme_ctrlr rather than checking 8123 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8124 */ 8125 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8126 continue; 8127 } 8128 8129 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8130 8131 if (nbytes > max_xfer_size) { 8132 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8133 rc = -EINVAL; 8134 goto err; 8135 } 8136 8137 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8138 bdev_nvme_admin_passthru_done, bio); 8139 if (rc == 0) { 8140 return; 8141 } 8142 } 8143 8144 err: 8145 bdev_nvme_admin_complete(bio, rc); 8146 } 8147 8148 static int 8149 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8150 void *buf, size_t nbytes) 8151 { 8152 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8153 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8154 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8155 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8156 8157 if (nbytes > max_xfer_size) { 8158 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8159 return -EINVAL; 8160 } 8161 8162 /* 8163 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8164 * so fill it out automatically. 8165 */ 8166 cmd->nsid = spdk_nvme_ns_get_id(ns); 8167 8168 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8169 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8170 } 8171 8172 static int 8173 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8174 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8175 { 8176 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8177 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8178 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8179 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8180 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8181 8182 if (nbytes > max_xfer_size) { 8183 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8184 return -EINVAL; 8185 } 8186 8187 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8188 SPDK_ERRLOG("invalid meta data buffer size\n"); 8189 return -EINVAL; 8190 } 8191 8192 /* 8193 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8194 * so fill it out automatically. 
8195 */ 8196 cmd->nsid = spdk_nvme_ns_get_id(ns); 8197 8198 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8199 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8200 } 8201 8202 static int 8203 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8204 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8205 size_t nbytes, void *md_buf, size_t md_len) 8206 { 8207 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8208 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8209 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8210 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8211 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8212 8213 bio->iovs = iov; 8214 bio->iovcnt = iovcnt; 8215 bio->iovpos = 0; 8216 bio->iov_offset = 0; 8217 8218 if (nbytes > max_xfer_size) { 8219 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8220 return -EINVAL; 8221 } 8222 8223 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8224 SPDK_ERRLOG("invalid meta data buffer size\n"); 8225 return -EINVAL; 8226 } 8227 8228 /* 8229 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8230 * require a nsid, so fill it out automatically. 8231 */ 8232 cmd->nsid = spdk_nvme_ns_get_id(ns); 8233 8234 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8235 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8236 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8237 } 8238 8239 static void 8240 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8241 struct nvme_bdev_io *bio_to_abort) 8242 { 8243 struct nvme_io_path *io_path; 8244 int rc = 0; 8245 8246 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8247 if (rc == 0) { 8248 bdev_nvme_admin_complete(bio, 0); 8249 return; 8250 } 8251 8252 io_path = bio_to_abort->io_path; 8253 if (io_path != NULL) { 8254 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8255 io_path->qpair->qpair, 8256 bio_to_abort, 8257 bdev_nvme_abort_done, bio); 8258 } else { 8259 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8260 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8261 NULL, 8262 bio_to_abort, 8263 bdev_nvme_abort_done, bio); 8264 8265 if (rc != -ENOENT) { 8266 break; 8267 } 8268 } 8269 } 8270 8271 if (rc != 0) { 8272 /* If no command was found or there was any error, complete the abort 8273 * request with failure. 
8274 */ 8275 bdev_nvme_admin_complete(bio, rc); 8276 } 8277 } 8278 8279 static int 8280 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8281 uint64_t num_blocks) 8282 { 8283 struct spdk_nvme_scc_source_range range = { 8284 .slba = src_offset_blocks, 8285 .nlb = num_blocks - 1 8286 }; 8287 8288 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8289 bio->io_path->qpair->qpair, 8290 &range, 1, dst_offset_blocks, 8291 bdev_nvme_queued_done, bio); 8292 } 8293 8294 static void 8295 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8296 { 8297 const char *action; 8298 uint32_t i; 8299 8300 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8301 action = "reset"; 8302 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8303 action = "abort"; 8304 } else { 8305 action = "none"; 8306 } 8307 8308 spdk_json_write_object_begin(w); 8309 8310 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8311 8312 spdk_json_write_named_object_begin(w, "params"); 8313 spdk_json_write_named_string(w, "action_on_timeout", action); 8314 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8315 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8316 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8317 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8318 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8319 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8320 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8321 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8322 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8323 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8324 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8325 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8326 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8327 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8328 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8329 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8330 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8331 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8332 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8333 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8334 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8335 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8336 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8337 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8338 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8339 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8340 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8341 for (i = 0; i < 32; ++i) { 8342 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8343 
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8344 } 8345 } 8346 spdk_json_write_array_end(w); 8347 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8348 for (i = 0; i < 32; ++i) { 8349 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8350 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8351 } 8352 } 8353 8354 spdk_json_write_array_end(w); 8355 spdk_json_write_object_end(w); 8356 8357 spdk_json_write_object_end(w); 8358 } 8359 8360 static void 8361 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8362 { 8363 struct spdk_nvme_transport_id trid; 8364 8365 spdk_json_write_object_begin(w); 8366 8367 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8368 8369 spdk_json_write_named_object_begin(w, "params"); 8370 spdk_json_write_named_string(w, "name", ctx->name); 8371 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8372 8373 trid = ctx->trid; 8374 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8375 nvme_bdev_dump_trid_json(&trid, w); 8376 8377 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8378 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8379 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8380 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8381 ctx->bdev_opts.fast_io_fail_timeout_sec); 8382 spdk_json_write_object_end(w); 8383 8384 spdk_json_write_object_end(w); 8385 } 8386 8387 #ifdef SPDK_CONFIG_NVME_CUSE 8388 static void 8389 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8390 struct nvme_ctrlr *nvme_ctrlr) 8391 { 8392 size_t cuse_name_size = 128; 8393 char cuse_name[cuse_name_size]; 8394 8395 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8396 cuse_name, &cuse_name_size) != 0) { 8397 return; 8398 } 8399 8400 spdk_json_write_object_begin(w); 8401 8402 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8403 8404 spdk_json_write_named_object_begin(w, "params"); 8405 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8406 spdk_json_write_object_end(w); 8407 8408 spdk_json_write_object_end(w); 8409 } 8410 #endif 8411 8412 static void 8413 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8414 struct nvme_ctrlr *nvme_ctrlr) 8415 { 8416 struct spdk_nvme_transport_id *trid; 8417 const struct spdk_nvme_ctrlr_opts *opts; 8418 8419 if (nvme_ctrlr->opts.from_discovery_service) { 8420 /* Do not emit an RPC for this - it will be implicitly 8421 * covered by a separate bdev_nvme_start_discovery or 8422 * bdev_nvme_start_mdns_discovery RPC. 
8423 */ 8424 return; 8425 } 8426 8427 trid = &nvme_ctrlr->active_path_id->trid; 8428 8429 spdk_json_write_object_begin(w); 8430 8431 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8432 8433 spdk_json_write_named_object_begin(w, "params"); 8434 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8435 nvme_bdev_dump_trid_json(trid, w); 8436 spdk_json_write_named_bool(w, "prchk_reftag", 8437 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8438 spdk_json_write_named_bool(w, "prchk_guard", 8439 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8440 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8441 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8442 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8443 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8444 if (nvme_ctrlr->psk != NULL) { 8445 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8446 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8447 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8448 } 8449 8450 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8451 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8452 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8453 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8454 if (opts->src_addr[0] != '\0') { 8455 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8456 } 8457 if (opts->src_svcid[0] != '\0') { 8458 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8459 } 8460 8461 spdk_json_write_object_end(w); 8462 8463 spdk_json_write_object_end(w); 8464 } 8465 8466 static void 8467 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8468 { 8469 spdk_json_write_object_begin(w); 8470 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8471 8472 spdk_json_write_named_object_begin(w, "params"); 8473 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8474 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8475 spdk_json_write_object_end(w); 8476 8477 spdk_json_write_object_end(w); 8478 } 8479 8480 static int 8481 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8482 { 8483 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8484 struct nvme_ctrlr *nvme_ctrlr; 8485 struct discovery_ctx *ctx; 8486 8487 bdev_nvme_opts_config_json(w); 8488 8489 pthread_mutex_lock(&g_bdev_nvme_mutex); 8490 8491 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8492 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8493 nvme_ctrlr_config_json(w, nvme_ctrlr); 8494 8495 #ifdef SPDK_CONFIG_NVME_CUSE 8496 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8497 #endif 8498 } 8499 } 8500 8501 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8502 if (!ctx->from_mdns_discovery_service) { 8503 bdev_nvme_discovery_config_json(w, ctx); 8504 } 8505 } 8506 8507 bdev_nvme_mdns_discovery_config_json(w); 8508 8509 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8510 * before enabling hotplug poller. 
8511 */ 8512 bdev_nvme_hotplug_config_json(w); 8513 8514 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8515 return 0; 8516 } 8517 8518 struct spdk_nvme_ctrlr * 8519 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8520 { 8521 struct nvme_bdev *nbdev; 8522 struct nvme_ns *nvme_ns; 8523 8524 if (!bdev || bdev->module != &nvme_if) { 8525 return NULL; 8526 } 8527 8528 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8529 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8530 assert(nvme_ns != NULL); 8531 8532 return nvme_ns->ctrlr->ctrlr; 8533 } 8534 8535 void 8536 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8537 { 8538 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8539 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8540 const struct spdk_nvme_ctrlr_data *cdata; 8541 const struct spdk_nvme_transport_id *trid; 8542 const struct nvme_bdev_channel *nbdev_ch; 8543 const char *adrfam_str; 8544 bool current; 8545 8546 spdk_json_write_object_begin(w); 8547 8548 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8549 8550 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8551 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8552 8553 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8554 nbdev_ch = io_path->nbdev_ch; 8555 if (nbdev_ch == NULL) { 8556 current = false; 8557 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8558 struct nvme_io_path *optimized_io_path = NULL; 8559 8560 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8561 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8562 break; 8563 } 8564 } 8565 8566 current = nvme_io_path_is_available(io_path); 8567 if (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE) { 8568 /* A non-optimized path is only current if there are no optimized paths. */ 8569 current = current && (optimized_io_path == NULL); 8570 } 8571 } else { 8572 if (nbdev_ch->current_io_path) { 8573 current = (io_path == nbdev_ch->current_io_path); 8574 } else { 8575 struct nvme_io_path *first_path; 8576 8577 /* We arrived here as there are no optimized paths for active-passive 8578 * mode. Check if this io_path is the first one available on the list. 
8579 */ 8580 current = false; 8581 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 8582 if (nvme_io_path_is_available(first_path)) { 8583 current = (io_path == first_path); 8584 break; 8585 } 8586 } 8587 } 8588 } 8589 spdk_json_write_named_bool(w, "current", current); 8590 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8591 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8592 8593 spdk_json_write_named_object_begin(w, "transport"); 8594 spdk_json_write_named_string(w, "trtype", trid->trstring); 8595 spdk_json_write_named_string(w, "traddr", trid->traddr); 8596 if (trid->trsvcid[0] != '\0') { 8597 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8598 } 8599 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8600 if (adrfam_str) { 8601 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8602 } 8603 spdk_json_write_object_end(w); 8604 8605 spdk_json_write_object_end(w); 8606 } 8607 8608 void 8609 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8610 { 8611 struct discovery_ctx *ctx; 8612 struct discovery_entry_ctx *entry_ctx; 8613 8614 spdk_json_write_array_begin(w); 8615 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8616 spdk_json_write_object_begin(w); 8617 spdk_json_write_named_string(w, "name", ctx->name); 8618 8619 spdk_json_write_named_object_begin(w, "trid"); 8620 nvme_bdev_dump_trid_json(&ctx->trid, w); 8621 spdk_json_write_object_end(w); 8622 8623 spdk_json_write_named_array_begin(w, "referrals"); 8624 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8625 spdk_json_write_object_begin(w); 8626 spdk_json_write_named_object_begin(w, "trid"); 8627 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8628 spdk_json_write_object_end(w); 8629 spdk_json_write_object_end(w); 8630 } 8631 spdk_json_write_array_end(w); 8632 8633 spdk_json_write_object_end(w); 8634 } 8635 spdk_json_write_array_end(w); 8636 } 8637 8638 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8639 8640 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8641 { 8642 struct spdk_trace_tpoint_opts opts[] = { 8643 { 8644 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8645 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8646 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8647 }, 8648 { 8649 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8650 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8651 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8652 } 8653 }; 8654 8655 8656 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8657 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8658 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8659 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8660 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8661 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8662 } 8663