1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 36 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 37 38 #define NSID_STR_LEN 10 39 40 #define SPDK_CONTROLLER_NAME_MAX 512 41 42 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 43 44 struct nvme_bdev_io { 45 /** Array of iovecs to transfer. */ 46 struct iovec *iovs; 47 48 /** Number of iovecs in iovs array. */ 49 int iovcnt; 50 51 /** Current iovec position. */ 52 int iovpos; 53 54 /** Offset in current iovec. */ 55 uint32_t iov_offset; 56 57 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 58 * being reset in a reset I/O. 59 */ 60 struct nvme_io_path *io_path; 61 62 /** Array of iovecs for the second command of a fused operation (e.g. compare-and-write). */ 63 struct iovec *fused_iovs; 64 65 /** Number of iovecs in fused_iovs array. */ 66 int fused_iovcnt; 67 68 /** Current iovec position in fused_iovs. */ 69 int fused_iovpos; 70 71 /** Offset in current fused iovec. */ 72 uint32_t fused_iov_offset; 73 74 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 75 struct spdk_nvme_cpl cpl; 76 77 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 78 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 79 80 /** Tracks whether the first of the fused commands has been submitted */ 81 bool first_fused_submitted; 82 83 /** Tracks whether the first of the fused commands has been completed */ 84 bool first_fused_completed; 85 86 /** Temporary pointer to zone report buffer */ 87 struct spdk_nvme_zns_zone_report *zone_report_buf; 88 89 /** Number of zones that have been copied to the spdk_bdev_zone_info struct */ 90 uint64_t handled_zones; 91 92 /** Expiration value in ticks to retry the current I/O. */ 93 uint64_t retry_ticks; 94 95 /* How many times the current I/O was retried. */ 96 int32_t retry_count; 97 98 /* Current tsc at submit time.
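 * Used together with the completion time to compute per-I/O latency for io_path statistics.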
*/ 99 uint64_t submit_tsc; 100 101 /* Used to put nvme_bdev_io into the list */ 102 TAILQ_ENTRY(nvme_bdev_io) retry_link; 103 }; 104 105 struct nvme_probe_skip_entry { 106 struct spdk_nvme_transport_id trid; 107 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 108 }; 109 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 110 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 111 g_skipped_nvme_ctrlrs); 112 113 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 114 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 116 117 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 120 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 121 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 122 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 123 124 static struct spdk_bdev_nvme_opts g_opts = { 125 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 126 .timeout_us = 0, 127 .timeout_admin_us = 0, 128 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 129 .transport_retry_count = 4, 130 .arbitration_burst = 0, 131 .low_priority_weight = 0, 132 .medium_priority_weight = 0, 133 .high_priority_weight = 0, 134 .nvme_adminq_poll_period_us = 10000ULL, 135 .nvme_ioq_poll_period_us = 0, 136 .io_queue_requests = 0, 137 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 138 .bdev_retry_count = 3, 139 .transport_ack_timeout = 0, 140 .ctrlr_loss_timeout_sec = 0, 141 .reconnect_delay_sec = 0, 142 .fast_io_fail_timeout_sec = 0, 143 .disable_auto_failback = false, 144 .generate_uuids = false, 145 .transport_tos = 0, 146 .nvme_error_stat = false, 147 .io_path_stat = false, 148 .allow_accel_sequence = false, 149 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 150 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 151 }; 152 153 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 154 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 155 156 static int g_hot_insert_nvme_controller_index = 0; 157 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 158 static bool g_nvme_hotplug_enabled = false; 159 struct spdk_thread *g_bdev_nvme_init_thread; 160 static struct spdk_poller *g_hotplug_poller; 161 static struct spdk_poller *g_hotplug_probe_poller; 162 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 163 164 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 165 struct nvme_async_probe_ctx *ctx); 166 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 167 struct nvme_async_probe_ctx *ctx); 168 static int bdev_nvme_library_init(void); 169 static void bdev_nvme_library_fini(void); 170 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 171 struct spdk_bdev_io *bdev_io); 172 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 173 struct spdk_bdev_io *bdev_io); 174 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 175 void *md, uint64_t lba_count, uint64_t lba, 176 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 177 struct spdk_accel_sequence *seq); 178 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 179 void *md, uint64_t lba_count, uint64_t lba); 180 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 181 void *md, uint64_t 
lba_count, uint64_t lba, 182 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 183 struct spdk_accel_sequence *seq, 184 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13); 185 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 186 void *md, uint64_t lba_count, 187 uint64_t zslba, uint32_t flags); 188 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 189 void *md, uint64_t lba_count, uint64_t lba, 190 uint32_t flags); 191 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 192 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 193 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 194 uint32_t flags); 195 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 196 uint32_t num_zones, struct spdk_bdev_zone_info *info); 197 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 198 enum spdk_bdev_zone_action action); 199 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 200 struct nvme_bdev_io *bio, 201 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 202 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 203 void *buf, size_t nbytes); 204 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 205 void *buf, size_t nbytes, void *md_buf, size_t md_len); 206 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 207 struct iovec *iov, int iovcnt, size_t nbytes, 208 void *md_buf, size_t md_len); 209 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 210 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 211 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 212 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 213 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 214 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 215 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 216 217 static struct nvme_ns *nvme_ns_alloc(void); 218 static void nvme_ns_free(struct nvme_ns *ns); 219 220 static int 221 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 222 { 223 return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; 224 } 225 226 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 227 228 struct spdk_nvme_qpair * 229 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 230 { 231 struct nvme_ctrlr_channel *ctrlr_ch; 232 233 assert(ctrlr_io_ch != NULL); 234 235 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 236 237 return ctrlr_ch->qpair->qpair; 238 } 239 240 static int 241 bdev_nvme_get_ctx_size(void) 242 { 243 return sizeof(struct nvme_bdev_io); 244 } 245 246 static struct spdk_bdev_module nvme_if = { 247 .name = "nvme", 248 .async_fini = true, 249 .module_init = bdev_nvme_library_init, 250 .module_fini = bdev_nvme_library_fini, 251 .config_json = bdev_nvme_config_json, 252 .get_ctx_size = bdev_nvme_get_ctx_size, 253 254 }; 255 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 256 257 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 258 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 259 bool g_bdev_nvme_module_finish; 260 261 struct nvme_bdev_ctrlr * 262 nvme_bdev_ctrlr_get_by_name(const char *name) 263 { 264 struct nvme_bdev_ctrlr *nbdev_ctrlr; 265 266 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 267 if (strcmp(name, nbdev_ctrlr->name) == 0) { 268 break; 269 } 270 } 271 272 return nbdev_ctrlr; 273 } 274 275 static struct nvme_ctrlr * 276 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 277 const struct spdk_nvme_transport_id *trid) 278 { 279 struct nvme_ctrlr *nvme_ctrlr; 280 281 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 282 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 283 break; 284 } 285 } 286 287 return nvme_ctrlr; 288 } 289 290 struct nvme_ctrlr * 291 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 292 uint16_t cntlid) 293 { 294 struct nvme_ctrlr *nvme_ctrlr; 295 const struct spdk_nvme_ctrlr_data *cdata; 296 297 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 298 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 299 if (cdata->cntlid == cntlid) { 300 break; 301 } 302 } 303 304 return nvme_ctrlr; 305 } 306 307 static struct nvme_bdev * 308 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 309 { 310 struct nvme_bdev *bdev; 311 312 pthread_mutex_lock(&g_bdev_nvme_mutex); 313 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 314 if (bdev->nsid == nsid) { 315 break; 316 } 317 } 318 pthread_mutex_unlock(&g_bdev_nvme_mutex); 319 320 return bdev; 321 } 322 323 struct nvme_ns * 324 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 325 { 326 struct nvme_ns ns; 327 328 assert(nsid > 0); 329 330 ns.id = nsid; 331 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 332 } 333 334 struct nvme_ns * 335 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 336 { 337 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 338 } 339 340 struct nvme_ns * 341 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 342 { 343 if (ns == NULL) { 344 return NULL; 345 } 346 347 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 348 } 349 350 static struct nvme_ctrlr * 351 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 352 { 353 struct nvme_bdev_ctrlr *nbdev_ctrlr; 354 struct nvme_ctrlr *nvme_ctrlr = NULL; 355 356 pthread_mutex_lock(&g_bdev_nvme_mutex); 357 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 358 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 359 if (nvme_ctrlr != NULL) { 360 break; 361 } 362 } 
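/* nvme_ctrlr remains NULL here if no registered controller matched the given trid. */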
363 pthread_mutex_unlock(&g_bdev_nvme_mutex); 364 365 return nvme_ctrlr; 366 } 367 368 struct nvme_ctrlr * 369 nvme_ctrlr_get_by_name(const char *name) 370 { 371 struct nvme_bdev_ctrlr *nbdev_ctrlr; 372 struct nvme_ctrlr *nvme_ctrlr = NULL; 373 374 if (name == NULL) { 375 return NULL; 376 } 377 378 pthread_mutex_lock(&g_bdev_nvme_mutex); 379 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 380 if (nbdev_ctrlr != NULL) { 381 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 382 } 383 pthread_mutex_unlock(&g_bdev_nvme_mutex); 384 385 return nvme_ctrlr; 386 } 387 388 void 389 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 390 { 391 struct nvme_bdev_ctrlr *nbdev_ctrlr; 392 393 pthread_mutex_lock(&g_bdev_nvme_mutex); 394 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 395 fn(nbdev_ctrlr, ctx); 396 } 397 pthread_mutex_unlock(&g_bdev_nvme_mutex); 398 } 399 400 void 401 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 402 { 403 const char *trtype_str; 404 const char *adrfam_str; 405 406 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 407 if (trtype_str) { 408 spdk_json_write_named_string(w, "trtype", trtype_str); 409 } 410 411 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 412 if (adrfam_str) { 413 spdk_json_write_named_string(w, "adrfam", adrfam_str); 414 } 415 416 if (trid->traddr[0] != '\0') { 417 spdk_json_write_named_string(w, "traddr", trid->traddr); 418 } 419 420 if (trid->trsvcid[0] != '\0') { 421 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 422 } 423 424 if (trid->subnqn[0] != '\0') { 425 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 426 } 427 } 428 429 static void 430 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 431 struct nvme_ctrlr *nvme_ctrlr) 432 { 433 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 434 pthread_mutex_lock(&g_bdev_nvme_mutex); 435 436 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 437 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 438 pthread_mutex_unlock(&g_bdev_nvme_mutex); 439 440 return; 441 } 442 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 443 444 pthread_mutex_unlock(&g_bdev_nvme_mutex); 445 446 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 447 448 free(nbdev_ctrlr->name); 449 free(nbdev_ctrlr); 450 } 451 452 static void 453 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 454 { 455 struct nvme_path_id *path_id, *tmp_path; 456 struct nvme_ns *ns, *tmp_ns; 457 458 free(nvme_ctrlr->copied_ana_desc); 459 spdk_free(nvme_ctrlr->ana_log_page); 460 461 if (nvme_ctrlr->opal_dev) { 462 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 463 nvme_ctrlr->opal_dev = NULL; 464 } 465 466 if (nvme_ctrlr->nbdev_ctrlr) { 467 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 468 } 469 470 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 471 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 472 nvme_ns_free(ns); 473 } 474 475 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 476 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 477 free(path_id); 478 } 479 480 pthread_mutex_destroy(&nvme_ctrlr->mutex); 481 spdk_keyring_put_key(nvme_ctrlr->psk); 482 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 483 spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key); 484 free(nvme_ctrlr); 485 486 pthread_mutex_lock(&g_bdev_nvme_mutex); 487 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 488 pthread_mutex_unlock(&g_bdev_nvme_mutex); 489 
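/* This was the last controller and module finish has been requested, so release the
 * global io_device and tell the bdev layer that module shutdown is complete.
 */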
spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 490 spdk_bdev_module_fini_done(); 491 return; 492 } 493 pthread_mutex_unlock(&g_bdev_nvme_mutex); 494 } 495 496 static int 497 nvme_detach_poller(void *arg) 498 { 499 struct nvme_ctrlr *nvme_ctrlr = arg; 500 int rc; 501 502 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 503 if (rc != -EAGAIN) { 504 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 505 _nvme_ctrlr_delete(nvme_ctrlr); 506 } 507 508 return SPDK_POLLER_BUSY; 509 } 510 511 static void 512 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 513 { 514 int rc; 515 516 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 517 518 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 519 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 520 521 /* If we got here, the reset/detach poller cannot be active */ 522 assert(nvme_ctrlr->reset_detach_poller == NULL); 523 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 524 nvme_ctrlr, 1000); 525 if (nvme_ctrlr->reset_detach_poller == NULL) { 526 SPDK_ERRLOG("Failed to register detach poller\n"); 527 goto error; 528 } 529 530 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 531 if (rc != 0) { 532 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 533 goto error; 534 } 535 536 return; 537 error: 538 /* We don't have a good way to handle errors here, so just do what we can and delete the 539 * controller without detaching the underlying NVMe device. 540 */ 541 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 542 _nvme_ctrlr_delete(nvme_ctrlr); 543 } 544 545 static void 546 nvme_ctrlr_unregister_cb(void *io_device) 547 { 548 struct nvme_ctrlr *nvme_ctrlr = io_device; 549 550 nvme_ctrlr_delete(nvme_ctrlr); 551 } 552 553 static void 554 nvme_ctrlr_unregister(void *ctx) 555 { 556 struct nvme_ctrlr *nvme_ctrlr = ctx; 557 558 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 559 } 560 561 static bool 562 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 563 { 564 if (!nvme_ctrlr->destruct) { 565 return false; 566 } 567 568 if (nvme_ctrlr->ref > 0) { 569 return false; 570 } 571 572 if (nvme_ctrlr->resetting) { 573 return false; 574 } 575 576 if (nvme_ctrlr->ana_log_page_updating) { 577 return false; 578 } 579 580 if (nvme_ctrlr->io_path_cache_clearing) { 581 return false; 582 } 583 584 return true; 585 } 586 587 static void 588 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 589 { 590 pthread_mutex_lock(&nvme_ctrlr->mutex); 591 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 592 593 assert(nvme_ctrlr->ref > 0); 594 nvme_ctrlr->ref--; 595 596 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 597 pthread_mutex_unlock(&nvme_ctrlr->mutex); 598 return; 599 } 600 601 pthread_mutex_unlock(&nvme_ctrlr->mutex); 602 603 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 604 } 605 606 static void 607 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 608 { 609 nbdev_ch->current_io_path = NULL; 610 nbdev_ch->rr_counter = 0; 611 } 612 613 static struct nvme_io_path * 614 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 615 { 616 struct nvme_io_path *io_path; 617 618 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 619 if (io_path->nvme_ns == nvme_ns) { 620 break; 621 } 622 } 623 624 return io_path; 625 } 626 627 static struct nvme_io_path * 628 nvme_io_path_alloc(void) 629 { 
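/* The per-path statistics block below is only allocated when the io_path_stat module
 * option is enabled (e.g. via bdev_nvme_set_options), so setups that do not request it
 * avoid the extra memory and accounting cost.
 */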
630 struct nvme_io_path *io_path; 631 632 io_path = calloc(1, sizeof(*io_path)); 633 if (io_path == NULL) { 634 SPDK_ERRLOG("Failed to alloc io_path.\n"); 635 return NULL; 636 } 637 638 if (g_opts.io_path_stat) { 639 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 640 if (io_path->stat == NULL) { 641 free(io_path); 642 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 643 return NULL; 644 } 645 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 646 } 647 648 return io_path; 649 } 650 651 static void 652 nvme_io_path_free(struct nvme_io_path *io_path) 653 { 654 free(io_path->stat); 655 free(io_path); 656 } 657 658 static int 659 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 660 { 661 struct nvme_io_path *io_path; 662 struct spdk_io_channel *ch; 663 struct nvme_ctrlr_channel *ctrlr_ch; 664 struct nvme_qpair *nvme_qpair; 665 666 io_path = nvme_io_path_alloc(); 667 if (io_path == NULL) { 668 return -ENOMEM; 669 } 670 671 io_path->nvme_ns = nvme_ns; 672 673 ch = spdk_get_io_channel(nvme_ns->ctrlr); 674 if (ch == NULL) { 675 nvme_io_path_free(io_path); 676 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 677 return -ENOMEM; 678 } 679 680 ctrlr_ch = spdk_io_channel_get_ctx(ch); 681 682 nvme_qpair = ctrlr_ch->qpair; 683 assert(nvme_qpair != NULL); 684 685 io_path->qpair = nvme_qpair; 686 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 687 688 io_path->nbdev_ch = nbdev_ch; 689 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 690 691 bdev_nvme_clear_current_io_path(nbdev_ch); 692 693 return 0; 694 } 695 696 static void 697 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 698 struct nvme_io_path *io_path) 699 { 700 struct nvme_bdev_io *bio; 701 702 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 703 if (bio->io_path == io_path) { 704 bio->io_path = NULL; 705 } 706 } 707 } 708 709 static void 710 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 711 { 712 struct spdk_io_channel *ch; 713 struct nvme_qpair *nvme_qpair; 714 struct nvme_ctrlr_channel *ctrlr_ch; 715 struct nvme_bdev *nbdev; 716 717 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 718 719 /* Add the statistics to nvme_ns before this path is destroyed. */ 720 pthread_mutex_lock(&nbdev->mutex); 721 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 722 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 723 } 724 pthread_mutex_unlock(&nbdev->mutex); 725 726 bdev_nvme_clear_current_io_path(nbdev_ch); 727 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 728 729 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 730 io_path->nbdev_ch = NULL; 731 732 nvme_qpair = io_path->qpair; 733 assert(nvme_qpair != NULL); 734 735 ctrlr_ch = nvme_qpair->ctrlr_ch; 736 assert(ctrlr_ch != NULL); 737 738 ch = spdk_io_channel_from_ctx(ctrlr_ch); 739 spdk_put_io_channel(ch); 740 741 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 742 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 743 * io_path here but free the io_path when the associated qpair is freed. It is ensured 744 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
745 */ 746 } 747 748 static void 749 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 750 { 751 struct nvme_io_path *io_path, *tmp_io_path; 752 753 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 754 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 755 } 756 } 757 758 static int 759 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 760 { 761 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 762 struct nvme_bdev *nbdev = io_device; 763 struct nvme_ns *nvme_ns; 764 int rc; 765 766 STAILQ_INIT(&nbdev_ch->io_path_list); 767 TAILQ_INIT(&nbdev_ch->retry_io_list); 768 769 pthread_mutex_lock(&nbdev->mutex); 770 771 nbdev_ch->mp_policy = nbdev->mp_policy; 772 nbdev_ch->mp_selector = nbdev->mp_selector; 773 nbdev_ch->rr_min_io = nbdev->rr_min_io; 774 775 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 776 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 777 if (rc != 0) { 778 pthread_mutex_unlock(&nbdev->mutex); 779 780 _bdev_nvme_delete_io_paths(nbdev_ch); 781 return rc; 782 } 783 } 784 pthread_mutex_unlock(&nbdev->mutex); 785 786 return 0; 787 } 788 789 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 790 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 791 */ 792 static inline void 793 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 794 const struct spdk_nvme_cpl *cpl) 795 { 796 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 797 (uintptr_t)bdev_io); 798 if (cpl) { 799 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 800 } else { 801 spdk_bdev_io_complete(bdev_io, status); 802 } 803 } 804 805 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 806 807 static void 808 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 809 { 810 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 811 812 bdev_nvme_abort_retry_ios(nbdev_ch); 813 _bdev_nvme_delete_io_paths(nbdev_ch); 814 } 815 816 static inline bool 817 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 818 { 819 switch (io_type) { 820 case SPDK_BDEV_IO_TYPE_RESET: 821 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 822 case SPDK_BDEV_IO_TYPE_ABORT: 823 return true; 824 default: 825 break; 826 } 827 828 return false; 829 } 830 831 static inline bool 832 nvme_ns_is_active(struct nvme_ns *nvme_ns) 833 { 834 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 835 return false; 836 } 837 838 if (spdk_unlikely(nvme_ns->ns == NULL)) { 839 return false; 840 } 841 842 return true; 843 } 844 845 static inline bool 846 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 847 { 848 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 849 return false; 850 } 851 852 switch (nvme_ns->ana_state) { 853 case SPDK_NVME_ANA_OPTIMIZED_STATE: 854 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 855 return true; 856 default: 857 break; 858 } 859 860 return false; 861 } 862 863 static inline bool 864 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 865 { 866 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 867 return false; 868 } 869 870 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 871 SPDK_NVME_QPAIR_FAILURE_NONE)) { 872 return false; 873 } 874 875 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 876 return false; 877 } 878 879 return true; 880 } 881 882 static inline bool 883 nvme_io_path_is_available(struct nvme_io_path *io_path) 884 { 885 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 
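/* A path whose qpair is not connected cannot serve I/O, regardless of its ANA state. */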
886 return false; 887 } 888 889 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 890 return false; 891 } 892 893 return true; 894 } 895 896 static inline bool 897 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 898 { 899 if (nvme_ctrlr->destruct) { 900 return true; 901 } 902 903 if (nvme_ctrlr->fast_io_fail_timedout) { 904 return true; 905 } 906 907 if (nvme_ctrlr->resetting) { 908 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 909 return false; 910 } else { 911 return true; 912 } 913 } 914 915 if (nvme_ctrlr->reconnect_is_delayed) { 916 return false; 917 } 918 919 if (nvme_ctrlr->disabled) { 920 return true; 921 } 922 923 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 924 return true; 925 } else { 926 return false; 927 } 928 } 929 930 static bool 931 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 932 { 933 if (nvme_ctrlr->destruct) { 934 return false; 935 } 936 937 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 938 return false; 939 } 940 941 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 942 return false; 943 } 944 945 if (nvme_ctrlr->disabled) { 946 return false; 947 } 948 949 return true; 950 } 951 952 /* Simulate circular linked list. */ 953 static inline struct nvme_io_path * 954 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 955 { 956 struct nvme_io_path *next_path; 957 958 if (prev_path != NULL) { 959 next_path = STAILQ_NEXT(prev_path, stailq); 960 if (next_path != NULL) { 961 return next_path; 962 } 963 } 964 965 return STAILQ_FIRST(&nbdev_ch->io_path_list); 966 } 967 968 static struct nvme_io_path * 969 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 970 { 971 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 972 973 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 974 975 io_path = start; 976 do { 977 if (spdk_likely(nvme_io_path_is_available(io_path))) { 978 switch (io_path->nvme_ns->ana_state) { 979 case SPDK_NVME_ANA_OPTIMIZED_STATE: 980 nbdev_ch->current_io_path = io_path; 981 return io_path; 982 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 983 if (non_optimized == NULL) { 984 non_optimized = io_path; 985 } 986 break; 987 default: 988 assert(false); 989 break; 990 } 991 } 992 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 993 } while (io_path != start); 994 995 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 996 /* We come here only if there is no optimized path. Cache even non_optimized 997 * path for load balance across multiple non_optimized paths. 998 */ 999 nbdev_ch->current_io_path = non_optimized; 1000 } 1001 1002 return non_optimized; 1003 } 1004 1005 static struct nvme_io_path * 1006 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1007 { 1008 struct nvme_io_path *io_path; 1009 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1010 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1011 uint32_t num_outstanding_reqs; 1012 1013 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1014 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1015 /* The device is currently resetting. 
*/ 1016 continue; 1017 } 1018 1019 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1020 continue; 1021 } 1022 1023 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1024 switch (io_path->nvme_ns->ana_state) { 1025 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1026 if (num_outstanding_reqs < opt_min_qd) { 1027 opt_min_qd = num_outstanding_reqs; 1028 optimized = io_path; 1029 } 1030 break; 1031 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1032 if (num_outstanding_reqs < non_opt_min_qd) { 1033 non_opt_min_qd = num_outstanding_reqs; 1034 non_optimized = io_path; 1035 } 1036 break; 1037 default: 1038 break; 1039 } 1040 } 1041 1042 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1043 if (optimized != NULL) { 1044 return optimized; 1045 } 1046 1047 return non_optimized; 1048 } 1049 1050 static inline struct nvme_io_path * 1051 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1052 { 1053 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1054 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1055 return nbdev_ch->current_io_path; 1056 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1057 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1058 return nbdev_ch->current_io_path; 1059 } 1060 nbdev_ch->rr_counter = 0; 1061 } 1062 } 1063 1064 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1065 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1066 return _bdev_nvme_find_io_path(nbdev_ch); 1067 } else { 1068 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1069 } 1070 } 1071 1072 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1073 * or false otherwise. 1074 * 1075 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1076 * is likely to be non-accessible now but may become accessible. 1077 * 1078 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1079 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1080 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1081 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
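 * In short, returning true tells the caller to keep the I/O queued for a retry instead of
 * failing it immediately.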
1082 */ 1083 static bool 1084 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1085 { 1086 struct nvme_io_path *io_path; 1087 1088 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1089 if (io_path->nvme_ns->ana_transition_timedout) { 1090 continue; 1091 } 1092 1093 if (nvme_qpair_is_connected(io_path->qpair) || 1094 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1095 return true; 1096 } 1097 } 1098 1099 return false; 1100 } 1101 1102 static void 1103 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1104 { 1105 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1106 struct spdk_io_channel *ch; 1107 1108 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1109 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1110 } else { 1111 ch = spdk_io_channel_from_ctx(nbdev_ch); 1112 bdev_nvme_submit_request(ch, bdev_io); 1113 } 1114 } 1115 1116 static int 1117 bdev_nvme_retry_ios(void *arg) 1118 { 1119 struct nvme_bdev_channel *nbdev_ch = arg; 1120 struct nvme_bdev_io *bio, *tmp_bio; 1121 uint64_t now, delay_us; 1122 1123 now = spdk_get_ticks(); 1124 1125 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1126 if (bio->retry_ticks > now) { 1127 break; 1128 } 1129 1130 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1131 1132 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1133 } 1134 1135 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1136 1137 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1138 if (bio != NULL) { 1139 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1140 1141 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1142 delay_us); 1143 } 1144 1145 return SPDK_POLLER_BUSY; 1146 } 1147 1148 static void 1149 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1150 struct nvme_bdev_io *bio, uint64_t delay_ms) 1151 { 1152 struct nvme_bdev_io *tmp_bio; 1153 1154 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1155 1156 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1157 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1158 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1159 retry_link); 1160 return; 1161 } 1162 } 1163 1164 /* No earlier I/Os were found. This I/O must be the new head. 
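 * The retry list is kept sorted by retry_ticks (delay_ms is converted to ticks as
 * delay_ms * ticks_hz / 1000), which lets bdev_nvme_retry_ios() stop scanning at the first
 * I/O whose deadline has not yet expired.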
*/ 1165 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1166 1167 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1168 1169 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1170 delay_ms * 1000ULL); 1171 } 1172 1173 static void 1174 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1175 { 1176 struct nvme_bdev_io *bio, *tmp_bio; 1177 1178 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1179 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1180 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1181 } 1182 1183 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1184 } 1185 1186 static int 1187 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1188 struct nvme_bdev_io *bio_to_abort) 1189 { 1190 struct nvme_bdev_io *bio; 1191 1192 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1193 if (bio == bio_to_abort) { 1194 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1195 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1196 return 0; 1197 } 1198 } 1199 1200 return -ENOENT; 1201 } 1202 1203 static void 1204 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1205 { 1206 struct nvme_bdev *nbdev; 1207 uint16_t sct, sc; 1208 1209 assert(spdk_nvme_cpl_is_error(cpl)); 1210 1211 nbdev = bdev_io->bdev->ctxt; 1212 1213 if (nbdev->err_stat == NULL) { 1214 return; 1215 } 1216 1217 sct = cpl->status.sct; 1218 sc = cpl->status.sc; 1219 1220 pthread_mutex_lock(&nbdev->mutex); 1221 1222 nbdev->err_stat->status_type[sct]++; 1223 switch (sct) { 1224 case SPDK_NVME_SCT_GENERIC: 1225 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1226 case SPDK_NVME_SCT_MEDIA_ERROR: 1227 case SPDK_NVME_SCT_PATH: 1228 nbdev->err_stat->status[sct][sc]++; 1229 break; 1230 default: 1231 break; 1232 } 1233 1234 pthread_mutex_unlock(&nbdev->mutex); 1235 } 1236 1237 static inline void 1238 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1239 { 1240 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1241 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1242 uint32_t blocklen = bdev_io->bdev->blocklen; 1243 struct spdk_bdev_io_stat *stat; 1244 uint64_t tsc_diff; 1245 1246 if (bio->io_path->stat == NULL) { 1247 return; 1248 } 1249 1250 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1251 stat = bio->io_path->stat; 1252 1253 switch (bdev_io->type) { 1254 case SPDK_BDEV_IO_TYPE_READ: 1255 stat->bytes_read += num_blocks * blocklen; 1256 stat->num_read_ops++; 1257 stat->read_latency_ticks += tsc_diff; 1258 if (stat->max_read_latency_ticks < tsc_diff) { 1259 stat->max_read_latency_ticks = tsc_diff; 1260 } 1261 if (stat->min_read_latency_ticks > tsc_diff) { 1262 stat->min_read_latency_ticks = tsc_diff; 1263 } 1264 break; 1265 case SPDK_BDEV_IO_TYPE_WRITE: 1266 stat->bytes_written += num_blocks * blocklen; 1267 stat->num_write_ops++; 1268 stat->write_latency_ticks += tsc_diff; 1269 if (stat->max_write_latency_ticks < tsc_diff) { 1270 stat->max_write_latency_ticks = tsc_diff; 1271 } 1272 if (stat->min_write_latency_ticks > tsc_diff) { 1273 stat->min_write_latency_ticks = tsc_diff; 1274 } 1275 break; 1276 case SPDK_BDEV_IO_TYPE_UNMAP: 1277 stat->bytes_unmapped += num_blocks * blocklen; 1278 stat->num_unmap_ops++; 1279 stat->unmap_latency_ticks += tsc_diff; 1280 if (stat->max_unmap_latency_ticks < tsc_diff) { 1281 stat->max_unmap_latency_ticks = tsc_diff; 1282 } 1283 if (stat->min_unmap_latency_ticks > 
tsc_diff) { 1284 stat->min_unmap_latency_ticks = tsc_diff; 1285 } 1286 break; 1287 case SPDK_BDEV_IO_TYPE_ZCOPY: 1288 /* Track the data in the start phase only */ 1289 if (!bdev_io->u.bdev.zcopy.start) { 1290 break; 1291 } 1292 if (bdev_io->u.bdev.zcopy.populate) { 1293 stat->bytes_read += num_blocks * blocklen; 1294 stat->num_read_ops++; 1295 stat->read_latency_ticks += tsc_diff; 1296 if (stat->max_read_latency_ticks < tsc_diff) { 1297 stat->max_read_latency_ticks = tsc_diff; 1298 } 1299 if (stat->min_read_latency_ticks > tsc_diff) { 1300 stat->min_read_latency_ticks = tsc_diff; 1301 } 1302 } else { 1303 stat->bytes_written += num_blocks * blocklen; 1304 stat->num_write_ops++; 1305 stat->write_latency_ticks += tsc_diff; 1306 if (stat->max_write_latency_ticks < tsc_diff) { 1307 stat->max_write_latency_ticks = tsc_diff; 1308 } 1309 if (stat->min_write_latency_ticks > tsc_diff) { 1310 stat->min_write_latency_ticks = tsc_diff; 1311 } 1312 } 1313 break; 1314 case SPDK_BDEV_IO_TYPE_COPY: 1315 stat->bytes_copied += num_blocks * blocklen; 1316 stat->num_copy_ops++; 1317 stat->copy_latency_ticks += tsc_diff; 1318 if (stat->max_copy_latency_ticks < tsc_diff) { 1319 stat->max_copy_latency_ticks = tsc_diff; 1320 } 1321 if (stat->min_copy_latency_ticks > tsc_diff) { 1322 stat->min_copy_latency_ticks = tsc_diff; 1323 } 1324 break; 1325 default: 1326 break; 1327 } 1328 } 1329 1330 static bool 1331 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1332 const struct spdk_nvme_cpl *cpl, 1333 struct nvme_bdev_channel *nbdev_ch, 1334 uint64_t *_delay_ms) 1335 { 1336 struct nvme_io_path *io_path = bio->io_path; 1337 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1338 const struct spdk_nvme_ctrlr_data *cdata; 1339 1340 if (spdk_nvme_cpl_is_path_error(cpl) || 1341 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1342 !nvme_io_path_is_available(io_path) || 1343 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1344 bdev_nvme_clear_current_io_path(nbdev_ch); 1345 bio->io_path = NULL; 1346 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1347 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1348 io_path->nvme_ns->ana_state_updating = true; 1349 } 1350 } 1351 if (!any_io_path_may_become_available(nbdev_ch)) { 1352 return false; 1353 } 1354 *_delay_ms = 0; 1355 } else { 1356 bio->retry_count++; 1357 1358 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1359 1360 if (cpl->status.crd != 0) { 1361 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1362 } else { 1363 *_delay_ms = 0; 1364 } 1365 } 1366 1367 return true; 1368 } 1369 1370 static inline void 1371 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1372 const struct spdk_nvme_cpl *cpl) 1373 { 1374 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1375 struct nvme_bdev_channel *nbdev_ch; 1376 uint64_t delay_ms; 1377 1378 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1379 1380 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1381 bdev_nvme_update_io_path_stat(bio); 1382 goto complete; 1383 } 1384 1385 /* Update error counts before deciding if retry is needed. 1386 * Hence, error counts may be more than the number of I/O errors. 
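 * For example, an I/O that fails twice and then succeeds on a retry bumps the error
 * counters twice even though the bdev layer ultimately sees a successful I/O.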
1387 */ 1388 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1389 1390 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1391 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1392 goto complete; 1393 } 1394 1395 /* At this point we don't know whether the sequence was successfully executed or not, so we 1396 * cannot retry the IO */ 1397 if (bdev_io->u.bdev.accel_sequence != NULL) { 1398 goto complete; 1399 } 1400 1401 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1402 1403 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1404 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1405 return; 1406 } 1407 1408 complete: 1409 bio->retry_count = 0; 1410 bio->submit_tsc = 0; 1411 bdev_io->u.bdev.accel_sequence = NULL; 1412 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1413 } 1414 1415 static inline void 1416 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1417 { 1418 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1419 struct nvme_bdev_channel *nbdev_ch; 1420 enum spdk_bdev_io_status io_status; 1421 1422 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1423 1424 switch (rc) { 1425 case 0: 1426 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1427 break; 1428 case -ENOMEM: 1429 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1430 break; 1431 case -ENXIO: 1432 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1433 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1434 1435 bdev_nvme_clear_current_io_path(nbdev_ch); 1436 bio->io_path = NULL; 1437 1438 if (any_io_path_may_become_available(nbdev_ch)) { 1439 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1440 return; 1441 } 1442 } 1443 1444 /* fallthrough */ 1445 default: 1446 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1447 bdev_io->u.bdev.accel_sequence = NULL; 1448 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1449 break; 1450 } 1451 1452 bio->retry_count = 0; 1453 bio->submit_tsc = 0; 1454 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1455 } 1456 1457 static inline void 1458 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1459 { 1460 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1461 enum spdk_bdev_io_status io_status; 1462 1463 switch (rc) { 1464 case 0: 1465 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1466 break; 1467 case -ENOMEM: 1468 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1469 break; 1470 case -ENXIO: 1471 /* fallthrough */ 1472 default: 1473 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1474 break; 1475 } 1476 1477 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1478 } 1479 1480 static void 1481 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1482 { 1483 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1484 1485 pthread_mutex_lock(&nvme_ctrlr->mutex); 1486 1487 assert(nvme_ctrlr->io_path_cache_clearing == true); 1488 nvme_ctrlr->io_path_cache_clearing = false; 1489 1490 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1491 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1492 return; 1493 } 1494 1495 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1496 1497 nvme_ctrlr_unregister(nvme_ctrlr); 1498 } 1499 1500 static void 1501 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1502 { 1503 struct nvme_io_path *io_path; 1504 1505 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1506 if (io_path->nbdev_ch == NULL) { 1507 continue; 1508 } 1509 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1510 
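/* Dropping the cached pointer forces the next I/O on that bdev channel to re-run path selection. */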
} 1511 } 1512 1513 static void 1514 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1515 { 1516 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1517 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1518 1519 assert(ctrlr_ch->qpair != NULL); 1520 1521 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1522 1523 spdk_for_each_channel_continue(i, 0); 1524 } 1525 1526 static void 1527 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1528 { 1529 pthread_mutex_lock(&nvme_ctrlr->mutex); 1530 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1531 nvme_ctrlr->io_path_cache_clearing) { 1532 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1533 return; 1534 } 1535 1536 nvme_ctrlr->io_path_cache_clearing = true; 1537 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1538 1539 spdk_for_each_channel(nvme_ctrlr, 1540 bdev_nvme_clear_io_path_cache, 1541 NULL, 1542 bdev_nvme_clear_io_path_caches_done); 1543 } 1544 1545 static struct nvme_qpair * 1546 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1547 { 1548 struct nvme_qpair *nvme_qpair; 1549 1550 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1551 if (nvme_qpair->qpair == qpair) { 1552 break; 1553 } 1554 } 1555 1556 return nvme_qpair; 1557 } 1558 1559 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1560 1561 static void 1562 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1563 { 1564 struct nvme_poll_group *group = poll_group_ctx; 1565 struct nvme_qpair *nvme_qpair; 1566 struct nvme_ctrlr_channel *ctrlr_ch; 1567 int status; 1568 1569 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1570 if (nvme_qpair == NULL) { 1571 return; 1572 } 1573 1574 if (nvme_qpair->qpair != NULL) { 1575 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1576 nvme_qpair->qpair = NULL; 1577 } 1578 1579 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1580 1581 ctrlr_ch = nvme_qpair->ctrlr_ch; 1582 1583 if (ctrlr_ch != NULL) { 1584 if (ctrlr_ch->reset_iter != NULL) { 1585 /* We are in a full reset sequence. */ 1586 if (ctrlr_ch->connect_poller != NULL) { 1587 /* The qpair failed to connect. Abort the reset sequence. */ 1588 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1589 qpair); 1590 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1591 status = -1; 1592 } else { 1593 /* The qpair finished disconnecting. Just move to the next ctrlr_channel. */ 1594 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1595 qpair); 1596 status = 0; 1597 } 1598 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1599 ctrlr_ch->reset_iter = NULL; 1600 } else { 1601 /* The qpair was disconnected unexpectedly. Reset the controller for recovery. */ 1602 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1603 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1604 } 1605 } else { 1606 /* In this case, the ctrlr_channel has already been deleted. */ 1607 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed.
delete nvme_qpair.\n", qpair); 1608 nvme_qpair_delete(nvme_qpair); 1609 } 1610 } 1611 1612 static void 1613 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1614 { 1615 struct nvme_qpair *nvme_qpair; 1616 1617 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1618 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1619 continue; 1620 } 1621 1622 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1623 SPDK_NVME_QPAIR_FAILURE_NONE) { 1624 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1625 } 1626 } 1627 } 1628 1629 static int 1630 bdev_nvme_poll(void *arg) 1631 { 1632 struct nvme_poll_group *group = arg; 1633 int64_t num_completions; 1634 1635 if (group->collect_spin_stat && group->start_ticks == 0) { 1636 group->start_ticks = spdk_get_ticks(); 1637 } 1638 1639 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1640 bdev_nvme_disconnected_qpair_cb); 1641 if (group->collect_spin_stat) { 1642 if (num_completions > 0) { 1643 if (group->end_ticks != 0) { 1644 group->spin_ticks += (group->end_ticks - group->start_ticks); 1645 group->end_ticks = 0; 1646 } 1647 group->start_ticks = 0; 1648 } else { 1649 group->end_ticks = spdk_get_ticks(); 1650 } 1651 } 1652 1653 if (spdk_unlikely(num_completions < 0)) { 1654 bdev_nvme_check_io_qpairs(group); 1655 } 1656 1657 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1658 } 1659 1660 static int bdev_nvme_poll_adminq(void *arg); 1661 1662 static void 1663 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1664 { 1665 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1666 1667 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1668 nvme_ctrlr, new_period_us); 1669 } 1670 1671 static int 1672 bdev_nvme_poll_adminq(void *arg) 1673 { 1674 int32_t rc; 1675 struct nvme_ctrlr *nvme_ctrlr = arg; 1676 nvme_ctrlr_disconnected_cb disconnected_cb; 1677 1678 assert(nvme_ctrlr != NULL); 1679 1680 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1681 if (rc < 0) { 1682 disconnected_cb = nvme_ctrlr->disconnected_cb; 1683 nvme_ctrlr->disconnected_cb = NULL; 1684 1685 if (disconnected_cb != NULL) { 1686 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1687 g_opts.nvme_adminq_poll_period_us); 1688 disconnected_cb(nvme_ctrlr); 1689 } else { 1690 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1691 } 1692 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1693 SPDK_NVME_QPAIR_FAILURE_NONE) { 1694 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1695 } 1696 1697 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1698 } 1699 1700 static void 1701 nvme_bdev_free(void *io_device) 1702 { 1703 struct nvme_bdev *nvme_disk = io_device; 1704 1705 pthread_mutex_destroy(&nvme_disk->mutex); 1706 free(nvme_disk->disk.name); 1707 free(nvme_disk->err_stat); 1708 free(nvme_disk); 1709 } 1710 1711 static int 1712 bdev_nvme_destruct(void *ctx) 1713 { 1714 struct nvme_bdev *nvme_disk = ctx; 1715 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1716 1717 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1718 1719 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1720 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1721 1722 nvme_ns->bdev = NULL; 1723 1724 assert(nvme_ns->id > 0); 1725 1726 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1727 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1728 1729 nvme_ctrlr_release(nvme_ns->ctrlr); 1730 nvme_ns_free(nvme_ns); 1731 } else { 1732 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1733 } 1734 } 1735 1736 pthread_mutex_lock(&g_bdev_nvme_mutex); 1737 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1738 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1739 1740 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1741 1742 return 0; 1743 } 1744 1745 static int 1746 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1747 { 1748 struct nvme_ctrlr *nvme_ctrlr; 1749 struct spdk_nvme_io_qpair_opts opts; 1750 struct spdk_nvme_qpair *qpair; 1751 int rc; 1752 1753 nvme_ctrlr = nvme_qpair->ctrlr; 1754 1755 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1756 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1757 opts.create_only = true; 1758 opts.async_mode = true; 1759 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1760 g_opts.io_queue_requests = opts.io_queue_requests; 1761 1762 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1763 if (qpair == NULL) { 1764 return -1; 1765 } 1766 1767 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1768 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1769 1770 assert(nvme_qpair->group != NULL); 1771 1772 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1773 if (rc != 0) { 1774 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1775 goto err; 1776 } 1777 1778 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1779 if (rc != 0) { 1780 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1781 goto err; 1782 } 1783 1784 nvme_qpair->qpair = qpair; 1785 1786 if (!g_opts.disable_auto_failback) { 1787 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1788 } 1789 1790 return 0; 1791 1792 err: 1793 spdk_nvme_ctrlr_free_io_qpair(qpair); 1794 1795 return rc; 1796 } 1797 1798 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1799 1800 static void 1801 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1802 { 1803 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1804 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1805 int rc = 0; 1806 struct nvme_bdev_io *bio; 1807 1808 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1809 rc = -1; 1810 } 1811 1812 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1813 bio = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1814 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link); 1815 1816 bdev_nvme_reset_io_continue(bio, rc); 1817 } 1818 1819 spdk_for_each_channel_continue(i, 0); 1820 
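/* At this point every queued reset I/O has been completed; they are failed when the
 * iterator context is non-NULL, i.e. when the controller reset they were waiting on
 * did not succeed.
 */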
} 1821 1822 /* This function marks the current trid as failed by storing the current ticks 1823 * and then, if an alternate trid exists within the controller, makes that trid the active one. 1824 * 1825 * A return value of true asks the caller to disconnect the current trid now so that 1826 * the next trid can be tried. 1827 */ 1828 static bool 1829 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1830 { 1831 struct nvme_path_id *path_id, *next_path; 1832 int rc __attribute__((unused)); 1833 1834 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1835 assert(path_id); 1836 assert(path_id == nvme_ctrlr->active_path_id); 1837 next_path = TAILQ_NEXT(path_id, link); 1838 1839 /* Update the last failed time. A trid is considered failed if its last 1840 * failed time is non-zero. 1841 */ 1842 path_id->last_failed_tsc = spdk_get_ticks(); 1843 1844 if (next_path == NULL) { 1845 /* There is no alternate trid within a controller. */ 1846 return false; 1847 } 1848 1849 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1850 /* Connect is not retried in a controller reset sequence. Connecting 1851 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1852 */ 1853 return false; 1854 } 1855 1856 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1857 1858 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1859 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1860 1861 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1862 nvme_ctrlr->active_path_id = next_path; 1863 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1864 assert(rc == 0); 1865 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1866 if (!remove) { 1867 /** Shuffle the old trid to the end of the list and use the new one. 1868 * Allows for round robin through multiple connections. 1869 */ 1870 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1871 } else { 1872 free(path_id); 1873 } 1874 1875 if (start || next_path->last_failed_tsc == 0) { 1876 /* bdev_nvme_failover_ctrlr() was just called or the next trid has not failed 1877 * or been used yet. Try the next trid now. 1878 */ 1879 return true; 1880 } 1881 1882 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1883 nvme_ctrlr->opts.reconnect_delay_sec) { 1884 /* Enough backoff has passed since the next trid failed. Try the next trid now. */ 1885 return true; 1886 } 1887 1888 /* The next trid will be tried after reconnect_delay_sec seconds.
*/ 1889 return false; 1890 } 1891 1892 static bool 1893 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1894 { 1895 int32_t elapsed; 1896 1897 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1898 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1899 return false; 1900 } 1901 1902 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1903 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1904 return true; 1905 } else { 1906 return false; 1907 } 1908 } 1909 1910 static bool 1911 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1912 { 1913 uint32_t elapsed; 1914 1915 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1916 return false; 1917 } 1918 1919 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1920 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1921 return true; 1922 } else { 1923 return false; 1924 } 1925 } 1926 1927 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1928 1929 static void 1930 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1931 { 1932 int rc; 1933 1934 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1935 if (rc != 0) { 1936 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1937 * fail the reset sequence immediately. 1938 */ 1939 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1940 return; 1941 } 1942 1943 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1944 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1945 */ 1946 assert(nvme_ctrlr->disconnected_cb == NULL); 1947 nvme_ctrlr->disconnected_cb = cb_fn; 1948 1949 /* During disconnection, reduce the period to poll adminq more often. */ 1950 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1951 } 1952 1953 enum bdev_nvme_op_after_reset { 1954 OP_NONE, 1955 OP_COMPLETE_PENDING_DESTRUCT, 1956 OP_DESTRUCT, 1957 OP_DELAYED_RECONNECT, 1958 OP_FAILOVER, 1959 }; 1960 1961 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1962 1963 static _bdev_nvme_op_after_reset 1964 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1965 { 1966 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1967 /* Complete pending destruct after reset completes. 
*/ 1968 return OP_COMPLETE_PENDING_DESTRUCT; 1969 } else if (nvme_ctrlr->pending_failover) { 1970 nvme_ctrlr->pending_failover = false; 1971 nvme_ctrlr->reset_start_tsc = 0; 1972 return OP_FAILOVER; 1973 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1974 nvme_ctrlr->reset_start_tsc = 0; 1975 return OP_NONE; 1976 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1977 return OP_DESTRUCT; 1978 } else { 1979 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1980 nvme_ctrlr->fast_io_fail_timedout = true; 1981 } 1982 return OP_DELAYED_RECONNECT; 1983 } 1984 } 1985 1986 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1987 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1988 1989 static int 1990 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1991 { 1992 struct nvme_ctrlr *nvme_ctrlr = ctx; 1993 1994 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1995 pthread_mutex_lock(&nvme_ctrlr->mutex); 1996 1997 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1998 1999 if (!nvme_ctrlr->reconnect_is_delayed) { 2000 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2001 return SPDK_POLLER_BUSY; 2002 } 2003 2004 nvme_ctrlr->reconnect_is_delayed = false; 2005 2006 if (nvme_ctrlr->destruct) { 2007 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2008 return SPDK_POLLER_BUSY; 2009 } 2010 2011 assert(nvme_ctrlr->resetting == false); 2012 nvme_ctrlr->resetting = true; 2013 2014 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2015 2016 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2017 2018 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2019 return SPDK_POLLER_BUSY; 2020 } 2021 2022 static void 2023 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2024 { 2025 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2026 2027 assert(nvme_ctrlr->reconnect_is_delayed == false); 2028 nvme_ctrlr->reconnect_is_delayed = true; 2029 2030 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2031 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2032 nvme_ctrlr, 2033 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2034 } 2035 2036 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2037 2038 static void 2039 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2040 { 2041 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2042 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2043 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2044 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2045 enum bdev_nvme_op_after_reset op_after_reset; 2046 2047 assert(nvme_ctrlr->thread == spdk_get_thread()); 2048 2049 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2050 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2051 2052 if (!success) { 2053 SPDK_ERRLOG("Resetting controller failed.\n"); 2054 } else { 2055 SPDK_NOTICELOG("Resetting controller successful.\n"); 2056 } 2057 2058 pthread_mutex_lock(&nvme_ctrlr->mutex); 2059 nvme_ctrlr->resetting = false; 2060 nvme_ctrlr->dont_retry = false; 2061 nvme_ctrlr->in_failover = false; 2062 2063 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2064 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2065 2066 /* Delay callbacks when the next operation is a failover. */ 2067 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2068 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2069 } 2070 2071 switch (op_after_reset) { 2072 case OP_COMPLETE_PENDING_DESTRUCT: 2073 nvme_ctrlr_unregister(nvme_ctrlr); 2074 break; 2075 case OP_DESTRUCT: 2076 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2077 remove_discovery_entry(nvme_ctrlr); 2078 break; 2079 case OP_DELAYED_RECONNECT: 2080 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2081 break; 2082 case OP_FAILOVER: 2083 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2084 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2085 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2086 break; 2087 default: 2088 break; 2089 } 2090 } 2091 2092 static void 2093 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2094 { 2095 pthread_mutex_lock(&nvme_ctrlr->mutex); 2096 if (!success) { 2097 /* Connecting the active trid failed. Set the next alternate trid to the 2098 * active trid if it exists. 2099 */ 2100 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2101 /* The next alternate trid exists and is ready to try. Try it now. */ 2102 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2103 2104 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2105 return; 2106 } 2107 2108 /* We came here if there is no alternate trid or if the next trid exists but 2109 * is not ready to try. We will try the active trid after reconnect_delay_sec 2110 * seconds if it is non-zero or at the next reset call otherwise. 2111 */ 2112 } else { 2113 /* Connecting the active trid succeeded. Clear the last failed time because it 2114 * means the trid is failed if its last failed time is non-zero. 2115 */ 2116 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2117 } 2118 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2119 2120 /* Make sure we clear any pending resets before returning. */ 2121 spdk_for_each_channel(nvme_ctrlr, 2122 bdev_nvme_complete_pending_resets, 2123 success ? NULL : (void *)0x1, 2124 _bdev_nvme_reset_ctrlr_complete); 2125 } 2126 2127 static void 2128 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2129 { 2130 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2131 2132 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2133 } 2134 2135 static void 2136 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2137 { 2138 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2139 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2140 struct nvme_qpair *nvme_qpair; 2141 2142 nvme_qpair = ctrlr_ch->qpair; 2143 assert(nvme_qpair != NULL); 2144 2145 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2146 2147 if (nvme_qpair->qpair != NULL) { 2148 if (nvme_qpair->ctrlr->dont_retry) { 2149 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2150 } 2151 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2152 2153 /* The current full reset sequence will move to the next 2154 * ctrlr_channel after the qpair is actually disconnected. 2155 */ 2156 assert(ctrlr_ch->reset_iter == NULL); 2157 ctrlr_ch->reset_iter = i; 2158 } else { 2159 spdk_for_each_channel_continue(i, 0); 2160 } 2161 } 2162 2163 static void 2164 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2165 { 2166 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2167 2168 if (status == 0) { 2169 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2170 } else { 2171 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
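 * This failure path reuses bdev_nvme_reset_destroy_qpair() and then finishes the reset with success == false via bdev_nvme_reset_create_qpairs_failed().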
*/ 2172 spdk_for_each_channel(nvme_ctrlr, 2173 bdev_nvme_reset_destroy_qpair, 2174 NULL, 2175 bdev_nvme_reset_create_qpairs_failed); 2176 } 2177 } 2178 2179 static int 2180 bdev_nvme_reset_check_qpair_connected(void *ctx) 2181 { 2182 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2183 2184 if (ctrlr_ch->reset_iter == NULL) { 2185 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2186 assert(ctrlr_ch->connect_poller == NULL); 2187 assert(ctrlr_ch->qpair->qpair == NULL); 2188 return SPDK_POLLER_BUSY; 2189 } 2190 2191 assert(ctrlr_ch->qpair->qpair != NULL); 2192 2193 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2194 return SPDK_POLLER_BUSY; 2195 } 2196 2197 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2198 2199 /* The qpair has completed connecting. Move to the next ctrlr_channel. */ 2200 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2201 ctrlr_ch->reset_iter = NULL; 2202 2203 if (!g_opts.disable_auto_failback) { 2204 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2205 } 2206 2207 return SPDK_POLLER_BUSY; 2208 } 2209 2210 static void 2211 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2212 { 2213 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2214 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2215 int rc; 2216 2217 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2218 if (rc == 0) { 2219 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2220 ctrlr_ch, 0); 2221 2222 /* The current full reset sequence will move to the next 2223 * ctrlr_channel after the qpair is actually connected. 2224 */ 2225 assert(ctrlr_ch->reset_iter == NULL); 2226 ctrlr_ch->reset_iter = i; 2227 } else { 2228 spdk_for_each_channel_continue(i, rc); 2229 } 2230 } 2231 2232 static void 2233 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2234 { 2235 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2236 struct nvme_ns *nvme_ns; 2237 2238 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2239 nvme_ns != NULL; 2240 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2241 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2242 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2243 /* NS can be added again. Just nullify nvme_ns->ns.
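 * The nvme_ns entry itself stays in the controller's list; only the spdk_nvme_ns pointer is cleared so it can be repopulated if the NSID reappears.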
*/ 2244 nvme_ns->ns = NULL; 2245 } 2246 } 2247 } 2248 2249 2250 static int 2251 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2252 { 2253 struct nvme_ctrlr *nvme_ctrlr = arg; 2254 int rc = -ETIMEDOUT; 2255 2256 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2257 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2258 if (rc == -EAGAIN) { 2259 return SPDK_POLLER_BUSY; 2260 } 2261 } 2262 2263 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2264 if (rc == 0) { 2265 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2266 2267 /* Recreate all of the I/O queue pairs */ 2268 spdk_for_each_channel(nvme_ctrlr, 2269 bdev_nvme_reset_create_qpair, 2270 NULL, 2271 bdev_nvme_reset_create_qpairs_done); 2272 } else { 2273 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2274 } 2275 return SPDK_POLLER_BUSY; 2276 } 2277 2278 static void 2279 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2280 { 2281 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2282 2283 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2284 assert(nvme_ctrlr->reset_detach_poller == NULL); 2285 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2286 nvme_ctrlr, 0); 2287 } 2288 2289 static void 2290 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2291 { 2292 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2293 2294 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2295 assert(status == 0); 2296 2297 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2298 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2299 } else { 2300 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2301 } 2302 } 2303 2304 static void 2305 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2306 { 2307 spdk_for_each_channel(nvme_ctrlr, 2308 bdev_nvme_reset_destroy_qpair, 2309 NULL, 2310 bdev_nvme_reset_destroy_qpair_done); 2311 } 2312 2313 static void 2314 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2315 { 2316 struct nvme_ctrlr *nvme_ctrlr = ctx; 2317 2318 assert(nvme_ctrlr->resetting == true); 2319 assert(nvme_ctrlr->thread == spdk_get_thread()); 2320 2321 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2322 2323 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2324 2325 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2326 } 2327 2328 static void 2329 _bdev_nvme_reset_ctrlr(void *ctx) 2330 { 2331 struct nvme_ctrlr *nvme_ctrlr = ctx; 2332 2333 assert(nvme_ctrlr->resetting == true); 2334 assert(nvme_ctrlr->thread == spdk_get_thread()); 2335 2336 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2337 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2338 } else { 2339 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2340 } 2341 } 2342 2343 static int 2344 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2345 { 2346 spdk_msg_fn msg_fn; 2347 2348 pthread_mutex_lock(&nvme_ctrlr->mutex); 2349 if (nvme_ctrlr->destruct) { 2350 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2351 return -ENXIO; 2352 } 2353 2354 if (nvme_ctrlr->resetting) { 2355 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2356 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2357 return -EBUSY; 2358 } 2359 2360 if (nvme_ctrlr->disabled) { 2361 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2362 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2363 return -EALREADY; 2364 } 2365 2366 nvme_ctrlr->resetting = true; 2367 nvme_ctrlr->dont_retry = true; 2368 2369 if (nvme_ctrlr->reconnect_is_delayed) { 2370 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2371 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2372 nvme_ctrlr->reconnect_is_delayed = false; 2373 } else { 2374 msg_fn = _bdev_nvme_reset_ctrlr; 2375 assert(nvme_ctrlr->reset_start_tsc == 0); 2376 } 2377 2378 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2379 2380 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2381 2382 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2383 return 0; 2384 } 2385 2386 static int 2387 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2388 { 2389 pthread_mutex_lock(&nvme_ctrlr->mutex); 2390 if (nvme_ctrlr->destruct) { 2391 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2392 return -ENXIO; 2393 } 2394 2395 if (nvme_ctrlr->resetting) { 2396 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2397 return -EBUSY; 2398 } 2399 2400 if (!nvme_ctrlr->disabled) { 2401 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2402 return -EALREADY; 2403 } 2404 2405 nvme_ctrlr->disabled = false; 2406 nvme_ctrlr->resetting = true; 2407 2408 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2409 2410 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2411 2412 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2413 return 0; 2414 } 2415 2416 static void 2417 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2418 { 2419 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2420 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2421 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2422 enum bdev_nvme_op_after_reset op_after_disable; 2423 2424 assert(nvme_ctrlr->thread == spdk_get_thread()); 2425 2426 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2427 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2428 2429 pthread_mutex_lock(&nvme_ctrlr->mutex); 2430 2431 nvme_ctrlr->resetting = false; 2432 nvme_ctrlr->dont_retry = false; 2433 2434 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2435 2436 nvme_ctrlr->disabled = true; 2437 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2438 2439 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2440 2441 if (ctrlr_op_cb_fn) { 2442 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2443 } 2444 2445 switch (op_after_disable) { 2446 case OP_COMPLETE_PENDING_DESTRUCT: 2447 nvme_ctrlr_unregister(nvme_ctrlr); 2448 break; 2449 default: 2450 break; 2451 } 2452 2453 } 2454 2455 static void 2456 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2457 { 2458 /* Make sure we clear any pending resets before returning. 
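 * The NULL ctx below matches the success case of the reset completion path above, which passes (void *)0x1 on failure.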
*/ 2459 spdk_for_each_channel(nvme_ctrlr, 2460 bdev_nvme_complete_pending_resets, 2461 NULL, 2462 _bdev_nvme_disable_ctrlr_complete); 2463 } 2464 2465 static void 2466 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2467 { 2468 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2469 2470 assert(status == 0); 2471 2472 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2473 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2474 } else { 2475 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2476 } 2477 } 2478 2479 static void 2480 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2481 { 2482 spdk_for_each_channel(nvme_ctrlr, 2483 bdev_nvme_reset_destroy_qpair, 2484 NULL, 2485 bdev_nvme_disable_destroy_qpairs_done); 2486 } 2487 2488 static void 2489 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2490 { 2491 struct nvme_ctrlr *nvme_ctrlr = ctx; 2492 2493 assert(nvme_ctrlr->resetting == true); 2494 assert(nvme_ctrlr->thread == spdk_get_thread()); 2495 2496 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2497 2498 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2499 } 2500 2501 static void 2502 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2503 { 2504 struct nvme_ctrlr *nvme_ctrlr = ctx; 2505 2506 assert(nvme_ctrlr->resetting == true); 2507 assert(nvme_ctrlr->thread == spdk_get_thread()); 2508 2509 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2510 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2511 } else { 2512 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2513 } 2514 } 2515 2516 static int 2517 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2518 { 2519 spdk_msg_fn msg_fn; 2520 2521 pthread_mutex_lock(&nvme_ctrlr->mutex); 2522 if (nvme_ctrlr->destruct) { 2523 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2524 return -ENXIO; 2525 } 2526 2527 if (nvme_ctrlr->resetting) { 2528 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2529 return -EBUSY; 2530 } 2531 2532 if (nvme_ctrlr->disabled) { 2533 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2534 return -EALREADY; 2535 } 2536 2537 nvme_ctrlr->resetting = true; 2538 nvme_ctrlr->dont_retry = true; 2539 2540 if (nvme_ctrlr->reconnect_is_delayed) { 2541 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2542 nvme_ctrlr->reconnect_is_delayed = false; 2543 } else { 2544 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2545 } 2546 2547 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2548 2549 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2550 2551 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2552 return 0; 2553 } 2554 2555 static int 2556 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2557 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2558 { 2559 int rc; 2560 2561 switch (op) { 2562 case NVME_CTRLR_OP_RESET: 2563 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2564 break; 2565 case NVME_CTRLR_OP_ENABLE: 2566 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2567 break; 2568 case NVME_CTRLR_OP_DISABLE: 2569 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2570 break; 2571 default: 2572 rc = -EINVAL; 2573 break; 2574 } 2575 2576 if (rc == 0) { 2577 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2578 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2579 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2580 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2581 } 2582 return rc; 2583 } 2584 2585 struct nvme_ctrlr_op_rpc_ctx { 2586 struct nvme_ctrlr *nvme_ctrlr; 2587 struct spdk_thread *orig_thread; 2588 enum nvme_ctrlr_op op; 2589 int rc; 2590 
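/* Completion callback and argument, invoked on orig_thread once the operation has finished (for the nvme_bdev_ctrlr variant, after all controllers have been processed). */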
bdev_nvme_ctrlr_op_cb cb_fn; 2591 void *cb_arg; 2592 }; 2593 2594 static void 2595 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2596 { 2597 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2598 2599 assert(ctx != NULL); 2600 assert(ctx->cb_fn != NULL); 2601 2602 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2603 2604 free(ctx); 2605 } 2606 2607 static void 2608 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2609 { 2610 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2611 2612 ctx->rc = rc; 2613 2614 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2615 } 2616 2617 void 2618 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2619 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2620 { 2621 struct nvme_ctrlr_op_rpc_ctx *ctx; 2622 int rc; 2623 2624 assert(cb_fn != NULL); 2625 2626 ctx = calloc(1, sizeof(*ctx)); 2627 if (ctx == NULL) { 2628 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2629 cb_fn(cb_arg, -ENOMEM); 2630 return; 2631 } 2632 2633 ctx->orig_thread = spdk_get_thread(); 2634 ctx->cb_fn = cb_fn; 2635 ctx->cb_arg = cb_arg; 2636 2637 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2638 if (rc == 0) { 2639 return; 2640 } else if (rc == -EALREADY) { 2641 rc = 0; 2642 } 2643 2644 nvme_ctrlr_op_rpc_complete(ctx, rc); 2645 } 2646 2647 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2648 2649 static void 2650 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2651 { 2652 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2653 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2654 int rc; 2655 2656 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2657 ctx->nvme_ctrlr = NULL; 2658 2659 if (ctx->rc != 0) { 2660 goto complete; 2661 } 2662 2663 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2664 if (next_nvme_ctrlr == NULL) { 2665 goto complete; 2666 } 2667 2668 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2669 if (rc == 0) { 2670 ctx->nvme_ctrlr = next_nvme_ctrlr; 2671 return; 2672 } else if (rc == -EALREADY) { 2673 ctx->nvme_ctrlr = next_nvme_ctrlr; 2674 rc = 0; 2675 } 2676 2677 ctx->rc = rc; 2678 2679 complete: 2680 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2681 free(ctx); 2682 } 2683 2684 static void 2685 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2686 { 2687 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2688 2689 ctx->rc = rc; 2690 2691 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2692 } 2693 2694 void 2695 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2696 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2697 { 2698 struct nvme_ctrlr_op_rpc_ctx *ctx; 2699 struct nvme_ctrlr *nvme_ctrlr; 2700 int rc; 2701 2702 assert(cb_fn != NULL); 2703 2704 ctx = calloc(1, sizeof(*ctx)); 2705 if (ctx == NULL) { 2706 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2707 cb_fn(cb_arg, -ENOMEM); 2708 return; 2709 } 2710 2711 ctx->orig_thread = spdk_get_thread(); 2712 ctx->op = op; 2713 ctx->cb_fn = cb_fn; 2714 ctx->cb_arg = cb_arg; 2715 2716 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2717 assert(nvme_ctrlr != NULL); 2718 2719 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2720 if (rc == 0) { 2721 ctx->nvme_ctrlr = nvme_ctrlr; 2722 return; 2723 } else if (rc == -EALREADY) { 2724 ctx->nvme_ctrlr = nvme_ctrlr; 2725 rc = 0; 2726 } 2727 2728 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2729 } 2730 2731 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2732 2733 static void 2734 
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2735 { 2736 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2737 enum spdk_bdev_io_status io_status; 2738 2739 if (bio->cpl.cdw0 == 0) { 2740 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2741 } else { 2742 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2743 } 2744 2745 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2746 } 2747 2748 static void 2749 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2750 { 2751 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2752 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2753 2754 bdev_nvme_abort_retry_ios(nbdev_ch); 2755 2756 spdk_for_each_channel_continue(i, 0); 2757 } 2758 2759 static void 2760 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2761 { 2762 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2763 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2764 2765 /* Abort all queued I/Os for retry. */ 2766 spdk_for_each_channel(nbdev, 2767 bdev_nvme_abort_bdev_channel, 2768 bio, 2769 _bdev_nvme_reset_io_complete); 2770 } 2771 2772 static void 2773 _bdev_nvme_reset_io_continue(void *ctx) 2774 { 2775 struct nvme_bdev_io *bio = ctx; 2776 struct nvme_io_path *prev_io_path, *next_io_path; 2777 int rc; 2778 2779 prev_io_path = bio->io_path; 2780 bio->io_path = NULL; 2781 2782 if (bio->cpl.cdw0 != 0) { 2783 goto complete; 2784 } 2785 2786 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2787 if (next_io_path == NULL) { 2788 goto complete; 2789 } 2790 2791 rc = _bdev_nvme_reset_io(next_io_path, bio); 2792 if (rc == 0) { 2793 return; 2794 } 2795 2796 bio->cpl.cdw0 = 1; 2797 2798 complete: 2799 bdev_nvme_reset_io_complete(bio); 2800 } 2801 2802 static void 2803 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2804 { 2805 struct nvme_bdev_io *bio = cb_arg; 2806 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2807 2808 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2809 2810 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2811 } 2812 2813 static int 2814 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2815 { 2816 struct nvme_ctrlr_channel *ctrlr_ch; 2817 int rc; 2818 2819 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2820 bdev_nvme_reset_io_continue, bio); 2821 if (rc != 0 && rc != -EBUSY) { 2822 return rc; 2823 } 2824 2825 assert(bio->io_path == NULL); 2826 bio->io_path = io_path; 2827 2828 if (rc == -EBUSY) { 2829 ctrlr_ch = io_path->qpair->ctrlr_ch; 2830 assert(ctrlr_ch != NULL); 2831 /* 2832 * Reset call is queued only if it is from the app framework. This is on purpose so that 2833 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2834 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2835 */ 2836 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2837 } 2838 2839 return 0; 2840 } 2841 2842 static void 2843 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2844 { 2845 struct nvme_io_path *io_path; 2846 int rc; 2847 2848 bio->cpl.cdw0 = 0; 2849 2850 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2851 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2852 assert(io_path != NULL); 2853 2854 rc = _bdev_nvme_reset_io(io_path, bio); 2855 if (rc != 0) { 2856 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. 
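 * A disabled nvme_ctrlr makes nvme_ctrlr_op() return -EALREADY; mapping that to 0 below lets bdev_nvme_reset_io_continue() advance to the next io_path.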
*/ 2857 rc = (rc == -EALREADY) ? 0 : rc; 2858 2859 bdev_nvme_reset_io_continue(bio, rc); 2860 } 2861 } 2862 2863 static int 2864 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2865 { 2866 if (nvme_ctrlr->destruct) { 2867 /* Don't bother resetting if the controller is in the process of being destructed. */ 2868 return -ENXIO; 2869 } 2870 2871 if (nvme_ctrlr->resetting) { 2872 if (!nvme_ctrlr->in_failover) { 2873 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2874 2875 /* Defer failover until reset completes. */ 2876 nvme_ctrlr->pending_failover = true; 2877 return -EINPROGRESS; 2878 } else { 2879 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2880 return -EBUSY; 2881 } 2882 } 2883 2884 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2885 2886 if (nvme_ctrlr->reconnect_is_delayed) { 2887 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2888 2889 /* We rely on the next reconnect for the failover. */ 2890 return -EALREADY; 2891 } 2892 2893 if (nvme_ctrlr->disabled) { 2894 SPDK_NOTICELOG("Controller is disabled.\n"); 2895 2896 /* We rely on the enablement for the failover. */ 2897 return -EALREADY; 2898 } 2899 2900 nvme_ctrlr->resetting = true; 2901 nvme_ctrlr->in_failover = true; 2902 2903 assert(nvme_ctrlr->reset_start_tsc == 0); 2904 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2905 2906 return 0; 2907 } 2908 2909 static int 2910 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2911 { 2912 int rc; 2913 2914 pthread_mutex_lock(&nvme_ctrlr->mutex); 2915 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2916 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2917 2918 if (rc == 0) { 2919 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2920 } else if (rc == -EALREADY) { 2921 rc = 0; 2922 } 2923 2924 return rc; 2925 } 2926 2927 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2928 uint64_t num_blocks); 2929 2930 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2931 uint64_t num_blocks); 2932 2933 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2934 uint64_t src_offset_blocks, 2935 uint64_t num_blocks); 2936 2937 static void 2938 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2939 bool success) 2940 { 2941 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2942 int ret; 2943 2944 if (!success) { 2945 ret = -EINVAL; 2946 goto exit; 2947 } 2948 2949 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2950 ret = -ENXIO; 2951 goto exit; 2952 } 2953 2954 ret = bdev_nvme_readv(bio, 2955 bdev_io->u.bdev.iovs, 2956 bdev_io->u.bdev.iovcnt, 2957 bdev_io->u.bdev.md_buf, 2958 bdev_io->u.bdev.num_blocks, 2959 bdev_io->u.bdev.offset_blocks, 2960 bdev_io->u.bdev.dif_check_flags, 2961 bdev_io->u.bdev.memory_domain, 2962 bdev_io->u.bdev.memory_domain_ctx, 2963 bdev_io->u.bdev.accel_sequence); 2964 2965 exit: 2966 if (spdk_unlikely(ret != 0)) { 2967 bdev_nvme_io_complete(bio, ret); 2968 } 2969 } 2970 2971 static inline void 2972 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2973 { 2974 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2975 struct spdk_bdev *bdev = bdev_io->bdev; 2976 struct nvme_bdev_io *nbdev_io_to_abort; 2977 int rc = 0; 2978 2979 switch (bdev_io->type) { 2980 case SPDK_BDEV_IO_TYPE_READ: 2981 if (bdev_io->u.bdev.iovs && 
bdev_io->u.bdev.iovs[0].iov_base) { 2982 2983 rc = bdev_nvme_readv(nbdev_io, 2984 bdev_io->u.bdev.iovs, 2985 bdev_io->u.bdev.iovcnt, 2986 bdev_io->u.bdev.md_buf, 2987 bdev_io->u.bdev.num_blocks, 2988 bdev_io->u.bdev.offset_blocks, 2989 bdev_io->u.bdev.dif_check_flags, 2990 bdev_io->u.bdev.memory_domain, 2991 bdev_io->u.bdev.memory_domain_ctx, 2992 bdev_io->u.bdev.accel_sequence); 2993 } else { 2994 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2995 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2996 rc = 0; 2997 } 2998 break; 2999 case SPDK_BDEV_IO_TYPE_WRITE: 3000 rc = bdev_nvme_writev(nbdev_io, 3001 bdev_io->u.bdev.iovs, 3002 bdev_io->u.bdev.iovcnt, 3003 bdev_io->u.bdev.md_buf, 3004 bdev_io->u.bdev.num_blocks, 3005 bdev_io->u.bdev.offset_blocks, 3006 bdev_io->u.bdev.dif_check_flags, 3007 bdev_io->u.bdev.memory_domain, 3008 bdev_io->u.bdev.memory_domain_ctx, 3009 bdev_io->u.bdev.accel_sequence, 3010 bdev_io->u.bdev.nvme_cdw12, 3011 bdev_io->u.bdev.nvme_cdw13); 3012 break; 3013 case SPDK_BDEV_IO_TYPE_COMPARE: 3014 rc = bdev_nvme_comparev(nbdev_io, 3015 bdev_io->u.bdev.iovs, 3016 bdev_io->u.bdev.iovcnt, 3017 bdev_io->u.bdev.md_buf, 3018 bdev_io->u.bdev.num_blocks, 3019 bdev_io->u.bdev.offset_blocks, 3020 bdev_io->u.bdev.dif_check_flags); 3021 break; 3022 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3023 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3024 bdev_io->u.bdev.iovs, 3025 bdev_io->u.bdev.iovcnt, 3026 bdev_io->u.bdev.fused_iovs, 3027 bdev_io->u.bdev.fused_iovcnt, 3028 bdev_io->u.bdev.md_buf, 3029 bdev_io->u.bdev.num_blocks, 3030 bdev_io->u.bdev.offset_blocks, 3031 bdev_io->u.bdev.dif_check_flags); 3032 break; 3033 case SPDK_BDEV_IO_TYPE_UNMAP: 3034 rc = bdev_nvme_unmap(nbdev_io, 3035 bdev_io->u.bdev.offset_blocks, 3036 bdev_io->u.bdev.num_blocks); 3037 break; 3038 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3039 rc = bdev_nvme_write_zeroes(nbdev_io, 3040 bdev_io->u.bdev.offset_blocks, 3041 bdev_io->u.bdev.num_blocks); 3042 break; 3043 case SPDK_BDEV_IO_TYPE_RESET: 3044 nbdev_io->io_path = NULL; 3045 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3046 return; 3047 3048 case SPDK_BDEV_IO_TYPE_FLUSH: 3049 bdev_nvme_io_complete(nbdev_io, 0); 3050 return; 3051 3052 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3053 rc = bdev_nvme_zone_appendv(nbdev_io, 3054 bdev_io->u.bdev.iovs, 3055 bdev_io->u.bdev.iovcnt, 3056 bdev_io->u.bdev.md_buf, 3057 bdev_io->u.bdev.num_blocks, 3058 bdev_io->u.bdev.offset_blocks, 3059 bdev_io->u.bdev.dif_check_flags); 3060 break; 3061 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3062 rc = bdev_nvme_get_zone_info(nbdev_io, 3063 bdev_io->u.zone_mgmt.zone_id, 3064 bdev_io->u.zone_mgmt.num_zones, 3065 bdev_io->u.zone_mgmt.buf); 3066 break; 3067 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3068 rc = bdev_nvme_zone_management(nbdev_io, 3069 bdev_io->u.zone_mgmt.zone_id, 3070 bdev_io->u.zone_mgmt.zone_action); 3071 break; 3072 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3073 nbdev_io->io_path = NULL; 3074 bdev_nvme_admin_passthru(nbdev_ch, 3075 nbdev_io, 3076 &bdev_io->u.nvme_passthru.cmd, 3077 bdev_io->u.nvme_passthru.buf, 3078 bdev_io->u.nvme_passthru.nbytes); 3079 return; 3080 3081 case SPDK_BDEV_IO_TYPE_NVME_IO: 3082 rc = bdev_nvme_io_passthru(nbdev_io, 3083 &bdev_io->u.nvme_passthru.cmd, 3084 bdev_io->u.nvme_passthru.buf, 3085 bdev_io->u.nvme_passthru.nbytes); 3086 break; 3087 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3088 rc = bdev_nvme_io_passthru_md(nbdev_io, 3089 &bdev_io->u.nvme_passthru.cmd, 3090 bdev_io->u.nvme_passthru.buf, 3091 bdev_io->u.nvme_passthru.nbytes, 3092 bdev_io->u.nvme_passthru.md_buf, 3093 
bdev_io->u.nvme_passthru.md_len); 3094 break; 3095 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3096 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3097 &bdev_io->u.nvme_passthru.cmd, 3098 bdev_io->u.nvme_passthru.iovs, 3099 bdev_io->u.nvme_passthru.iovcnt, 3100 bdev_io->u.nvme_passthru.nbytes, 3101 bdev_io->u.nvme_passthru.md_buf, 3102 bdev_io->u.nvme_passthru.md_len); 3103 break; 3104 case SPDK_BDEV_IO_TYPE_ABORT: 3105 nbdev_io->io_path = NULL; 3106 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3107 bdev_nvme_abort(nbdev_ch, 3108 nbdev_io, 3109 nbdev_io_to_abort); 3110 return; 3111 3112 case SPDK_BDEV_IO_TYPE_COPY: 3113 rc = bdev_nvme_copy(nbdev_io, 3114 bdev_io->u.bdev.offset_blocks, 3115 bdev_io->u.bdev.copy.src_offset_blocks, 3116 bdev_io->u.bdev.num_blocks); 3117 break; 3118 default: 3119 rc = -EINVAL; 3120 break; 3121 } 3122 3123 if (spdk_unlikely(rc != 0)) { 3124 bdev_nvme_io_complete(nbdev_io, rc); 3125 } 3126 } 3127 3128 static void 3129 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3130 { 3131 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3132 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3133 3134 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3135 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3136 } else { 3137 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3138 * We need to update submit_tsc here. 3139 */ 3140 nbdev_io->submit_tsc = spdk_get_ticks(); 3141 } 3142 3143 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3144 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3145 if (spdk_unlikely(!nbdev_io->io_path)) { 3146 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3147 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3148 return; 3149 } 3150 3151 /* Admin commands do not use the optimal I/O path. 3152 * Simply fall through even if it is not found. 3153 */ 3154 } 3155 3156 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3157 } 3158 3159 static bool 3160 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3161 { 3162 struct nvme_bdev *nbdev = ctx; 3163 struct nvme_ns *nvme_ns; 3164 struct spdk_nvme_ns *ns; 3165 struct spdk_nvme_ctrlr *ctrlr; 3166 const struct spdk_nvme_ctrlr_data *cdata; 3167 3168 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3169 assert(nvme_ns != NULL); 3170 ns = nvme_ns->ns; 3171 if (ns == NULL) { 3172 return false; 3173 } 3174 3175 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3176 3177 switch (io_type) { 3178 case SPDK_BDEV_IO_TYPE_READ: 3179 case SPDK_BDEV_IO_TYPE_WRITE: 3180 case SPDK_BDEV_IO_TYPE_RESET: 3181 case SPDK_BDEV_IO_TYPE_FLUSH: 3182 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3183 case SPDK_BDEV_IO_TYPE_NVME_IO: 3184 case SPDK_BDEV_IO_TYPE_ABORT: 3185 return true; 3186 3187 case SPDK_BDEV_IO_TYPE_COMPARE: 3188 return spdk_nvme_ns_supports_compare(ns); 3189 3190 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3191 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3192 3193 case SPDK_BDEV_IO_TYPE_UNMAP: 3194 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3195 return cdata->oncs.dsm; 3196 3197 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3198 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3199 return cdata->oncs.write_zeroes; 3200 3201 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3202 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3203 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3204 return true; 3205 } 3206 return false; 3207 3208 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3209 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3210 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3211 3212 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3213 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3214 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3215 3216 case SPDK_BDEV_IO_TYPE_COPY: 3217 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3218 return cdata->oncs.copy; 3219 3220 default: 3221 return false; 3222 } 3223 } 3224 3225 static int 3226 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3227 { 3228 struct nvme_qpair *nvme_qpair; 3229 struct spdk_io_channel *pg_ch; 3230 int rc; 3231 3232 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3233 if (!nvme_qpair) { 3234 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3235 return -1; 3236 } 3237 3238 TAILQ_INIT(&nvme_qpair->io_path_list); 3239 3240 nvme_qpair->ctrlr = nvme_ctrlr; 3241 nvme_qpair->ctrlr_ch = ctrlr_ch; 3242 3243 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3244 if (!pg_ch) { 3245 free(nvme_qpair); 3246 return -1; 3247 } 3248 3249 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3250 3251 #ifdef SPDK_CONFIG_VTUNE 3252 nvme_qpair->group->collect_spin_stat = true; 3253 #else 3254 nvme_qpair->group->collect_spin_stat = false; 3255 #endif 3256 3257 if (!nvme_ctrlr->disabled) { 3258 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3259 * be created when it's enabled. 3260 */ 3261 rc = bdev_nvme_create_qpair(nvme_qpair); 3262 if (rc != 0) { 3263 /* nvme_ctrlr can't create IO qpair if connection is down. 3264 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3265 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3266 * submitted IO will be queued until IO qpair is successfully created. 3267 * 3268 * Hence, if both are satisfied, ignore the failure. 
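 * Otherwise, release the poll group channel and fail qpair creation by returning the error.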
3269 */ 3270 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3271 spdk_put_io_channel(pg_ch); 3272 free(nvme_qpair); 3273 return rc; 3274 } 3275 } 3276 } 3277 3278 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3279 3280 ctrlr_ch->qpair = nvme_qpair; 3281 3282 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3283 nvme_qpair->ctrlr->ref++; 3284 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3285 3286 return 0; 3287 } 3288 3289 static int 3290 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3291 { 3292 struct nvme_ctrlr *nvme_ctrlr = io_device; 3293 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3294 3295 TAILQ_INIT(&ctrlr_ch->pending_resets); 3296 3297 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3298 } 3299 3300 static void 3301 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3302 { 3303 struct nvme_io_path *io_path, *next; 3304 3305 assert(nvme_qpair->group != NULL); 3306 3307 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3308 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3309 nvme_io_path_free(io_path); 3310 } 3311 3312 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3313 3314 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3315 3316 nvme_ctrlr_release(nvme_qpair->ctrlr); 3317 3318 free(nvme_qpair); 3319 } 3320 3321 static void 3322 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3323 { 3324 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3325 struct nvme_qpair *nvme_qpair; 3326 3327 nvme_qpair = ctrlr_ch->qpair; 3328 assert(nvme_qpair != NULL); 3329 3330 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3331 3332 if (nvme_qpair->qpair != NULL) { 3333 if (ctrlr_ch->reset_iter == NULL) { 3334 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3335 } else { 3336 /* Skip current ctrlr_channel in a full reset sequence because 3337 * it is being deleted now. The qpair is already being disconnected. 3338 * We do not have to restart disconnecting it. 3339 */ 3340 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3341 } 3342 3343 /* We cannot release a reference to the poll group now. 3344 * The qpair may be disconnected asynchronously later. 3345 * We need to poll it until it is actually disconnected. 3346 * Just detach the qpair from the deleting ctrlr_channel. 
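 * The detached nvme_qpair is freed later, after the disconnect actually completes, rather than here.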
3347 */ 3348 nvme_qpair->ctrlr_ch = NULL; 3349 } else { 3350 assert(ctrlr_ch->reset_iter == NULL); 3351 3352 nvme_qpair_delete(nvme_qpair); 3353 } 3354 } 3355 3356 static inline struct spdk_io_channel * 3357 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3358 { 3359 if (spdk_unlikely(!group->accel_channel)) { 3360 group->accel_channel = spdk_accel_get_io_channel(); 3361 if (!group->accel_channel) { 3362 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3363 group); 3364 return NULL; 3365 } 3366 } 3367 3368 return group->accel_channel; 3369 } 3370 3371 static void 3372 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3373 uint32_t iov_cnt, uint32_t seed, 3374 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3375 { 3376 struct spdk_io_channel *accel_ch; 3377 struct nvme_poll_group *group = ctx; 3378 int rc; 3379 3380 assert(cb_fn != NULL); 3381 3382 accel_ch = bdev_nvme_get_accel_channel(group); 3383 if (spdk_unlikely(accel_ch == NULL)) { 3384 cb_fn(cb_arg, -ENOMEM); 3385 return; 3386 } 3387 3388 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3389 if (rc) { 3390 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3391 if (rc == -ENOMEM || rc == -EINVAL) { 3392 cb_fn(cb_arg, rc); 3393 } 3394 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3395 } 3396 } 3397 3398 static void 3399 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3400 { 3401 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3402 } 3403 3404 static void 3405 bdev_nvme_abort_sequence(void *seq) 3406 { 3407 spdk_accel_sequence_abort(seq); 3408 } 3409 3410 static void 3411 bdev_nvme_reverse_sequence(void *seq) 3412 { 3413 spdk_accel_sequence_reverse(seq); 3414 } 3415 3416 static int 3417 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3418 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3419 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3420 { 3421 struct spdk_io_channel *ch; 3422 struct nvme_poll_group *group = ctx; 3423 3424 ch = bdev_nvme_get_accel_channel(group); 3425 if (spdk_unlikely(ch == NULL)) { 3426 return -ENOMEM; 3427 } 3428 3429 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3430 domain, domain_ctx, seed, cb_fn, cb_arg); 3431 } 3432 3433 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3434 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3435 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3436 .append_crc32c = bdev_nvme_append_crc32c, 3437 .finish_sequence = bdev_nvme_finish_sequence, 3438 .reverse_sequence = bdev_nvme_reverse_sequence, 3439 .abort_sequence = bdev_nvme_abort_sequence, 3440 }; 3441 3442 static int 3443 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3444 { 3445 struct nvme_poll_group *group = ctx_buf; 3446 3447 TAILQ_INIT(&group->qpair_list); 3448 3449 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3450 if (group->group == NULL) { 3451 return -1; 3452 } 3453 3454 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3455 3456 if (group->poller == NULL) { 3457 spdk_nvme_poll_group_destroy(group->group); 3458 return -1; 3459 } 3460 3461 return 0; 3462 } 3463 3464 static void 3465 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3466 { 3467 struct 
nvme_poll_group *group = ctx_buf; 3468 3469 assert(TAILQ_EMPTY(&group->qpair_list)); 3470 3471 if (group->accel_channel) { 3472 spdk_put_io_channel(group->accel_channel); 3473 } 3474 3475 spdk_poller_unregister(&group->poller); 3476 if (spdk_nvme_poll_group_destroy(group->group)) { 3477 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3478 assert(false); 3479 } 3480 } 3481 3482 static struct spdk_io_channel * 3483 bdev_nvme_get_io_channel(void *ctx) 3484 { 3485 struct nvme_bdev *nvme_bdev = ctx; 3486 3487 return spdk_get_io_channel(nvme_bdev); 3488 } 3489 3490 static void * 3491 bdev_nvme_get_module_ctx(void *ctx) 3492 { 3493 struct nvme_bdev *nvme_bdev = ctx; 3494 struct nvme_ns *nvme_ns; 3495 3496 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3497 return NULL; 3498 } 3499 3500 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3501 if (!nvme_ns) { 3502 return NULL; 3503 } 3504 3505 return nvme_ns->ns; 3506 } 3507 3508 static const char * 3509 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3510 { 3511 switch (ana_state) { 3512 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3513 return "optimized"; 3514 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3515 return "non_optimized"; 3516 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3517 return "inaccessible"; 3518 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3519 return "persistent_loss"; 3520 case SPDK_NVME_ANA_CHANGE_STATE: 3521 return "change"; 3522 default: 3523 return NULL; 3524 } 3525 } 3526 3527 static int 3528 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3529 { 3530 struct spdk_memory_domain **_domains = NULL; 3531 struct nvme_bdev *nbdev = ctx; 3532 struct nvme_ns *nvme_ns; 3533 int i = 0, _array_size = array_size; 3534 int rc = 0; 3535 3536 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3537 if (domains && array_size >= i) { 3538 _domains = &domains[i]; 3539 } else { 3540 _domains = NULL; 3541 } 3542 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3543 if (rc > 0) { 3544 i += rc; 3545 if (_array_size >= rc) { 3546 _array_size -= rc; 3547 } else { 3548 _array_size = 0; 3549 } 3550 } else if (rc < 0) { 3551 return rc; 3552 } 3553 } 3554 3555 return i; 3556 } 3557 3558 static const char * 3559 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3560 { 3561 if (nvme_ctrlr->destruct) { 3562 return "deleting"; 3563 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3564 return "failed"; 3565 } else if (nvme_ctrlr->resetting) { 3566 return "resetting"; 3567 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3568 return "reconnect_is_delayed"; 3569 } else if (nvme_ctrlr->disabled) { 3570 return "disabled"; 3571 } else { 3572 return "enabled"; 3573 } 3574 } 3575 3576 void 3577 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3578 { 3579 struct spdk_nvme_transport_id *trid; 3580 const struct spdk_nvme_ctrlr_opts *opts; 3581 const struct spdk_nvme_ctrlr_data *cdata; 3582 struct nvme_path_id *path_id; 3583 3584 spdk_json_write_object_begin(w); 3585 3586 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3587 3588 #ifdef SPDK_CONFIG_NVME_CUSE 3589 size_t cuse_name_size = 128; 3590 char cuse_name[cuse_name_size]; 3591 3592 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3593 if (rc == 0) { 3594 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3595 } 3596 #endif 3597 trid = &nvme_ctrlr->active_path_id->trid; 3598 
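/* Dump the active transport ID first, then any alternate trids kept for failover. */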
spdk_json_write_named_object_begin(w, "trid"); 3599 nvme_bdev_dump_trid_json(trid, w); 3600 spdk_json_write_object_end(w); 3601 3602 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3603 if (path_id != NULL) { 3604 spdk_json_write_named_array_begin(w, "alternate_trids"); 3605 do { 3606 trid = &path_id->trid; 3607 spdk_json_write_object_begin(w); 3608 nvme_bdev_dump_trid_json(trid, w); 3609 spdk_json_write_object_end(w); 3610 3611 path_id = TAILQ_NEXT(path_id, link); 3612 } while (path_id != NULL); 3613 spdk_json_write_array_end(w); 3614 } 3615 3616 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3617 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3618 3619 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3620 spdk_json_write_named_object_begin(w, "host"); 3621 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3622 spdk_json_write_named_string(w, "addr", opts->src_addr); 3623 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3624 spdk_json_write_object_end(w); 3625 3626 spdk_json_write_object_end(w); 3627 } 3628 3629 static void 3630 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3631 struct nvme_ns *nvme_ns) 3632 { 3633 struct spdk_nvme_ns *ns; 3634 struct spdk_nvme_ctrlr *ctrlr; 3635 const struct spdk_nvme_ctrlr_data *cdata; 3636 const struct spdk_nvme_transport_id *trid; 3637 union spdk_nvme_vs_register vs; 3638 const struct spdk_nvme_ns_data *nsdata; 3639 char buf[128]; 3640 3641 ns = nvme_ns->ns; 3642 if (ns == NULL) { 3643 return; 3644 } 3645 3646 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3647 3648 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3649 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3650 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3651 3652 spdk_json_write_object_begin(w); 3653 3654 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3655 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3656 } 3657 3658 spdk_json_write_named_object_begin(w, "trid"); 3659 3660 nvme_bdev_dump_trid_json(trid, w); 3661 3662 spdk_json_write_object_end(w); 3663 3664 #ifdef SPDK_CONFIG_NVME_CUSE 3665 size_t cuse_name_size = 128; 3666 char cuse_name[cuse_name_size]; 3667 3668 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3669 cuse_name, &cuse_name_size); 3670 if (rc == 0) { 3671 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3672 } 3673 #endif 3674 3675 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3676 3677 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3678 3679 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3680 3681 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3682 spdk_str_trim(buf); 3683 spdk_json_write_named_string(w, "model_number", buf); 3684 3685 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3686 spdk_str_trim(buf); 3687 spdk_json_write_named_string(w, "serial_number", buf); 3688 3689 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3690 spdk_str_trim(buf); 3691 spdk_json_write_named_string(w, "firmware_revision", buf); 3692 3693 if (cdata->subnqn[0] != '\0') { 3694 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3695 } 3696 3697 spdk_json_write_named_object_begin(w, "oacs"); 3698 3699 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3700 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3701 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3702 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3703 3704 spdk_json_write_object_end(w); 3705 3706 
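/* cmic: Controller Multi-Path I/O and Namespace Sharing Capabilities. */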
spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3707 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3708 3709 spdk_json_write_object_end(w); 3710 3711 spdk_json_write_named_object_begin(w, "vs"); 3712 3713 spdk_json_write_name(w, "nvme_version"); 3714 if (vs.bits.ter) { 3715 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3716 } else { 3717 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3718 } 3719 3720 spdk_json_write_object_end(w); 3721 3722 nsdata = spdk_nvme_ns_get_data(ns); 3723 3724 spdk_json_write_named_object_begin(w, "ns_data"); 3725 3726 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3727 3728 if (cdata->cmic.ana_reporting) { 3729 spdk_json_write_named_string(w, "ana_state", 3730 _nvme_ana_state_str(nvme_ns->ana_state)); 3731 } 3732 3733 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3734 3735 spdk_json_write_object_end(w); 3736 3737 if (cdata->oacs.security) { 3738 spdk_json_write_named_object_begin(w, "security"); 3739 3740 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3741 3742 spdk_json_write_object_end(w); 3743 } 3744 3745 spdk_json_write_object_end(w); 3746 } 3747 3748 static const char * 3749 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3750 { 3751 switch (nbdev->mp_policy) { 3752 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3753 return "active_passive"; 3754 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3755 return "active_active"; 3756 default: 3757 assert(false); 3758 return "invalid"; 3759 } 3760 } 3761 3762 static const char * 3763 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 3764 { 3765 switch (nbdev->mp_selector) { 3766 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 3767 return "round_robin"; 3768 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 3769 return "queue_depth"; 3770 default: 3771 assert(false); 3772 return "invalid"; 3773 } 3774 } 3775 3776 static int 3777 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3778 { 3779 struct nvme_bdev *nvme_bdev = ctx; 3780 struct nvme_ns *nvme_ns; 3781 3782 pthread_mutex_lock(&nvme_bdev->mutex); 3783 spdk_json_write_named_array_begin(w, "nvme"); 3784 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3785 nvme_namespace_info_json(w, nvme_ns); 3786 } 3787 spdk_json_write_array_end(w); 3788 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3789 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 3790 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 3791 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 3792 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 3793 } 3794 } 3795 pthread_mutex_unlock(&nvme_bdev->mutex); 3796 3797 return 0; 3798 } 3799 3800 static void 3801 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3802 { 3803 /* No config per bdev needed */ 3804 } 3805 3806 static uint64_t 3807 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3808 { 3809 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3810 struct nvme_io_path *io_path; 3811 struct nvme_poll_group *group; 3812 uint64_t spin_time = 0; 3813 3814 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3815 group = io_path->qpair->group; 3816 3817 if (!group || !group->collect_spin_stat) { 3818 continue; 3819 } 3820 3821 if (group->end_ticks != 0) { 3822 group->spin_ticks += (group->end_ticks - group->start_ticks); 
3823 group->end_ticks = 0; 3824 } 3825 3826 spin_time += group->spin_ticks; 3827 group->start_ticks = 0; 3828 group->spin_ticks = 0; 3829 } 3830 3831 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3832 } 3833 3834 static void 3835 bdev_nvme_reset_device_stat(void *ctx) 3836 { 3837 struct nvme_bdev *nbdev = ctx; 3838 3839 if (nbdev->err_stat != NULL) { 3840 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3841 } 3842 } 3843 3844 /* JSON string should be lowercases and underscore delimited string. */ 3845 static void 3846 bdev_nvme_format_nvme_status(char *dst, const char *src) 3847 { 3848 char tmp[256]; 3849 3850 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3851 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3852 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3853 spdk_strlwr(dst); 3854 } 3855 3856 static void 3857 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3858 { 3859 struct nvme_bdev *nbdev = ctx; 3860 struct spdk_nvme_status status = {}; 3861 uint16_t sct, sc; 3862 char status_json[256]; 3863 const char *status_str; 3864 3865 if (nbdev->err_stat == NULL) { 3866 return; 3867 } 3868 3869 spdk_json_write_named_object_begin(w, "nvme_error"); 3870 3871 spdk_json_write_named_object_begin(w, "status_type"); 3872 for (sct = 0; sct < 8; sct++) { 3873 if (nbdev->err_stat->status_type[sct] == 0) { 3874 continue; 3875 } 3876 status.sct = sct; 3877 3878 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3879 assert(status_str != NULL); 3880 bdev_nvme_format_nvme_status(status_json, status_str); 3881 3882 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3883 } 3884 spdk_json_write_object_end(w); 3885 3886 spdk_json_write_named_object_begin(w, "status_code"); 3887 for (sct = 0; sct < 4; sct++) { 3888 status.sct = sct; 3889 for (sc = 0; sc < 256; sc++) { 3890 if (nbdev->err_stat->status[sct][sc] == 0) { 3891 continue; 3892 } 3893 status.sc = sc; 3894 3895 status_str = spdk_nvme_cpl_get_status_string(&status); 3896 assert(status_str != NULL); 3897 bdev_nvme_format_nvme_status(status_json, status_str); 3898 3899 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3900 } 3901 } 3902 spdk_json_write_object_end(w); 3903 3904 spdk_json_write_object_end(w); 3905 } 3906 3907 static bool 3908 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3909 { 3910 struct nvme_bdev *nbdev = ctx; 3911 struct spdk_nvme_ctrlr *ctrlr; 3912 3913 if (!g_opts.allow_accel_sequence) { 3914 return false; 3915 } 3916 3917 switch (type) { 3918 case SPDK_BDEV_IO_TYPE_WRITE: 3919 case SPDK_BDEV_IO_TYPE_READ: 3920 break; 3921 default: 3922 return false; 3923 } 3924 3925 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3926 assert(ctrlr != NULL); 3927 3928 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3929 } 3930 3931 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3932 .destruct = bdev_nvme_destruct, 3933 .submit_request = bdev_nvme_submit_request, 3934 .io_type_supported = bdev_nvme_io_type_supported, 3935 .get_io_channel = bdev_nvme_get_io_channel, 3936 .dump_info_json = bdev_nvme_dump_info_json, 3937 .write_config_json = bdev_nvme_write_config_json, 3938 .get_spin_time = bdev_nvme_get_spin_time, 3939 .get_module_ctx = bdev_nvme_get_module_ctx, 3940 .get_memory_domains = bdev_nvme_get_memory_domains, 3941 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3942 .reset_device_stat = bdev_nvme_reset_device_stat, 3943 .dump_device_stat_json = 
bdev_nvme_dump_device_stat_json, 3944 }; 3945 3946 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3947 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3948 3949 static int 3950 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3951 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3952 { 3953 struct spdk_nvme_ana_group_descriptor *copied_desc; 3954 uint8_t *orig_desc; 3955 uint32_t i, desc_size, copy_len; 3956 int rc = 0; 3957 3958 if (nvme_ctrlr->ana_log_page == NULL) { 3959 return -EINVAL; 3960 } 3961 3962 copied_desc = nvme_ctrlr->copied_ana_desc; 3963 3964 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3965 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3966 3967 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3968 memcpy(copied_desc, orig_desc, copy_len); 3969 3970 rc = cb_fn(copied_desc, cb_arg); 3971 if (rc != 0) { 3972 break; 3973 } 3974 3975 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3976 copied_desc->num_of_nsid * sizeof(uint32_t); 3977 orig_desc += desc_size; 3978 copy_len -= desc_size; 3979 } 3980 3981 return rc; 3982 } 3983 3984 static int 3985 nvme_ns_ana_transition_timedout(void *ctx) 3986 { 3987 struct nvme_ns *nvme_ns = ctx; 3988 3989 spdk_poller_unregister(&nvme_ns->anatt_timer); 3990 nvme_ns->ana_transition_timedout = true; 3991 3992 return SPDK_POLLER_BUSY; 3993 } 3994 3995 static void 3996 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3997 const struct spdk_nvme_ana_group_descriptor *desc) 3998 { 3999 const struct spdk_nvme_ctrlr_data *cdata; 4000 4001 nvme_ns->ana_group_id = desc->ana_group_id; 4002 nvme_ns->ana_state = desc->ana_state; 4003 nvme_ns->ana_state_updating = false; 4004 4005 switch (nvme_ns->ana_state) { 4006 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4007 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4008 nvme_ns->ana_transition_timedout = false; 4009 spdk_poller_unregister(&nvme_ns->anatt_timer); 4010 break; 4011 4012 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4013 case SPDK_NVME_ANA_CHANGE_STATE: 4014 if (nvme_ns->anatt_timer != NULL) { 4015 break; 4016 } 4017 4018 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4019 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4020 nvme_ns, 4021 cdata->anatt * SPDK_SEC_TO_USEC); 4022 break; 4023 default: 4024 break; 4025 } 4026 } 4027 4028 static int 4029 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4030 { 4031 struct nvme_ns *nvme_ns = cb_arg; 4032 uint32_t i; 4033 4034 assert(nvme_ns->ns != NULL); 4035 4036 for (i = 0; i < desc->num_of_nsid; i++) { 4037 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4038 continue; 4039 } 4040 4041 _nvme_ns_set_ana_state(nvme_ns, desc); 4042 return 1; 4043 } 4044 4045 return 0; 4046 } 4047 4048 static int 4049 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4050 { 4051 int rc = 0; 4052 struct spdk_uuid new_uuid, namespace_uuid; 4053 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4054 /* This namespace UUID was generated using uuid_generate() method. 
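 * It serves as the fixed "namespace" input for the SHA-1 (UUIDv5-style)
 * derivation below: the controller serial number and NSID are concatenated
 * and hashed with spdk_uuid_generate_sha1(), so a given serial/NSID pair
 * (for illustration only, sn "SPDK0001" and nsid 1 merge into "SPDK00011")
 * always maps to the same bdev UUID across application restarts.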
*/ 4055 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4056 int size; 4057 4058 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4059 4060 spdk_uuid_set_null(&new_uuid); 4061 spdk_uuid_set_null(&namespace_uuid); 4062 4063 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4064 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4065 return -EINVAL; 4066 } 4067 4068 spdk_uuid_parse(&namespace_uuid, namespace_str); 4069 4070 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4071 if (rc == 0) { 4072 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4073 } 4074 4075 return rc; 4076 } 4077 4078 static int 4079 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4080 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4081 uint32_t prchk_flags, void *ctx) 4082 { 4083 const struct spdk_uuid *uuid; 4084 const uint8_t *nguid; 4085 const struct spdk_nvme_ctrlr_data *cdata; 4086 const struct spdk_nvme_ns_data *nsdata; 4087 const struct spdk_nvme_ctrlr_opts *opts; 4088 enum spdk_nvme_csi csi; 4089 uint32_t atomic_bs, phys_bs, bs; 4090 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4091 int rc; 4092 4093 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4094 csi = spdk_nvme_ns_get_csi(ns); 4095 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4096 4097 switch (csi) { 4098 case SPDK_NVME_CSI_NVM: 4099 disk->product_name = "NVMe disk"; 4100 break; 4101 case SPDK_NVME_CSI_ZNS: 4102 disk->product_name = "NVMe ZNS disk"; 4103 disk->zoned = true; 4104 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4105 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4106 spdk_nvme_ns_get_extended_sector_size(ns); 4107 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4108 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4109 break; 4110 default: 4111 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4112 return -ENOTSUP; 4113 } 4114 4115 nguid = spdk_nvme_ns_get_nguid(ns); 4116 if (!nguid) { 4117 uuid = spdk_nvme_ns_get_uuid(ns); 4118 if (uuid) { 4119 disk->uuid = *uuid; 4120 } else if (g_opts.generate_uuids) { 4121 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4122 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4123 if (rc < 0) { 4124 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4125 return rc; 4126 } 4127 } 4128 } else { 4129 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4130 } 4131 4132 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4133 if (!disk->name) { 4134 return -ENOMEM; 4135 } 4136 4137 disk->write_cache = 0; 4138 if (cdata->vwc.present) { 4139 /* Enable if the Volatile Write Cache exists */ 4140 disk->write_cache = 1; 4141 } 4142 if (cdata->oncs.write_zeroes) { 4143 disk->max_write_zeroes = UINT16_MAX + 1; 4144 } 4145 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4146 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4147 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4148 disk->ctratt.raw = cdata->ctratt.raw; 4149 /* NVMe driver will split one request into multiple requests 4150 * based on MDTS and stripe boundary, the bdev layer will use 4151 * max_segment_size and max_num_segments to split one big IO 4152 * into multiple requests, then small request can't run out 4153 * of NVMe internal requests data structure. 
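 * Capping max_num_segments at half of io_queue_requests below appears to be
 * a safety margin: each child request produced by splitting consumes one
 * entry of the qpair's request pool, so bounding the per-I/O segment count
 * keeps a single large I/O from exhausting that pool.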
4154 */ 4155 if (opts && opts->io_queue_requests) { 4156 disk->max_num_segments = opts->io_queue_requests / 2; 4157 } 4158 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4159 /* The nvme driver will try to split I/O that have too many 4160 * SGEs, but it doesn't work if that last SGE doesn't end on 4161 * an aggregate total that is block aligned. The bdev layer has 4162 * a more robust splitting framework, so use that instead for 4163 * this case. (See issue #3269.) 4164 */ 4165 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4166 4167 if (disk->max_num_segments == 0) { 4168 disk->max_num_segments = max_sges; 4169 } else { 4170 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4171 } 4172 } 4173 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4174 4175 nsdata = spdk_nvme_ns_get_data(ns); 4176 bs = spdk_nvme_ns_get_sector_size(ns); 4177 atomic_bs = bs; 4178 phys_bs = bs; 4179 if (nsdata->nabo == 0) { 4180 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4181 atomic_bs = bs * (1 + nsdata->nawupf); 4182 } else { 4183 atomic_bs = bs * (1 + cdata->awupf); 4184 } 4185 } 4186 if (nsdata->nsfeat.optperf) { 4187 phys_bs = bs * (1 + nsdata->npwg); 4188 } 4189 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4190 4191 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4192 if (disk->md_len != 0) { 4193 disk->md_interleave = nsdata->flbas.extended; 4194 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4195 if (disk->dif_type != SPDK_DIF_DISABLE) { 4196 disk->dif_is_head_of_md = nsdata->dps.md_start; 4197 disk->dif_check_flags = prchk_flags; 4198 } 4199 } 4200 4201 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4202 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4203 disk->acwu = 0; 4204 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4205 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4206 } else { 4207 disk->acwu = cdata->acwu + 1; /* 0-based */ 4208 } 4209 4210 if (cdata->oncs.copy) { 4211 /* For now bdev interface allows only single segment copy */ 4212 disk->max_copy = nsdata->mssrl; 4213 } 4214 4215 disk->ctxt = ctx; 4216 disk->fn_table = &nvmelib_fn_table; 4217 disk->module = &nvme_if; 4218 4219 return 0; 4220 } 4221 4222 static struct nvme_bdev * 4223 nvme_bdev_alloc(void) 4224 { 4225 struct nvme_bdev *bdev; 4226 int rc; 4227 4228 bdev = calloc(1, sizeof(*bdev)); 4229 if (!bdev) { 4230 SPDK_ERRLOG("bdev calloc() failed\n"); 4231 return NULL; 4232 } 4233 4234 if (g_opts.nvme_error_stat) { 4235 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4236 if (!bdev->err_stat) { 4237 SPDK_ERRLOG("err_stat calloc() failed\n"); 4238 free(bdev); 4239 return NULL; 4240 } 4241 } 4242 4243 rc = pthread_mutex_init(&bdev->mutex, NULL); 4244 if (rc != 0) { 4245 free(bdev->err_stat); 4246 free(bdev); 4247 return NULL; 4248 } 4249 4250 bdev->ref = 1; 4251 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4252 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4253 bdev->rr_min_io = UINT32_MAX; 4254 TAILQ_INIT(&bdev->nvme_ns_list); 4255 4256 return bdev; 4257 } 4258 4259 static int 4260 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4261 { 4262 struct nvme_bdev *bdev; 4263 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4264 int rc; 4265 4266 bdev = nvme_bdev_alloc(); 4267 if (bdev == NULL) { 4268 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4269 return -ENOMEM; 4270 } 4271 4272 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4273 4274 rc = 
nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4275 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4276 if (rc != 0) { 4277 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4278 nvme_bdev_free(bdev); 4279 return rc; 4280 } 4281 4282 spdk_io_device_register(bdev, 4283 bdev_nvme_create_bdev_channel_cb, 4284 bdev_nvme_destroy_bdev_channel_cb, 4285 sizeof(struct nvme_bdev_channel), 4286 bdev->disk.name); 4287 4288 nvme_ns->bdev = bdev; 4289 bdev->nsid = nvme_ns->id; 4290 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4291 4292 bdev->nbdev_ctrlr = nbdev_ctrlr; 4293 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4294 4295 rc = spdk_bdev_register(&bdev->disk); 4296 if (rc != 0) { 4297 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4298 spdk_io_device_unregister(bdev, NULL); 4299 nvme_ns->bdev = NULL; 4300 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4301 nvme_bdev_free(bdev); 4302 return rc; 4303 } 4304 4305 return 0; 4306 } 4307 4308 static bool 4309 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4310 { 4311 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4312 const struct spdk_uuid *uuid1, *uuid2; 4313 4314 nsdata1 = spdk_nvme_ns_get_data(ns1); 4315 nsdata2 = spdk_nvme_ns_get_data(ns2); 4316 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4317 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4318 4319 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4320 nsdata1->eui64 == nsdata2->eui64 && 4321 ((uuid1 == NULL && uuid2 == NULL) || 4322 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4323 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4324 } 4325 4326 static bool 4327 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4328 struct spdk_nvme_ctrlr_opts *opts) 4329 { 4330 struct nvme_probe_skip_entry *entry; 4331 4332 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4333 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4334 return false; 4335 } 4336 } 4337 4338 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4339 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4340 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4341 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4342 opts->disable_read_ana_log_page = true; 4343 4344 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4345 4346 return true; 4347 } 4348 4349 static void 4350 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4351 { 4352 struct nvme_ctrlr *nvme_ctrlr = ctx; 4353 4354 if (spdk_nvme_cpl_is_error(cpl)) { 4355 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4356 cpl->status.sct); 4357 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4358 } else if (cpl->cdw0 & 0x1) { 4359 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4360 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4361 } 4362 } 4363 4364 static void 4365 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4366 struct spdk_nvme_qpair *qpair, uint16_t cid) 4367 { 4368 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4369 union spdk_nvme_csts_register csts; 4370 int rc; 4371 4372 assert(nvme_ctrlr->ctrlr == ctrlr); 4373 4374 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4375 4376 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4377 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) 
Otherwise we 4378 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4379 * completion recursively. 4380 */ 4381 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4382 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4383 if (csts.bits.cfs) { 4384 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4385 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4386 return; 4387 } 4388 } 4389 4390 switch (g_opts.action_on_timeout) { 4391 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4392 if (qpair) { 4393 /* Don't send abort to ctrlr when ctrlr is not available. */ 4394 pthread_mutex_lock(&nvme_ctrlr->mutex); 4395 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4396 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4397 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4398 return; 4399 } 4400 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4401 4402 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4403 nvme_abort_cpl, nvme_ctrlr); 4404 if (rc == 0) { 4405 return; 4406 } 4407 4408 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4409 } 4410 4411 /* FALLTHROUGH */ 4412 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4413 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4414 break; 4415 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4416 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4417 break; 4418 default: 4419 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4420 break; 4421 } 4422 } 4423 4424 static struct nvme_ns * 4425 nvme_ns_alloc(void) 4426 { 4427 struct nvme_ns *nvme_ns; 4428 4429 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4430 if (nvme_ns == NULL) { 4431 return NULL; 4432 } 4433 4434 if (g_opts.io_path_stat) { 4435 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4436 if (nvme_ns->stat == NULL) { 4437 free(nvme_ns); 4438 return NULL; 4439 } 4440 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4441 } 4442 4443 return nvme_ns; 4444 } 4445 4446 static void 4447 nvme_ns_free(struct nvme_ns *nvme_ns) 4448 { 4449 free(nvme_ns->stat); 4450 free(nvme_ns); 4451 } 4452 4453 static void 4454 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4455 { 4456 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4457 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4458 4459 if (rc == 0) { 4460 nvme_ns->probe_ctx = NULL; 4461 pthread_mutex_lock(&nvme_ctrlr->mutex); 4462 nvme_ctrlr->ref++; 4463 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4464 } else { 4465 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4466 nvme_ns_free(nvme_ns); 4467 } 4468 4469 if (ctx) { 4470 ctx->populates_in_progress--; 4471 if (ctx->populates_in_progress == 0) { 4472 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4473 } 4474 } 4475 } 4476 4477 static void 4478 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4479 { 4480 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4481 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4482 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4483 int rc; 4484 4485 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4486 if (rc != 0) { 4487 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4488 } 4489 4490 spdk_for_each_channel_continue(i, rc); 4491 } 4492 4493 static void 4494 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4495 { 4496 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4497 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4498 struct 
nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4499 struct nvme_io_path *io_path; 4500 4501 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4502 if (io_path != NULL) { 4503 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4504 } 4505 4506 spdk_for_each_channel_continue(i, 0); 4507 } 4508 4509 static void 4510 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4511 { 4512 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4513 4514 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4515 } 4516 4517 static void 4518 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4519 { 4520 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4521 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4522 4523 if (status == 0) { 4524 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4525 } else { 4526 /* Delete the added io_paths and fail populating the namespace. */ 4527 spdk_for_each_channel(bdev, 4528 bdev_nvme_delete_io_path, 4529 nvme_ns, 4530 bdev_nvme_add_io_path_failed); 4531 } 4532 } 4533 4534 static int 4535 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4536 { 4537 struct nvme_ns *tmp_ns; 4538 const struct spdk_nvme_ns_data *nsdata; 4539 4540 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4541 if (!nsdata->nmic.can_share) { 4542 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4543 return -EINVAL; 4544 } 4545 4546 pthread_mutex_lock(&bdev->mutex); 4547 4548 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4549 assert(tmp_ns != NULL); 4550 4551 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4552 pthread_mutex_unlock(&bdev->mutex); 4553 SPDK_ERRLOG("Namespaces are not identical.\n"); 4554 return -EINVAL; 4555 } 4556 4557 bdev->ref++; 4558 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4559 nvme_ns->bdev = bdev; 4560 4561 pthread_mutex_unlock(&bdev->mutex); 4562 4563 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
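 * spdk_for_each_channel() below visits every existing nvme_bdev_channel on
 * its owning thread. If adding the io_path fails on any channel,
 * bdev_nvme_add_io_path_done() rolls back the paths that were already added
 * and namespace population fails with an error.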
*/ 4564 spdk_for_each_channel(bdev, 4565 bdev_nvme_add_io_path, 4566 nvme_ns, 4567 bdev_nvme_add_io_path_done); 4568 4569 return 0; 4570 } 4571 4572 static void 4573 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4574 { 4575 struct spdk_nvme_ns *ns; 4576 struct nvme_bdev *bdev; 4577 int rc = 0; 4578 4579 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4580 if (!ns) { 4581 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4582 rc = -EINVAL; 4583 goto done; 4584 } 4585 4586 nvme_ns->ns = ns; 4587 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4588 4589 if (nvme_ctrlr->ana_log_page != NULL) { 4590 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4591 } 4592 4593 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4594 if (bdev == NULL) { 4595 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4596 } else { 4597 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4598 if (rc == 0) { 4599 return; 4600 } 4601 } 4602 done: 4603 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4604 } 4605 4606 static void 4607 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4608 { 4609 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4610 4611 assert(nvme_ctrlr != NULL); 4612 4613 pthread_mutex_lock(&nvme_ctrlr->mutex); 4614 4615 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4616 4617 if (nvme_ns->bdev != NULL) { 4618 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4619 return; 4620 } 4621 4622 nvme_ns_free(nvme_ns); 4623 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4624 4625 nvme_ctrlr_release(nvme_ctrlr); 4626 } 4627 4628 static void 4629 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4630 { 4631 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4632 4633 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4634 } 4635 4636 static void 4637 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4638 { 4639 struct nvme_bdev *bdev; 4640 4641 spdk_poller_unregister(&nvme_ns->anatt_timer); 4642 4643 bdev = nvme_ns->bdev; 4644 if (bdev != NULL) { 4645 pthread_mutex_lock(&bdev->mutex); 4646 4647 assert(bdev->ref > 0); 4648 bdev->ref--; 4649 if (bdev->ref == 0) { 4650 pthread_mutex_unlock(&bdev->mutex); 4651 4652 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4653 } else { 4654 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4655 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4656 * and clear nvme_ns->bdev here. 4657 */ 4658 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4659 nvme_ns->bdev = NULL; 4660 4661 pthread_mutex_unlock(&bdev->mutex); 4662 4663 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4664 * we call depopulate_namespace_done() to avoid use-after-free. 4665 */ 4666 spdk_for_each_channel(bdev, 4667 bdev_nvme_delete_io_path, 4668 nvme_ns, 4669 bdev_nvme_delete_io_path_done); 4670 return; 4671 } 4672 } 4673 4674 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4675 } 4676 4677 static void 4678 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4679 struct nvme_async_probe_ctx *ctx) 4680 { 4681 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4682 struct nvme_ns *nvme_ns, *next; 4683 struct spdk_nvme_ns *ns; 4684 struct nvme_bdev *bdev; 4685 uint32_t nsid; 4686 int rc; 4687 uint64_t num_sectors; 4688 4689 if (ctx) { 4690 /* Initialize this count to 1 to handle the populate functions 4691 * calling nvme_ctrlr_populate_namespace_done() immediately. 
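 * The count behaves like a reference count: the extra 1 taken here is
 * dropped only after the enumeration loop below has finished, so the done
 * callback cannot run while namespaces are still being iterated, even when
 * every populate call completes synchronously.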
4692 */ 4693 ctx->populates_in_progress = 1; 4694 } 4695 4696 /* First loop over our existing namespaces and see if they have been 4697 * removed. */ 4698 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4699 while (nvme_ns != NULL) { 4700 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4701 4702 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4703 /* NS is still there or added again. Its attributes may have changed. */ 4704 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4705 if (nvme_ns->ns != ns) { 4706 assert(nvme_ns->ns == NULL); 4707 nvme_ns->ns = ns; 4708 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4709 } 4710 4711 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4712 bdev = nvme_ns->bdev; 4713 assert(bdev != NULL); 4714 if (bdev->disk.blockcnt != num_sectors) { 4715 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4716 nvme_ns->id, 4717 bdev->disk.name, 4718 bdev->disk.blockcnt, 4719 num_sectors); 4720 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4721 if (rc != 0) { 4722 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4723 bdev->disk.name, rc); 4724 } 4725 } 4726 } else { 4727 /* Namespace was removed */ 4728 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4729 } 4730 4731 nvme_ns = next; 4732 } 4733 4734 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4735 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4736 while (nsid != 0) { 4737 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4738 4739 if (nvme_ns == NULL) { 4740 /* Found a new one */ 4741 nvme_ns = nvme_ns_alloc(); 4742 if (nvme_ns == NULL) { 4743 SPDK_ERRLOG("Failed to allocate namespace\n"); 4744 /* This just fails to attach the namespace. It may work on a future attempt. */ 4745 continue; 4746 } 4747 4748 nvme_ns->id = nsid; 4749 nvme_ns->ctrlr = nvme_ctrlr; 4750 4751 nvme_ns->bdev = NULL; 4752 4753 if (ctx) { 4754 ctx->populates_in_progress++; 4755 } 4756 nvme_ns->probe_ctx = ctx; 4757 4758 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4759 4760 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4761 } 4762 4763 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4764 } 4765 4766 if (ctx) { 4767 /* Decrement this count now that the loop is over to account 4768 * for the one we started with. If the count is then 0, we 4769 * know any populate_namespace functions completed immediately, 4770 * so we'll kick the callback here. 
4771 */ 4772 ctx->populates_in_progress--; 4773 if (ctx->populates_in_progress == 0) { 4774 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4775 } 4776 } 4777 4778 } 4779 4780 static void 4781 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4782 { 4783 struct nvme_ns *nvme_ns, *tmp; 4784 4785 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4786 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4787 } 4788 } 4789 4790 static uint32_t 4791 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4792 { 4793 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4794 const struct spdk_nvme_ctrlr_data *cdata; 4795 uint32_t nsid, ns_count = 0; 4796 4797 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4798 4799 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4800 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4801 ns_count++; 4802 } 4803 4804 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4805 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4806 sizeof(uint32_t); 4807 } 4808 4809 static int 4810 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4811 void *cb_arg) 4812 { 4813 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4814 struct nvme_ns *nvme_ns; 4815 uint32_t i, nsid; 4816 4817 for (i = 0; i < desc->num_of_nsid; i++) { 4818 nsid = desc->nsid[i]; 4819 if (nsid == 0) { 4820 continue; 4821 } 4822 4823 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4824 4825 assert(nvme_ns != NULL); 4826 if (nvme_ns == NULL) { 4827 /* Target told us that an inactive namespace had an ANA change */ 4828 continue; 4829 } 4830 4831 _nvme_ns_set_ana_state(nvme_ns, desc); 4832 } 4833 4834 return 0; 4835 } 4836 4837 static void 4838 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4839 { 4840 struct nvme_ns *nvme_ns; 4841 4842 spdk_free(nvme_ctrlr->ana_log_page); 4843 nvme_ctrlr->ana_log_page = NULL; 4844 4845 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4846 nvme_ns != NULL; 4847 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4848 nvme_ns->ana_state_updating = false; 4849 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4850 } 4851 } 4852 4853 static void 4854 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4855 { 4856 struct nvme_ctrlr *nvme_ctrlr = ctx; 4857 4858 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4859 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4860 nvme_ctrlr); 4861 } else { 4862 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4863 } 4864 4865 pthread_mutex_lock(&nvme_ctrlr->mutex); 4866 4867 assert(nvme_ctrlr->ana_log_page_updating == true); 4868 nvme_ctrlr->ana_log_page_updating = false; 4869 4870 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4871 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4872 4873 nvme_ctrlr_unregister(nvme_ctrlr); 4874 } else { 4875 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4876 4877 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4878 } 4879 } 4880 4881 static int 4882 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4883 { 4884 uint32_t ana_log_page_size; 4885 int rc; 4886 4887 if (nvme_ctrlr->ana_log_page == NULL) { 4888 return -EINVAL; 4889 } 4890 4891 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4892 4893 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4894 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4895 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4896 
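/* This is expected to be a defensive check only: the buffer was sized at
 * init time from cdata->nanagrpid and cdata->mnan, so the limit should not
 * be exceeded unless the controller reports more active namespaces than its
 * stated maximum.
 */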
return -EINVAL; 4897 } 4898 4899 pthread_mutex_lock(&nvme_ctrlr->mutex); 4900 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4901 nvme_ctrlr->ana_log_page_updating) { 4902 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4903 return -EBUSY; 4904 } 4905 4906 nvme_ctrlr->ana_log_page_updating = true; 4907 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4908 4909 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4910 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4911 SPDK_NVME_GLOBAL_NS_TAG, 4912 nvme_ctrlr->ana_log_page, 4913 ana_log_page_size, 0, 4914 nvme_ctrlr_read_ana_log_page_done, 4915 nvme_ctrlr); 4916 if (rc != 0) { 4917 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4918 } 4919 4920 return rc; 4921 } 4922 4923 static void 4924 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4925 { 4926 } 4927 4928 struct bdev_nvme_set_preferred_path_ctx { 4929 struct spdk_bdev_desc *desc; 4930 struct nvme_ns *nvme_ns; 4931 bdev_nvme_set_preferred_path_cb cb_fn; 4932 void *cb_arg; 4933 }; 4934 4935 static void 4936 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4937 { 4938 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4939 4940 assert(ctx != NULL); 4941 assert(ctx->desc != NULL); 4942 assert(ctx->cb_fn != NULL); 4943 4944 spdk_bdev_close(ctx->desc); 4945 4946 ctx->cb_fn(ctx->cb_arg, status); 4947 4948 free(ctx); 4949 } 4950 4951 static void 4952 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4953 { 4954 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4955 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4956 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4957 struct nvme_io_path *io_path, *prev; 4958 4959 prev = NULL; 4960 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4961 if (io_path->nvme_ns == ctx->nvme_ns) { 4962 break; 4963 } 4964 prev = io_path; 4965 } 4966 4967 if (io_path != NULL) { 4968 if (prev != NULL) { 4969 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4970 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4971 } 4972 4973 /* We can set io_path to nbdev_ch->current_io_path directly here. 4974 * However, it needs to be conditional. To simplify the code, 4975 * just clear nbdev_ch->current_io_path and let find_io_path() 4976 * fill it. 4977 * 4978 * Automatic failback may be disabled. Hence even if the io_path is 4979 * already at the head, clear nbdev_ch->current_io_path. 4980 */ 4981 bdev_nvme_clear_current_io_path(nbdev_ch); 4982 } 4983 4984 spdk_for_each_channel_continue(i, 0); 4985 } 4986 4987 static struct nvme_ns * 4988 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4989 { 4990 struct nvme_ns *nvme_ns, *prev; 4991 const struct spdk_nvme_ctrlr_data *cdata; 4992 4993 prev = NULL; 4994 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4995 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4996 4997 if (cdata->cntlid == cntlid) { 4998 break; 4999 } 5000 prev = nvme_ns; 5001 } 5002 5003 if (nvme_ns != NULL && prev != NULL) { 5004 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5005 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5006 } 5007 5008 return nvme_ns; 5009 } 5010 5011 /* This function supports only multipath mode. There is only a single I/O path 5012 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5013 * head of the I/O path list for each NVMe bdev channel. 
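 * Moving a path to the head is enough because, presumably, the
 * active/passive selector walks io_path_list in order and takes the first
 * usable entry, so the head entry becomes the preferred path once the
 * cached current_io_path is cleared below.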
5014 * 5015 * NVMe bdev channel may be acquired after completing this function. move the 5016 * matched namespace to the head of the namespace list for the NVMe bdev too. 5017 */ 5018 void 5019 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5020 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5021 { 5022 struct bdev_nvme_set_preferred_path_ctx *ctx; 5023 struct spdk_bdev *bdev; 5024 struct nvme_bdev *nbdev; 5025 int rc = 0; 5026 5027 assert(cb_fn != NULL); 5028 5029 ctx = calloc(1, sizeof(*ctx)); 5030 if (ctx == NULL) { 5031 SPDK_ERRLOG("Failed to alloc context.\n"); 5032 rc = -ENOMEM; 5033 goto err_alloc; 5034 } 5035 5036 ctx->cb_fn = cb_fn; 5037 ctx->cb_arg = cb_arg; 5038 5039 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5040 if (rc != 0) { 5041 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5042 goto err_open; 5043 } 5044 5045 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5046 5047 if (bdev->module != &nvme_if) { 5048 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5049 rc = -ENODEV; 5050 goto err_bdev; 5051 } 5052 5053 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5054 5055 pthread_mutex_lock(&nbdev->mutex); 5056 5057 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5058 if (ctx->nvme_ns == NULL) { 5059 pthread_mutex_unlock(&nbdev->mutex); 5060 5061 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5062 rc = -ENODEV; 5063 goto err_bdev; 5064 } 5065 5066 pthread_mutex_unlock(&nbdev->mutex); 5067 5068 spdk_for_each_channel(nbdev, 5069 _bdev_nvme_set_preferred_path, 5070 ctx, 5071 bdev_nvme_set_preferred_path_done); 5072 return; 5073 5074 err_bdev: 5075 spdk_bdev_close(ctx->desc); 5076 err_open: 5077 free(ctx); 5078 err_alloc: 5079 cb_fn(cb_arg, rc); 5080 } 5081 5082 struct bdev_nvme_set_multipath_policy_ctx { 5083 struct spdk_bdev_desc *desc; 5084 bdev_nvme_set_multipath_policy_cb cb_fn; 5085 void *cb_arg; 5086 }; 5087 5088 static void 5089 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5090 { 5091 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5092 5093 assert(ctx != NULL); 5094 assert(ctx->desc != NULL); 5095 assert(ctx->cb_fn != NULL); 5096 5097 spdk_bdev_close(ctx->desc); 5098 5099 ctx->cb_fn(ctx->cb_arg, status); 5100 5101 free(ctx); 5102 } 5103 5104 static void 5105 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5106 { 5107 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5108 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5109 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5110 5111 nbdev_ch->mp_policy = nbdev->mp_policy; 5112 nbdev_ch->mp_selector = nbdev->mp_selector; 5113 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5114 bdev_nvme_clear_current_io_path(nbdev_ch); 5115 5116 spdk_for_each_channel_continue(i, 0); 5117 } 5118 5119 void 5120 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5121 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5122 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5123 { 5124 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5125 struct spdk_bdev *bdev; 5126 struct nvme_bdev *nbdev; 5127 int rc; 5128 5129 assert(cb_fn != NULL); 5130 5131 switch (policy) { 5132 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5133 break; 5134 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5135 switch (selector) { 5136 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5137 if (rr_min_io 
== UINT32_MAX) { 5138 rr_min_io = 1; 5139 } else if (rr_min_io == 0) { 5140 rc = -EINVAL; 5141 goto exit; 5142 } 5143 break; 5144 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5145 break; 5146 default: 5147 rc = -EINVAL; 5148 goto exit; 5149 } 5150 break; 5151 default: 5152 rc = -EINVAL; 5153 goto exit; 5154 } 5155 5156 ctx = calloc(1, sizeof(*ctx)); 5157 if (ctx == NULL) { 5158 SPDK_ERRLOG("Failed to alloc context.\n"); 5159 rc = -ENOMEM; 5160 goto exit; 5161 } 5162 5163 ctx->cb_fn = cb_fn; 5164 ctx->cb_arg = cb_arg; 5165 5166 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5167 if (rc != 0) { 5168 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5169 rc = -ENODEV; 5170 goto err_open; 5171 } 5172 5173 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5174 if (bdev->module != &nvme_if) { 5175 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5176 rc = -ENODEV; 5177 goto err_module; 5178 } 5179 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5180 5181 pthread_mutex_lock(&nbdev->mutex); 5182 nbdev->mp_policy = policy; 5183 nbdev->mp_selector = selector; 5184 nbdev->rr_min_io = rr_min_io; 5185 pthread_mutex_unlock(&nbdev->mutex); 5186 5187 spdk_for_each_channel(nbdev, 5188 _bdev_nvme_set_multipath_policy, 5189 ctx, 5190 bdev_nvme_set_multipath_policy_done); 5191 return; 5192 5193 err_module: 5194 spdk_bdev_close(ctx->desc); 5195 err_open: 5196 free(ctx); 5197 exit: 5198 cb_fn(cb_arg, rc); 5199 } 5200 5201 static void 5202 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5203 { 5204 struct nvme_ctrlr *nvme_ctrlr = arg; 5205 union spdk_nvme_async_event_completion event; 5206 5207 if (spdk_nvme_cpl_is_error(cpl)) { 5208 SPDK_WARNLOG("AER request execute failed\n"); 5209 return; 5210 } 5211 5212 event.raw = cpl->cdw0; 5213 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5214 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5215 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5216 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5217 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5218 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5219 } 5220 } 5221 5222 static void 5223 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5224 { 5225 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5226 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5227 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5228 free(ctx); 5229 } 5230 5231 static void 5232 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5233 { 5234 if (ctx->cb_fn) { 5235 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5236 } 5237 5238 ctx->namespaces_populated = true; 5239 if (ctx->probe_done) { 5240 /* The probe was already completed, so we need to free the context 5241 * here. This can happen for cases like OCSSD, where we need to 5242 * send additional commands to the SSD after attach. 
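 * In other words, probe_done and namespaces_populated form a simple
 * two-flag handshake: bdev_nvme_async_poll() sets probe_done when the
 * attach poll finishes, this path sets namespaces_populated, and whichever
 * side finishes last frees the context, so neither can touch it after free.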
5243 */ 5244 free_nvme_async_probe_ctx(ctx); 5245 } 5246 } 5247 5248 static void 5249 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5250 struct nvme_async_probe_ctx *ctx) 5251 { 5252 spdk_io_device_register(nvme_ctrlr, 5253 bdev_nvme_create_ctrlr_channel_cb, 5254 bdev_nvme_destroy_ctrlr_channel_cb, 5255 sizeof(struct nvme_ctrlr_channel), 5256 nvme_ctrlr->nbdev_ctrlr->name); 5257 5258 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5259 } 5260 5261 static void 5262 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5263 { 5264 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5265 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5266 5267 nvme_ctrlr->probe_ctx = NULL; 5268 5269 if (spdk_nvme_cpl_is_error(cpl)) { 5270 nvme_ctrlr_delete(nvme_ctrlr); 5271 5272 if (ctx != NULL) { 5273 ctx->reported_bdevs = 0; 5274 populate_namespaces_cb(ctx, -1); 5275 } 5276 return; 5277 } 5278 5279 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5280 } 5281 5282 static int 5283 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5284 struct nvme_async_probe_ctx *ctx) 5285 { 5286 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5287 const struct spdk_nvme_ctrlr_data *cdata; 5288 uint32_t ana_log_page_size; 5289 5290 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5291 5292 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5293 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5294 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5295 sizeof(uint32_t); 5296 5297 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5298 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5299 if (nvme_ctrlr->ana_log_page == NULL) { 5300 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5301 return -ENXIO; 5302 } 5303 5304 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5305 * Hence copy each descriptor to a temporary area when parsing it. 5306 * 5307 * Allocate a buffer whose size is as large as ANA log page buffer because 5308 * we do not know the size of a descriptor until actually reading it. 5309 */ 5310 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5311 if (nvme_ctrlr->copied_ana_desc == NULL) { 5312 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5313 return -ENOMEM; 5314 } 5315 5316 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5317 5318 nvme_ctrlr->probe_ctx = ctx; 5319 5320 /* Then, set the read size only to include the current active namespaces. */ 5321 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5322 5323 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5324 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5325 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5326 return -EINVAL; 5327 } 5328 5329 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5330 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5331 SPDK_NVME_GLOBAL_NS_TAG, 5332 nvme_ctrlr->ana_log_page, 5333 ana_log_page_size, 0, 5334 nvme_ctrlr_init_ana_log_page_done, 5335 nvme_ctrlr); 5336 } 5337 5338 /* hostnqn and subnqn were already verified before attaching a controller. 5339 * Hence check only the multipath capability and cntlid here. 
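 * Both the already-registered controllers and the newly attached one must
 * report CMIC.multi_ctrlr, and a duplicate CNTLID is rejected because it
 * presumably means the same controller was attached twice rather than a
 * separate path to the subsystem.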
5340 */
5341 static bool
5342 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
5343 {
5344 struct nvme_ctrlr *tmp;
5345 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
5346
5347 cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5348
5349 if (!cdata->cmic.multi_ctrlr) {
5350 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5351 return false;
5352 }
5353
5354 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
5355 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
5356
5357 if (!tmp_cdata->cmic.multi_ctrlr) {
5358 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5359 return false;
5360 }
5361 if (cdata->cntlid == tmp_cdata->cntlid) {
5362 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid);
5363 return false;
5364 }
5365 }
5366
5367 return true;
5368 }
5369
5370 static int
5371 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
5372 {
5373 struct nvme_bdev_ctrlr *nbdev_ctrlr;
5374 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5375 int rc = 0;
5376
5377 pthread_mutex_lock(&g_bdev_nvme_mutex);
5378
5379 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
5380 if (nbdev_ctrlr != NULL) {
5381 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
5382 rc = -EINVAL;
5383 goto exit;
5384 }
5385 } else {
5386 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
5387 if (nbdev_ctrlr == NULL) {
5388 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n");
5389 rc = -ENOMEM;
5390 goto exit;
5391 }
5392 nbdev_ctrlr->name = strdup(name);
5393 if (nbdev_ctrlr->name == NULL) {
5394 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n");
5395 rc = -ENOMEM; free(nbdev_ctrlr);
5396 goto exit;
5397 }
5398 TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
5399 TAILQ_INIT(&nbdev_ctrlr->bdevs);
5400 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
5401 }
5402 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
5403 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
5404 exit:
5405 pthread_mutex_unlock(&g_bdev_nvme_mutex);
5406 return rc;
5407 }
5408
5409 static int
5410 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
5411 const char *name,
5412 const struct spdk_nvme_transport_id *trid,
5413 struct nvme_async_probe_ctx *ctx)
5414 {
5415 struct nvme_ctrlr *nvme_ctrlr;
5416 struct nvme_path_id *path_id;
5417 const struct spdk_nvme_ctrlr_data *cdata;
5418 int rc;
5419
5420 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
5421 if (nvme_ctrlr == NULL) {
5422 SPDK_ERRLOG("Failed to allocate device struct\n");
5423 return -ENOMEM;
5424 }
5425
5426 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
5427 if (rc != 0) {
5428 free(nvme_ctrlr);
5429 return rc;
5430 }
5431
5432 TAILQ_INIT(&nvme_ctrlr->trids);
5433 RB_INIT(&nvme_ctrlr->namespaces);
5434
5435 /* Get another reference to the key, so the first one can be released from probe_ctx */
5436 if (ctx != NULL) {
5437 if (ctx->drv_opts.tls_psk != NULL) {
5438 nvme_ctrlr->psk = spdk_keyring_get_key(
5439 spdk_key_get_name(ctx->drv_opts.tls_psk));
5440 if (nvme_ctrlr->psk == NULL) {
5441 /* Could only happen if the key was removed in the meantime */
5442 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5443 spdk_key_get_name(ctx->drv_opts.tls_psk));
5444 rc = -ENOKEY;
5445 goto err;
5446 }
5447 }
5448
5449 if (ctx->drv_opts.dhchap_key != NULL) {
5450 nvme_ctrlr->dhchap_key = spdk_keyring_get_key(
5451 spdk_key_get_name(ctx->drv_opts.dhchap_key));
5452 if (nvme_ctrlr->dhchap_key == NULL) {
5453 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5454 spdk_key_get_name(ctx->drv_opts.dhchap_key));
5455 rc = -ENOKEY;
5456 goto err;
5457 }
5458 }
5459
5460 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) {
5461 nvme_ctrlr->dhchap_ctrlr_key =
5462 spdk_keyring_get_key(
5463 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5464 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) {
5465 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5466 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5467 rc = -ENOKEY;
5468 goto err;
5469 }
5470 }
5471 }
5472
5473 path_id = calloc(1, sizeof(*path_id));
5474 if (path_id == NULL) {
5475 SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
5476 rc = -ENOMEM;
5477 goto err;
5478 }
5479
5480 path_id->trid = *trid;
5481 if (ctx != NULL) {
5482 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
5483 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
5484 }
5485 nvme_ctrlr->active_path_id = path_id;
5486 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
5487
5488 nvme_ctrlr->thread = spdk_get_thread();
5489 nvme_ctrlr->ctrlr = ctrlr;
5490 nvme_ctrlr->ref = 1;
5491
5492 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
5493 SPDK_ERRLOG("OCSSDs are not supported\n");
5494 rc = -ENOTSUP;
5495 goto err;
5496 }
5497
5498 if (ctx != NULL) {
5499 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts));
5500 } else {
5501 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts);
5502 }
5503
5504 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
5505 g_opts.nvme_adminq_poll_period_us);
5506
5507 if (g_opts.timeout_us > 0) {
5508 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
5509 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
5510 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
5511 g_opts.timeout_us : g_opts.timeout_admin_us; 5512 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5513 adm_timeout_us, timeout_cb, nvme_ctrlr); 5514 } 5515 5516 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5517 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5518 5519 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5520 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5521 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5522 } 5523 5524 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5525 if (rc != 0) { 5526 goto err; 5527 } 5528 5529 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5530 5531 if (cdata->cmic.ana_reporting) { 5532 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5533 if (rc == 0) { 5534 return 0; 5535 } 5536 } else { 5537 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5538 return 0; 5539 } 5540 5541 err: 5542 nvme_ctrlr_delete(nvme_ctrlr); 5543 return rc; 5544 } 5545 5546 void 5547 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5548 { 5549 opts->prchk_flags = 0; 5550 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5551 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5552 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5553 } 5554 5555 static void 5556 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5557 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5558 { 5559 char *name; 5560 5561 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5562 if (!name) { 5563 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5564 return; 5565 } 5566 5567 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5568 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5569 } else { 5570 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5571 } 5572 5573 free(name); 5574 } 5575 5576 static void 5577 _nvme_ctrlr_destruct(void *ctx) 5578 { 5579 struct nvme_ctrlr *nvme_ctrlr = ctx; 5580 5581 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5582 nvme_ctrlr_release(nvme_ctrlr); 5583 } 5584 5585 static int 5586 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5587 { 5588 struct nvme_probe_skip_entry *entry; 5589 5590 /* The controller's destruction was already started */ 5591 if (nvme_ctrlr->destruct) { 5592 return -EALREADY; 5593 } 5594 5595 if (!hotplug && 5596 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5597 entry = calloc(1, sizeof(*entry)); 5598 if (!entry) { 5599 return -ENOMEM; 5600 } 5601 entry->trid = nvme_ctrlr->active_path_id->trid; 5602 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5603 } 5604 5605 nvme_ctrlr->destruct = true; 5606 return 0; 5607 } 5608 5609 static int 5610 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5611 { 5612 int rc; 5613 5614 pthread_mutex_lock(&nvme_ctrlr->mutex); 5615 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5616 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5617 5618 if (rc == 0) { 5619 _nvme_ctrlr_destruct(nvme_ctrlr); 5620 } else if (rc == -EALREADY) { 5621 rc = 0; 5622 } 5623 5624 return rc; 5625 } 5626 5627 static void 5628 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5629 { 5630 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5631 5632 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5633 } 5634 5635 static int 5636 bdev_nvme_hotplug_probe(void *arg) 5637 { 5638 if (g_hotplug_probe_ctx == NULL) { 5639 spdk_poller_unregister(&g_hotplug_probe_poller); 5640 return 
SPDK_POLLER_IDLE; 5641 } 5642 5643 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5644 g_hotplug_probe_ctx = NULL; 5645 spdk_poller_unregister(&g_hotplug_probe_poller); 5646 } 5647 5648 return SPDK_POLLER_BUSY; 5649 } 5650 5651 static int 5652 bdev_nvme_hotplug(void *arg) 5653 { 5654 struct spdk_nvme_transport_id trid_pcie; 5655 5656 if (g_hotplug_probe_ctx) { 5657 return SPDK_POLLER_BUSY; 5658 } 5659 5660 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5661 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5662 5663 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5664 hotplug_probe_cb, attach_cb, NULL); 5665 5666 if (g_hotplug_probe_ctx) { 5667 assert(g_hotplug_probe_poller == NULL); 5668 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5669 } 5670 5671 return SPDK_POLLER_BUSY; 5672 } 5673 5674 void 5675 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5676 { 5677 *opts = g_opts; 5678 } 5679 5680 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5681 uint32_t reconnect_delay_sec, 5682 uint32_t fast_io_fail_timeout_sec); 5683 5684 static int 5685 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5686 { 5687 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5688 /* Can't set timeout_admin_us without also setting timeout_us */ 5689 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5690 return -EINVAL; 5691 } 5692 5693 if (opts->bdev_retry_count < -1) { 5694 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5695 return -EINVAL; 5696 } 5697 5698 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5699 opts->reconnect_delay_sec, 5700 opts->fast_io_fail_timeout_sec)) { 5701 return -EINVAL; 5702 } 5703 5704 return 0; 5705 } 5706 5707 int 5708 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5709 { 5710 int ret; 5711 5712 ret = bdev_nvme_validate_opts(opts); 5713 if (ret) { 5714 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5715 return ret; 5716 } 5717 5718 if (g_bdev_nvme_init_thread != NULL) { 5719 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5720 return -EPERM; 5721 } 5722 } 5723 5724 if (opts->rdma_srq_size != 0 || 5725 opts->rdma_max_cq_size != 0 || 5726 opts->rdma_cm_event_timeout_ms != 0) { 5727 struct spdk_nvme_transport_opts drv_opts; 5728 5729 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5730 if (opts->rdma_srq_size != 0) { 5731 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5732 } 5733 if (opts->rdma_max_cq_size != 0) { 5734 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5735 } 5736 if (opts->rdma_cm_event_timeout_ms != 0) { 5737 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5738 } 5739 5740 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5741 if (ret) { 5742 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5743 return ret; 5744 } 5745 } 5746 5747 g_opts = *opts; 5748 5749 return 0; 5750 } 5751 5752 struct set_nvme_hotplug_ctx { 5753 uint64_t period_us; 5754 bool enabled; 5755 spdk_msg_fn fn; 5756 void *fn_ctx; 5757 }; 5758 5759 static void 5760 set_nvme_hotplug_period_cb(void *_ctx) 5761 { 5762 struct set_nvme_hotplug_ctx *ctx = _ctx; 5763 5764 spdk_poller_unregister(&g_hotplug_poller); 5765 if (ctx->enabled) { 5766 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5767 } 5768 5769 g_nvme_hotplug_poll_period_us = ctx->period_us; 5770 
g_nvme_hotplug_enabled = ctx->enabled; 5771 if (ctx->fn) { 5772 ctx->fn(ctx->fn_ctx); 5773 } 5774 5775 free(ctx); 5776 } 5777 5778 int 5779 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5780 { 5781 struct set_nvme_hotplug_ctx *ctx; 5782 5783 if (enabled == true && !spdk_process_is_primary()) { 5784 return -EPERM; 5785 } 5786 5787 ctx = calloc(1, sizeof(*ctx)); 5788 if (ctx == NULL) { 5789 return -ENOMEM; 5790 } 5791 5792 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5793 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5794 ctx->enabled = enabled; 5795 ctx->fn = cb; 5796 ctx->fn_ctx = cb_ctx; 5797 5798 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5799 return 0; 5800 } 5801 5802 static void 5803 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5804 struct nvme_async_probe_ctx *ctx) 5805 { 5806 struct nvme_ns *nvme_ns; 5807 struct nvme_bdev *nvme_bdev; 5808 size_t j; 5809 5810 assert(nvme_ctrlr != NULL); 5811 5812 if (ctx->names == NULL) { 5813 ctx->reported_bdevs = 0; 5814 populate_namespaces_cb(ctx, 0); 5815 return; 5816 } 5817 5818 /* 5819 * Report the new bdevs that were created in this call. 5820 * There can be more than one bdev per NVMe controller. 5821 */ 5822 j = 0; 5823 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5824 while (nvme_ns != NULL) { 5825 nvme_bdev = nvme_ns->bdev; 5826 if (j < ctx->max_bdevs) { 5827 ctx->names[j] = nvme_bdev->disk.name; 5828 j++; 5829 } else { 5830 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5831 ctx->max_bdevs); 5832 ctx->reported_bdevs = 0; 5833 populate_namespaces_cb(ctx, -ERANGE); 5834 return; 5835 } 5836 5837 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5838 } 5839 5840 ctx->reported_bdevs = j; 5841 populate_namespaces_cb(ctx, 0); 5842 } 5843 5844 static int 5845 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5846 struct spdk_nvme_ctrlr *new_ctrlr, 5847 struct spdk_nvme_transport_id *trid) 5848 { 5849 struct nvme_path_id *tmp_trid; 5850 5851 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5852 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5853 return -ENOTSUP; 5854 } 5855 5856 /* Currently we only support failover to the same transport type. */ 5857 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5858 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5859 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5860 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5861 return -EINVAL; 5862 } 5863 5864 5865 /* Currently we only support failover to the same NQN. */ 5866 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5867 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5868 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5869 return -EINVAL; 5870 } 5871 5872 /* Skip all the other checks if we've already registered this path. 
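 * (Adding the same transport ID twice is therefore treated as an error,
 * -EEXIST, rather than being silently ignored.)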
 */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr,
				     trid->subnqn);
			return -EEXIST;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid so that it is not replaced until it has failed. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is regarded as failed if its last failed time is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This is the case where a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking that it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
5946 */ 5947 static int 5948 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5949 struct spdk_nvme_ctrlr *new_ctrlr, 5950 struct spdk_nvme_transport_id *trid) 5951 { 5952 int rc; 5953 5954 assert(nvme_ctrlr != NULL); 5955 5956 pthread_mutex_lock(&nvme_ctrlr->mutex); 5957 5958 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5959 if (rc != 0) { 5960 goto exit; 5961 } 5962 5963 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5964 if (rc != 0) { 5965 goto exit; 5966 } 5967 5968 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5969 5970 exit: 5971 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5972 5973 spdk_nvme_detach(new_ctrlr); 5974 5975 return rc; 5976 } 5977 5978 static void 5979 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5980 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5981 { 5982 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5983 struct nvme_async_probe_ctx *ctx; 5984 int rc; 5985 5986 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5987 ctx->ctrlr_attached = true; 5988 5989 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5990 if (rc != 0) { 5991 ctx->reported_bdevs = 0; 5992 populate_namespaces_cb(ctx, rc); 5993 } 5994 } 5995 5996 static void 5997 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5998 struct spdk_nvme_ctrlr *ctrlr, 5999 const struct spdk_nvme_ctrlr_opts *opts) 6000 { 6001 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6002 struct nvme_ctrlr *nvme_ctrlr; 6003 struct nvme_async_probe_ctx *ctx; 6004 int rc; 6005 6006 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6007 ctx->ctrlr_attached = true; 6008 6009 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6010 if (nvme_ctrlr) { 6011 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6012 } else { 6013 rc = -ENODEV; 6014 } 6015 6016 ctx->reported_bdevs = 0; 6017 populate_namespaces_cb(ctx, rc); 6018 } 6019 6020 static int 6021 bdev_nvme_async_poll(void *arg) 6022 { 6023 struct nvme_async_probe_ctx *ctx = arg; 6024 int rc; 6025 6026 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6027 if (spdk_unlikely(rc != -EAGAIN)) { 6028 ctx->probe_done = true; 6029 spdk_poller_unregister(&ctx->poller); 6030 if (!ctx->ctrlr_attached) { 6031 /* The probe is done, but no controller was attached. 6032 * That means we had a failure, so report -EIO back to 6033 * the caller (usually the RPC). populate_namespaces_cb() 6034 * will take care of freeing the nvme_async_probe_ctx. 6035 */ 6036 ctx->reported_bdevs = 0; 6037 populate_namespaces_cb(ctx, -EIO); 6038 } else if (ctx->namespaces_populated) { 6039 /* The namespaces for the attached controller were all 6040 * populated and the response was already sent to the 6041 * caller (usually the RPC). So free the context here. 
 */
			free_nvme_async_probe_ctx(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

static int
bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz)
{
	FILE *psk_file;
	struct stat statbuf;
	int rc;
#define TCP_PSK_INVALID_PERMISSIONS 0177

	if (stat(fname, &statbuf) != 0) {
		SPDK_ERRLOG("Could not read permissions for PSK file\n");
		return -EACCES;
	}

	if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) {
		SPDK_ERRLOG("Incorrect permissions for PSK file\n");
		return -EPERM;
	}
	if ((size_t)statbuf.st_size >= bufsz) {
		SPDK_ERRLOG("Invalid PSK: too long\n");
		return -EINVAL;
	}
	psk_file = fopen(fname, "r");
	if (psk_file == NULL) {
		SPDK_ERRLOG("Could not open PSK file\n");
		return -EINVAL;
	}

	memset(buf, 0, bufsz);
	rc = fread(buf, 1, statbuf.st_size, psk_file);
	if (rc != statbuf.st_size) {
		SPDK_ERRLOG("Failed to read PSK\n");
		fclose(psk_file);
		return -EINVAL;
	}

	fclose(psk_file);
	return 0;
}

int
bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 const char *base_name,
		 const char **names,
		 uint32_t count,
		 spdk_bdev_create_nvme_fn cb_fn,
		 void *cb_ctx,
		 struct spdk_nvme_ctrlr_opts *drv_opts,
		 struct nvme_ctrlr_opts *bdev_opts,
		 bool multipath)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	int rc, len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
6148 */ 6149 if (nvme_ctrlr_get(trid) != NULL) { 6150 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6151 return -EEXIST; 6152 } 6153 6154 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6155 6156 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6157 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6158 return -EINVAL; 6159 } 6160 6161 if (bdev_opts != NULL && 6162 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6163 bdev_opts->reconnect_delay_sec, 6164 bdev_opts->fast_io_fail_timeout_sec)) { 6165 return -EINVAL; 6166 } 6167 6168 ctx = calloc(1, sizeof(*ctx)); 6169 if (!ctx) { 6170 return -ENOMEM; 6171 } 6172 ctx->base_name = base_name; 6173 ctx->names = names; 6174 ctx->max_bdevs = count; 6175 ctx->cb_fn = cb_fn; 6176 ctx->cb_ctx = cb_ctx; 6177 ctx->trid = *trid; 6178 6179 if (bdev_opts) { 6180 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6181 } else { 6182 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6183 } 6184 6185 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6186 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6187 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6188 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6189 free(entry); 6190 break; 6191 } 6192 } 6193 } 6194 6195 if (drv_opts) { 6196 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6197 } else { 6198 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6199 } 6200 6201 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6202 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6203 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6204 ctx->drv_opts.disable_read_ana_log_page = true; 6205 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6206 6207 if (ctx->bdev_opts.psk[0] != '\0') { 6208 /* Try to use the keyring first */ 6209 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6210 if (ctx->drv_opts.tls_psk == NULL) { 6211 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6212 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6213 if (rc != 0) { 6214 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6215 free_nvme_async_probe_ctx(ctx); 6216 return rc; 6217 } 6218 } 6219 } 6220 6221 if (ctx->bdev_opts.dhchap_key != NULL) { 6222 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6223 if (ctx->drv_opts.dhchap_key == NULL) { 6224 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6225 ctx->bdev_opts.dhchap_key); 6226 free_nvme_async_probe_ctx(ctx); 6227 return -ENOKEY; 6228 } 6229 6230 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6231 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6232 } 6233 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6234 ctx->drv_opts.dhchap_ctrlr_key = 6235 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6236 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6237 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6238 ctx->bdev_opts.dhchap_ctrlr_key); 6239 free_nvme_async_probe_ctx(ctx); 6240 return -ENOKEY; 6241 } 6242 } 6243 6244 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6245 attach_cb = connect_attach_cb; 6246 } else { 6247 attach_cb = connect_set_failover_cb; 6248 } 6249 6250 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6251 if (ctx->probe_ctx == NULL) { 6252 SPDK_ERRLOG("No controller was found 
with provided trid (traddr: %s)\n", trid->traddr); 6253 free_nvme_async_probe_ctx(ctx); 6254 return -ENODEV; 6255 } 6256 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6257 6258 return 0; 6259 } 6260 6261 struct bdev_nvme_delete_ctx { 6262 char *name; 6263 struct nvme_path_id path_id; 6264 bdev_nvme_delete_done_fn delete_done; 6265 void *delete_done_ctx; 6266 uint64_t timeout_ticks; 6267 struct spdk_poller *poller; 6268 }; 6269 6270 static void 6271 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6272 { 6273 if (ctx != NULL) { 6274 free(ctx->name); 6275 free(ctx); 6276 } 6277 } 6278 6279 static bool 6280 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6281 { 6282 if (path_id->trid.trtype != 0) { 6283 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6284 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6285 return false; 6286 } 6287 } else { 6288 if (path_id->trid.trtype != p->trid.trtype) { 6289 return false; 6290 } 6291 } 6292 } 6293 6294 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6295 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6296 return false; 6297 } 6298 } 6299 6300 if (path_id->trid.adrfam != 0) { 6301 if (path_id->trid.adrfam != p->trid.adrfam) { 6302 return false; 6303 } 6304 } 6305 6306 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6307 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6308 return false; 6309 } 6310 } 6311 6312 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6313 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6314 return false; 6315 } 6316 } 6317 6318 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6319 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6320 return false; 6321 } 6322 } 6323 6324 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6325 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6326 return false; 6327 } 6328 } 6329 6330 return true; 6331 } 6332 6333 static bool 6334 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6335 { 6336 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6337 struct nvme_ctrlr *ctrlr; 6338 struct nvme_path_id *p; 6339 6340 pthread_mutex_lock(&g_bdev_nvme_mutex); 6341 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6342 if (!nbdev_ctrlr) { 6343 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6344 return false; 6345 } 6346 6347 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6348 pthread_mutex_lock(&ctrlr->mutex); 6349 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6350 if (nvme_path_id_compare(p, path_id)) { 6351 pthread_mutex_unlock(&ctrlr->mutex); 6352 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6353 return true; 6354 } 6355 } 6356 pthread_mutex_unlock(&ctrlr->mutex); 6357 } 6358 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6359 6360 return false; 6361 } 6362 6363 static int 6364 bdev_nvme_delete_complete_poll(void *arg) 6365 { 6366 struct bdev_nvme_delete_ctx *ctx = arg; 6367 int rc = 0; 6368 6369 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6370 if (ctx->timeout_ticks > spdk_get_ticks()) { 6371 return SPDK_POLLER_BUSY; 6372 } 6373 6374 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6375 rc = -ETIMEDOUT; 6376 } 6377 6378 spdk_poller_unregister(&ctx->poller); 6379 6380 ctx->delete_done(ctx->delete_done_ctx, rc); 6381 free_bdev_nvme_delete_ctx(ctx); 6382 6383 
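	/* Either the delete was confirmed (no matching path remains) or the wait
	 * timed out; in both cases the completion callback has now been invoked
	 * exactly once and the context has been released.
	 */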
return SPDK_POLLER_BUSY; 6384 } 6385 6386 static int 6387 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6388 { 6389 struct nvme_path_id *p, *t; 6390 spdk_msg_fn msg_fn; 6391 int rc = -ENXIO; 6392 6393 pthread_mutex_lock(&nvme_ctrlr->mutex); 6394 6395 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6396 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6397 break; 6398 } 6399 6400 if (!nvme_path_id_compare(p, path_id)) { 6401 continue; 6402 } 6403 6404 /* We are not using the specified path. */ 6405 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6406 free(p); 6407 rc = 0; 6408 } 6409 6410 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6411 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6412 return rc; 6413 } 6414 6415 /* If we made it here, then this path is a match! Now we need to remove it. */ 6416 6417 /* This is the active path in use right now. The active path is always the first in the list. */ 6418 assert(p == nvme_ctrlr->active_path_id); 6419 6420 if (!TAILQ_NEXT(p, link)) { 6421 /* The current path is the only path. */ 6422 msg_fn = _nvme_ctrlr_destruct; 6423 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6424 } else { 6425 /* There is an alternative path. */ 6426 msg_fn = _bdev_nvme_reset_ctrlr; 6427 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6428 } 6429 6430 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6431 6432 if (rc == 0) { 6433 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6434 } else if (rc == -EALREADY) { 6435 rc = 0; 6436 } 6437 6438 return rc; 6439 } 6440 6441 int 6442 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6443 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6444 { 6445 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6446 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6447 struct bdev_nvme_delete_ctx *ctx = NULL; 6448 int rc = -ENXIO, _rc; 6449 6450 if (name == NULL || path_id == NULL) { 6451 rc = -EINVAL; 6452 goto exit; 6453 } 6454 6455 pthread_mutex_lock(&g_bdev_nvme_mutex); 6456 6457 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6458 if (nbdev_ctrlr == NULL) { 6459 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6460 6461 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6462 rc = -ENODEV; 6463 goto exit; 6464 } 6465 6466 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6467 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6468 if (_rc < 0 && _rc != -ENXIO) { 6469 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6470 rc = _rc; 6471 goto exit; 6472 } else if (_rc == 0) { 6473 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6474 * was deleted successfully. To remember the successful deletion, 6475 * overwrite rc only if _rc is zero. 
6476 */ 6477 rc = 0; 6478 } 6479 } 6480 6481 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6482 6483 if (rc != 0 || delete_done == NULL) { 6484 goto exit; 6485 } 6486 6487 ctx = calloc(1, sizeof(*ctx)); 6488 if (ctx == NULL) { 6489 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6490 rc = -ENOMEM; 6491 goto exit; 6492 } 6493 6494 ctx->name = strdup(name); 6495 if (ctx->name == NULL) { 6496 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6497 rc = -ENOMEM; 6498 goto exit; 6499 } 6500 6501 ctx->delete_done = delete_done; 6502 ctx->delete_done_ctx = delete_done_ctx; 6503 ctx->path_id = *path_id; 6504 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6505 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6506 if (ctx->poller == NULL) { 6507 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6508 rc = -ENOMEM; 6509 goto exit; 6510 } 6511 6512 exit: 6513 if (rc != 0) { 6514 free_bdev_nvme_delete_ctx(ctx); 6515 } 6516 6517 return rc; 6518 } 6519 6520 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6521 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6522 6523 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6524 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6525 6526 struct discovery_entry_ctx { 6527 char name[128]; 6528 struct spdk_nvme_transport_id trid; 6529 struct spdk_nvme_ctrlr_opts drv_opts; 6530 struct spdk_nvmf_discovery_log_page_entry entry; 6531 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6532 struct discovery_ctx *ctx; 6533 }; 6534 6535 struct discovery_ctx { 6536 char *name; 6537 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6538 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6539 void *cb_ctx; 6540 struct spdk_nvme_probe_ctx *probe_ctx; 6541 struct spdk_nvme_detach_ctx *detach_ctx; 6542 struct spdk_nvme_ctrlr *ctrlr; 6543 struct spdk_nvme_transport_id trid; 6544 struct discovery_entry_ctx *entry_ctx_in_use; 6545 struct spdk_poller *poller; 6546 struct spdk_nvme_ctrlr_opts drv_opts; 6547 struct nvme_ctrlr_opts bdev_opts; 6548 struct spdk_nvmf_discovery_log_page *log_page; 6549 TAILQ_ENTRY(discovery_ctx) tailq; 6550 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6551 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6552 int rc; 6553 bool wait_for_attach; 6554 uint64_t timeout_ticks; 6555 /* Denotes that the discovery service is being started. We're waiting 6556 * for the initial connection to the discovery controller to be 6557 * established and attach discovered NVM ctrlrs. 6558 */ 6559 bool initializing; 6560 /* Denotes if a discovery is currently in progress for this context. 6561 * That includes connecting to newly discovered subsystems. Used to 6562 * ensure we do not start a new discovery until an existing one is 6563 * complete. 6564 */ 6565 bool in_progress; 6566 6567 /* Denotes if another discovery is needed after the one in progress 6568 * completes. Set when we receive an AER completion while a discovery 6569 * is already in progress. 6570 */ 6571 bool pending; 6572 6573 /* Signal to the discovery context poller that it should stop the 6574 * discovery service, including detaching from the current discovery 6575 * controller. 6576 */ 6577 bool stop; 6578 6579 struct spdk_thread *calling_thread; 6580 uint32_t index; 6581 uint32_t attach_in_progress; 6582 char *hostnqn; 6583 6584 /* Denotes if the discovery service was started by the mdns discovery. 
6585 */ 6586 bool from_mdns_discovery_service; 6587 }; 6588 6589 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6590 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6591 6592 static void get_discovery_log_page(struct discovery_ctx *ctx); 6593 6594 static void 6595 free_discovery_ctx(struct discovery_ctx *ctx) 6596 { 6597 free(ctx->log_page); 6598 free(ctx->hostnqn); 6599 free(ctx->name); 6600 free(ctx); 6601 } 6602 6603 static void 6604 discovery_complete(struct discovery_ctx *ctx) 6605 { 6606 ctx->initializing = false; 6607 ctx->in_progress = false; 6608 if (ctx->pending) { 6609 ctx->pending = false; 6610 get_discovery_log_page(ctx); 6611 } 6612 } 6613 6614 static void 6615 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6616 struct spdk_nvmf_discovery_log_page_entry *entry) 6617 { 6618 char *space; 6619 6620 trid->trtype = entry->trtype; 6621 trid->adrfam = entry->adrfam; 6622 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6623 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6624 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6625 * before call to this function trid->subnqn is zeroed out, we need 6626 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6627 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6628 */ 6629 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6630 6631 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6632 * But the log page entries typically pad them with spaces, not zeroes. 6633 * So add a NULL terminator to each of these fields at the appropriate 6634 * location. 6635 */ 6636 space = strchr(trid->traddr, ' '); 6637 if (space) { 6638 *space = 0; 6639 } 6640 space = strchr(trid->trsvcid, ' '); 6641 if (space) { 6642 *space = 0; 6643 } 6644 space = strchr(trid->subnqn, ' '); 6645 if (space) { 6646 *space = 0; 6647 } 6648 } 6649 6650 static void 6651 _stop_discovery(void *_ctx) 6652 { 6653 struct discovery_ctx *ctx = _ctx; 6654 6655 if (ctx->attach_in_progress > 0) { 6656 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6657 return; 6658 } 6659 6660 ctx->stop = true; 6661 6662 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6663 struct discovery_entry_ctx *entry_ctx; 6664 struct nvme_path_id path = {}; 6665 6666 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6667 path.trid = entry_ctx->trid; 6668 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6669 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6670 free(entry_ctx); 6671 } 6672 6673 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6674 struct discovery_entry_ctx *entry_ctx; 6675 6676 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6677 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6678 free(entry_ctx); 6679 } 6680 6681 free(ctx->entry_ctx_in_use); 6682 ctx->entry_ctx_in_use = NULL; 6683 } 6684 6685 static void 6686 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6687 { 6688 ctx->stop_cb_fn = cb_fn; 6689 ctx->cb_ctx = cb_ctx; 6690 6691 if (ctx->attach_in_progress > 0) { 6692 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6693 ctx->attach_in_progress); 6694 } 6695 6696 _stop_discovery(ctx); 6697 } 6698 6699 static void 6700 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6701 { 6702 struct discovery_ctx *d_ctx; 6703 struct nvme_path_id *path_id; 6704 struct spdk_nvme_transport_id 
trid = {}; 6705 struct discovery_entry_ctx *entry_ctx, *tmp; 6706 6707 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6708 6709 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6710 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6711 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6712 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6713 continue; 6714 } 6715 6716 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6717 free(entry_ctx); 6718 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6719 trid.subnqn, trid.traddr, trid.trsvcid); 6720 6721 /* Fail discovery ctrlr to force reattach attempt */ 6722 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6723 } 6724 } 6725 } 6726 6727 static void 6728 discovery_remove_controllers(struct discovery_ctx *ctx) 6729 { 6730 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6731 struct discovery_entry_ctx *entry_ctx, *tmp; 6732 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6733 struct spdk_nvme_transport_id old_trid = {}; 6734 uint64_t numrec, i; 6735 bool found; 6736 6737 numrec = from_le64(&log_page->numrec); 6738 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6739 found = false; 6740 old_entry = &entry_ctx->entry; 6741 build_trid_from_log_page_entry(&old_trid, old_entry); 6742 for (i = 0; i < numrec; i++) { 6743 new_entry = &log_page->entries[i]; 6744 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6745 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6746 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6747 found = true; 6748 break; 6749 } 6750 } 6751 if (!found) { 6752 struct nvme_path_id path = {}; 6753 6754 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6755 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6756 6757 path.trid = entry_ctx->trid; 6758 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6759 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6760 free(entry_ctx); 6761 } 6762 } 6763 free(log_page); 6764 ctx->log_page = NULL; 6765 discovery_complete(ctx); 6766 } 6767 6768 static void 6769 complete_discovery_start(struct discovery_ctx *ctx, int status) 6770 { 6771 ctx->timeout_ticks = 0; 6772 ctx->rc = status; 6773 if (ctx->start_cb_fn) { 6774 ctx->start_cb_fn(ctx->cb_ctx, status); 6775 ctx->start_cb_fn = NULL; 6776 ctx->cb_ctx = NULL; 6777 } 6778 } 6779 6780 static void 6781 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6782 { 6783 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6784 struct discovery_ctx *ctx = entry_ctx->ctx; 6785 6786 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6787 ctx->attach_in_progress--; 6788 if (ctx->attach_in_progress == 0) { 6789 complete_discovery_start(ctx, ctx->rc); 6790 if (ctx->initializing && ctx->rc != 0) { 6791 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6792 stop_discovery(ctx, NULL, ctx->cb_ctx); 6793 } else { 6794 discovery_remove_controllers(ctx); 6795 } 6796 } 6797 } 6798 6799 static struct discovery_entry_ctx * 6800 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6801 { 6802 struct discovery_entry_ctx *new_ctx; 6803 6804 new_ctx = calloc(1, sizeof(*new_ctx)); 6805 if (new_ctx == NULL) { 6806 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6807 return NULL; 6808 } 6809 6810 new_ctx->ctx = ctx; 6811 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6812 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
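	/* Start from the default controller options and override only the hostnqn,
	 * so every subsystem discovered through this context connects with the same
	 * host identity as the discovery connection itself.
	 */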
6813 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6814 return new_ctx; 6815 } 6816 6817 static void 6818 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6819 struct spdk_nvmf_discovery_log_page *log_page) 6820 { 6821 struct discovery_ctx *ctx = cb_arg; 6822 struct discovery_entry_ctx *entry_ctx, *tmp; 6823 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6824 uint64_t numrec, i; 6825 bool found; 6826 6827 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6828 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6829 return; 6830 } 6831 6832 ctx->log_page = log_page; 6833 assert(ctx->attach_in_progress == 0); 6834 numrec = from_le64(&log_page->numrec); 6835 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6836 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6837 free(entry_ctx); 6838 } 6839 for (i = 0; i < numrec; i++) { 6840 found = false; 6841 new_entry = &log_page->entries[i]; 6842 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6843 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6844 struct discovery_entry_ctx *new_ctx; 6845 struct spdk_nvme_transport_id trid = {}; 6846 6847 build_trid_from_log_page_entry(&trid, new_entry); 6848 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6849 if (new_ctx == NULL) { 6850 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6851 break; 6852 } 6853 6854 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6855 continue; 6856 } 6857 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6858 old_entry = &entry_ctx->entry; 6859 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6860 found = true; 6861 break; 6862 } 6863 } 6864 if (!found) { 6865 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6866 struct discovery_ctx *d_ctx; 6867 6868 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6869 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6870 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6871 sizeof(new_entry->subnqn))) { 6872 break; 6873 } 6874 } 6875 if (subnqn_ctx) { 6876 break; 6877 } 6878 } 6879 6880 new_ctx = calloc(1, sizeof(*new_ctx)); 6881 if (new_ctx == NULL) { 6882 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6883 break; 6884 } 6885 6886 new_ctx->ctx = ctx; 6887 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6888 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6889 if (subnqn_ctx) { 6890 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6891 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6892 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6893 new_ctx->name); 6894 } else { 6895 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6896 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6897 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6898 new_ctx->name); 6899 } 6900 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6901 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6902 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6903 discovery_attach_controller_done, new_ctx, 6904 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6905 if (rc == 0) { 6906 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6907 ctx->attach_in_progress++; 6908 } else { 6909 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6910 } 6911 } 6912 } 6913 6914 if (ctx->attach_in_progress == 0) { 6915 discovery_remove_controllers(ctx); 6916 } 6917 } 6918 6919 static void 6920 get_discovery_log_page(struct discovery_ctx *ctx) 6921 { 6922 int rc; 6923 6924 assert(ctx->in_progress == false); 6925 ctx->in_progress = true; 6926 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6927 if (rc != 0) { 6928 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6929 } 6930 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6931 } 6932 6933 static void 6934 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6935 { 6936 struct discovery_ctx *ctx = arg; 6937 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6938 6939 if (spdk_nvme_cpl_is_error(cpl)) { 6940 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6941 return; 6942 } 6943 6944 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6945 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6946 return; 6947 } 6948 6949 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6950 if (ctx->in_progress) { 6951 ctx->pending = true; 6952 return; 6953 } 6954 6955 get_discovery_log_page(ctx); 6956 } 6957 6958 static void 6959 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6960 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6961 { 6962 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6963 struct discovery_ctx *ctx; 6964 6965 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6966 6967 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6968 ctx->probe_ctx = NULL; 6969 ctx->ctrlr = ctrlr; 6970 6971 if (ctx->rc != 0) { 6972 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6973 ctx->rc); 6974 return; 6975 } 6976 6977 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6978 } 6979 6980 static int 6981 discovery_poller(void *arg) 6982 { 6983 struct discovery_ctx *ctx = arg; 6984 struct spdk_nvme_transport_id *trid; 6985 int rc; 6986 6987 if (ctx->detach_ctx) { 6988 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6989 if (rc != -EAGAIN) { 6990 ctx->detach_ctx = NULL; 6991 ctx->ctrlr = NULL; 6992 } 6993 } else if (ctx->stop) { 6994 if (ctx->ctrlr != NULL) { 6995 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6996 if (rc == 0) { 6997 return SPDK_POLLER_BUSY; 6998 } 6999 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7000 } 7001 spdk_poller_unregister(&ctx->poller); 7002 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7003 assert(ctx->start_cb_fn == NULL); 7004 if (ctx->stop_cb_fn != NULL) { 7005 ctx->stop_cb_fn(ctx->cb_ctx); 7006 } 7007 free_discovery_ctx(ctx); 7008 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7009 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7010 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7011 assert(ctx->initializing); 7012 spdk_poller_unregister(&ctx->poller); 7013 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7014 complete_discovery_start(ctx, -ETIMEDOUT); 7015 stop_discovery(ctx, NULL, NULL); 7016 free_discovery_ctx(ctx); 7017 return SPDK_POLLER_BUSY; 7018 } 7019 7020 assert(ctx->entry_ctx_in_use == NULL); 7021 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7022 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7023 trid = &ctx->entry_ctx_in_use->trid; 7024 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7025 if 
(ctx->probe_ctx) { 7026 spdk_poller_unregister(&ctx->poller); 7027 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7028 } else { 7029 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7030 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7031 ctx->entry_ctx_in_use = NULL; 7032 } 7033 } else if (ctx->probe_ctx) { 7034 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7035 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7036 complete_discovery_start(ctx, -ETIMEDOUT); 7037 return SPDK_POLLER_BUSY; 7038 } 7039 7040 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7041 if (rc != -EAGAIN) { 7042 if (ctx->rc != 0) { 7043 assert(ctx->initializing); 7044 stop_discovery(ctx, NULL, ctx->cb_ctx); 7045 } else { 7046 assert(rc == 0); 7047 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7048 ctx->rc = rc; 7049 get_discovery_log_page(ctx); 7050 } 7051 } 7052 } else { 7053 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7054 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7055 complete_discovery_start(ctx, -ETIMEDOUT); 7056 /* We need to wait until all NVM ctrlrs are attached before we stop the 7057 * discovery service to make sure we don't detach a ctrlr that is still 7058 * being attached. 7059 */ 7060 if (ctx->attach_in_progress == 0) { 7061 stop_discovery(ctx, NULL, ctx->cb_ctx); 7062 return SPDK_POLLER_BUSY; 7063 } 7064 } 7065 7066 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7067 if (rc < 0) { 7068 spdk_poller_unregister(&ctx->poller); 7069 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7070 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7071 ctx->entry_ctx_in_use = NULL; 7072 7073 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7074 if (rc != 0) { 7075 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7076 ctx->ctrlr = NULL; 7077 } 7078 } 7079 } 7080 7081 return SPDK_POLLER_BUSY; 7082 } 7083 7084 static void 7085 start_discovery_poller(void *arg) 7086 { 7087 struct discovery_ctx *ctx = arg; 7088 7089 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7090 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7091 } 7092 7093 int 7094 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7095 const char *base_name, 7096 struct spdk_nvme_ctrlr_opts *drv_opts, 7097 struct nvme_ctrlr_opts *bdev_opts, 7098 uint64_t attach_timeout, 7099 bool from_mdns, 7100 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7101 { 7102 struct discovery_ctx *ctx; 7103 struct discovery_entry_ctx *discovery_entry_ctx; 7104 7105 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7106 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7107 if (strcmp(ctx->name, base_name) == 0) { 7108 return -EEXIST; 7109 } 7110 7111 if (ctx->entry_ctx_in_use != NULL) { 7112 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7113 return -EEXIST; 7114 } 7115 } 7116 7117 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7118 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7119 return -EEXIST; 7120 } 7121 } 7122 } 7123 7124 ctx = calloc(1, sizeof(*ctx)); 7125 if (ctx == NULL) { 7126 return -ENOMEM; 7127 } 7128 7129 ctx->name = strdup(base_name); 7130 if (ctx->name == NULL) { 7131 free_discovery_ctx(ctx); 7132 return -ENOMEM; 7133 } 7134 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
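	/* Keep private copies of both option structures; the caller's buffers
	 * (typically stack-allocated in an RPC handler) may no longer exist by the
	 * time the asynchronous discovery poller uses them.
	 */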
7135 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7136 ctx->from_mdns_discovery_service = from_mdns; 7137 ctx->bdev_opts.from_discovery_service = true; 7138 ctx->calling_thread = spdk_get_thread(); 7139 ctx->start_cb_fn = cb_fn; 7140 ctx->cb_ctx = cb_ctx; 7141 ctx->initializing = true; 7142 if (ctx->start_cb_fn) { 7143 /* We can use this when dumping json to denote if this RPC parameter 7144 * was specified or not. 7145 */ 7146 ctx->wait_for_attach = true; 7147 } 7148 if (attach_timeout != 0) { 7149 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7150 spdk_get_ticks_hz() / 1000ull; 7151 } 7152 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7153 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7154 memcpy(&ctx->trid, trid, sizeof(*trid)); 7155 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7156 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7157 if (ctx->hostnqn == NULL) { 7158 free_discovery_ctx(ctx); 7159 return -ENOMEM; 7160 } 7161 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7162 if (discovery_entry_ctx == NULL) { 7163 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7164 free_discovery_ctx(ctx); 7165 return -ENOMEM; 7166 } 7167 7168 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7169 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7170 return 0; 7171 } 7172 7173 int 7174 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7175 { 7176 struct discovery_ctx *ctx; 7177 7178 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7179 if (strcmp(name, ctx->name) == 0) { 7180 if (ctx->stop) { 7181 return -EALREADY; 7182 } 7183 /* If we're still starting the discovery service and ->rc is non-zero, we're 7184 * going to stop it as soon as we can 7185 */ 7186 if (ctx->initializing && ctx->rc != 0) { 7187 return -EALREADY; 7188 } 7189 stop_discovery(ctx, cb_fn, cb_ctx); 7190 return 0; 7191 } 7192 } 7193 7194 return -ENOENT; 7195 } 7196 7197 static int 7198 bdev_nvme_library_init(void) 7199 { 7200 g_bdev_nvme_init_thread = spdk_get_thread(); 7201 7202 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7203 bdev_nvme_destroy_poll_group_cb, 7204 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7205 7206 return 0; 7207 } 7208 7209 static void 7210 bdev_nvme_fini_destruct_ctrlrs(void) 7211 { 7212 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7213 struct nvme_ctrlr *nvme_ctrlr; 7214 7215 pthread_mutex_lock(&g_bdev_nvme_mutex); 7216 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7217 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7218 pthread_mutex_lock(&nvme_ctrlr->mutex); 7219 if (nvme_ctrlr->destruct) { 7220 /* This controller's destruction was already started 7221 * before the application started shutting down 7222 */ 7223 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7224 continue; 7225 } 7226 nvme_ctrlr->destruct = true; 7227 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7228 7229 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7230 nvme_ctrlr); 7231 } 7232 } 7233 7234 g_bdev_nvme_module_finish = true; 7235 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7236 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7237 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7238 spdk_bdev_module_fini_done(); 7239 return; 7240 } 7241 7242 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7243 } 7244 7245 static void 7246 check_discovery_fini(void *arg) 7247 { 7248 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7249 bdev_nvme_fini_destruct_ctrlrs(); 
7250 } 7251 } 7252 7253 static void 7254 bdev_nvme_library_fini(void) 7255 { 7256 struct nvme_probe_skip_entry *entry, *entry_tmp; 7257 struct discovery_ctx *ctx; 7258 7259 spdk_poller_unregister(&g_hotplug_poller); 7260 free(g_hotplug_probe_ctx); 7261 g_hotplug_probe_ctx = NULL; 7262 7263 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7264 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7265 free(entry); 7266 } 7267 7268 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7269 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7270 bdev_nvme_fini_destruct_ctrlrs(); 7271 } else { 7272 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7273 stop_discovery(ctx, check_discovery_fini, NULL); 7274 } 7275 } 7276 } 7277 7278 static void 7279 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7280 { 7281 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7282 struct spdk_bdev *bdev = bdev_io->bdev; 7283 struct spdk_dif_ctx dif_ctx; 7284 struct spdk_dif_error err_blk = {}; 7285 int rc; 7286 struct spdk_dif_ctx_init_ext_opts dif_opts; 7287 7288 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7289 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7290 rc = spdk_dif_ctx_init(&dif_ctx, 7291 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7292 bdev->dif_is_head_of_md, bdev->dif_type, 7293 bdev_io->u.bdev.dif_check_flags, 7294 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7295 if (rc != 0) { 7296 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7297 return; 7298 } 7299 7300 if (bdev->md_interleave) { 7301 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7302 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7303 } else { 7304 struct iovec md_iov = { 7305 .iov_base = bdev_io->u.bdev.md_buf, 7306 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7307 }; 7308 7309 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7310 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7311 } 7312 7313 if (rc != 0) { 7314 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7315 err_blk.err_type, err_blk.err_offset); 7316 } else { 7317 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7318 } 7319 } 7320 7321 static void 7322 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7323 { 7324 struct nvme_bdev_io *bio = ref; 7325 7326 if (spdk_nvme_cpl_is_success(cpl)) { 7327 /* Run PI verification for read data buffer. */ 7328 bdev_nvme_verify_pi_error(bio); 7329 } 7330 7331 /* Return original completion status */ 7332 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7333 } 7334 7335 static void 7336 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7337 { 7338 struct nvme_bdev_io *bio = ref; 7339 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7340 int ret; 7341 7342 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7343 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7344 cpl->status.sct, cpl->status.sc); 7345 7346 /* Save completion status to use after verifying PI error. */ 7347 bio->cpl = *cpl; 7348 7349 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7350 /* Read without PI checking to verify PI error. 
*/ 7351 ret = bdev_nvme_no_pi_readv(bio, 7352 bdev_io->u.bdev.iovs, 7353 bdev_io->u.bdev.iovcnt, 7354 bdev_io->u.bdev.md_buf, 7355 bdev_io->u.bdev.num_blocks, 7356 bdev_io->u.bdev.offset_blocks); 7357 if (ret == 0) { 7358 return; 7359 } 7360 } 7361 } 7362 7363 bdev_nvme_io_complete_nvme_status(bio, cpl); 7364 } 7365 7366 static void 7367 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7368 { 7369 struct nvme_bdev_io *bio = ref; 7370 7371 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7372 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7373 cpl->status.sct, cpl->status.sc); 7374 /* Run PI verification for write data buffer if PI error is detected. */ 7375 bdev_nvme_verify_pi_error(bio); 7376 } 7377 7378 bdev_nvme_io_complete_nvme_status(bio, cpl); 7379 } 7380 7381 static void 7382 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7383 { 7384 struct nvme_bdev_io *bio = ref; 7385 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7386 7387 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7388 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7389 */ 7390 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7391 7392 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7393 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7394 cpl->status.sct, cpl->status.sc); 7395 /* Run PI verification for zone append data buffer if PI error is detected. */ 7396 bdev_nvme_verify_pi_error(bio); 7397 } 7398 7399 bdev_nvme_io_complete_nvme_status(bio, cpl); 7400 } 7401 7402 static void 7403 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7404 { 7405 struct nvme_bdev_io *bio = ref; 7406 7407 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7408 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7409 cpl->status.sct, cpl->status.sc); 7410 /* Run PI verification for compare data buffer if PI error is detected. */ 7411 bdev_nvme_verify_pi_error(bio); 7412 } 7413 7414 bdev_nvme_io_complete_nvme_status(bio, cpl); 7415 } 7416 7417 static void 7418 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7419 { 7420 struct nvme_bdev_io *bio = ref; 7421 7422 /* Compare operation completion */ 7423 if (!bio->first_fused_completed) { 7424 /* Save compare result for write callback */ 7425 bio->cpl = *cpl; 7426 bio->first_fused_completed = true; 7427 return; 7428 } 7429 7430 /* Write operation completion */ 7431 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7432 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7433 * complete the IO with the compare operation's status. 
7434 */ 7435 if (!spdk_nvme_cpl_is_error(cpl)) { 7436 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7437 } 7438 7439 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7440 } else { 7441 bdev_nvme_io_complete_nvme_status(bio, cpl); 7442 } 7443 } 7444 7445 static void 7446 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7447 { 7448 struct nvme_bdev_io *bio = ref; 7449 7450 bdev_nvme_io_complete_nvme_status(bio, cpl); 7451 } 7452 7453 static int 7454 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7455 { 7456 switch (desc->zt) { 7457 case SPDK_NVME_ZONE_TYPE_SEQWR: 7458 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7459 break; 7460 default: 7461 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7462 return -EIO; 7463 } 7464 7465 switch (desc->zs) { 7466 case SPDK_NVME_ZONE_STATE_EMPTY: 7467 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7468 break; 7469 case SPDK_NVME_ZONE_STATE_IOPEN: 7470 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7471 break; 7472 case SPDK_NVME_ZONE_STATE_EOPEN: 7473 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7474 break; 7475 case SPDK_NVME_ZONE_STATE_CLOSED: 7476 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7477 break; 7478 case SPDK_NVME_ZONE_STATE_RONLY: 7479 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7480 break; 7481 case SPDK_NVME_ZONE_STATE_FULL: 7482 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7483 break; 7484 case SPDK_NVME_ZONE_STATE_OFFLINE: 7485 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7486 break; 7487 default: 7488 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7489 return -EIO; 7490 } 7491 7492 info->zone_id = desc->zslba; 7493 info->write_pointer = desc->wp; 7494 info->capacity = desc->zcap; 7495 7496 return 0; 7497 } 7498 7499 static void 7500 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7501 { 7502 struct nvme_bdev_io *bio = ref; 7503 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7504 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7505 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7506 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7507 uint64_t max_zones_per_buf, i; 7508 uint32_t zone_report_bufsize; 7509 struct spdk_nvme_ns *ns; 7510 struct spdk_nvme_qpair *qpair; 7511 int ret; 7512 7513 if (spdk_nvme_cpl_is_error(cpl)) { 7514 goto out_complete_io_nvme_cpl; 7515 } 7516 7517 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7518 ret = -ENXIO; 7519 goto out_complete_io_ret; 7520 } 7521 7522 ns = bio->io_path->nvme_ns->ns; 7523 qpair = bio->io_path->qpair->qpair; 7524 7525 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7526 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7527 sizeof(bio->zone_report_buf->descs[0]); 7528 7529 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7530 ret = -EINVAL; 7531 goto out_complete_io_ret; 7532 } 7533 7534 if (!bio->zone_report_buf->nr_zones) { 7535 ret = -EINVAL; 7536 goto out_complete_io_ret; 7537 } 7538 7539 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7540 ret = fill_zone_from_report(&info[bio->handled_zones], 7541 &bio->zone_report_buf->descs[i]); 7542 if (ret) { 7543 goto out_complete_io_ret; 7544 } 7545 bio->handled_zones++; 7546 } 7547 7548 if (bio->handled_zones < zones_to_copy) { 7549 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7550 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7551 
7552 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7553 ret = spdk_nvme_zns_report_zones(ns, qpair, 7554 bio->zone_report_buf, zone_report_bufsize, 7555 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7556 bdev_nvme_get_zone_info_done, bio); 7557 if (!ret) { 7558 return; 7559 } else { 7560 goto out_complete_io_ret; 7561 } 7562 } 7563 7564 out_complete_io_nvme_cpl: 7565 free(bio->zone_report_buf); 7566 bio->zone_report_buf = NULL; 7567 bdev_nvme_io_complete_nvme_status(bio, cpl); 7568 return; 7569 7570 out_complete_io_ret: 7571 free(bio->zone_report_buf); 7572 bio->zone_report_buf = NULL; 7573 bdev_nvme_io_complete(bio, ret); 7574 } 7575 7576 static void 7577 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7578 { 7579 struct nvme_bdev_io *bio = ref; 7580 7581 bdev_nvme_io_complete_nvme_status(bio, cpl); 7582 } 7583 7584 static void 7585 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7586 { 7587 struct nvme_bdev_io *bio = ctx; 7588 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7589 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7590 7591 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7592 7593 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7594 } 7595 7596 static void 7597 bdev_nvme_abort_complete(void *ctx) 7598 { 7599 struct nvme_bdev_io *bio = ctx; 7600 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7601 7602 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7603 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7604 } else { 7605 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7606 } 7607 } 7608 7609 static void 7610 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7611 { 7612 struct nvme_bdev_io *bio = ref; 7613 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7614 7615 bio->cpl = *cpl; 7616 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7617 } 7618 7619 static void 7620 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7621 { 7622 struct nvme_bdev_io *bio = ref; 7623 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7624 7625 bio->cpl = *cpl; 7626 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7627 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7628 } 7629 7630 static void 7631 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7632 { 7633 struct nvme_bdev_io *bio = ref; 7634 struct iovec *iov; 7635 7636 bio->iov_offset = sgl_offset; 7637 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7638 iov = &bio->iovs[bio->iovpos]; 7639 if (bio->iov_offset < iov->iov_len) { 7640 break; 7641 } 7642 7643 bio->iov_offset -= iov->iov_len; 7644 } 7645 } 7646 7647 static int 7648 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7649 { 7650 struct nvme_bdev_io *bio = ref; 7651 struct iovec *iov; 7652 7653 assert(bio->iovpos < bio->iovcnt); 7654 7655 iov = &bio->iovs[bio->iovpos]; 7656 7657 *address = iov->iov_base; 7658 *length = iov->iov_len; 7659 7660 if (bio->iov_offset) { 7661 assert(bio->iov_offset <= iov->iov_len); 7662 *address += bio->iov_offset; 7663 *length -= bio->iov_offset; 7664 } 7665 7666 bio->iov_offset += *length; 7667 if (bio->iov_offset == iov->iov_len) { 7668 bio->iovpos++; 7669 bio->iov_offset = 0; 7670 } 7671 7672 return 0; 7673 } 7674 7675 static void 7676 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7677 { 7678 struct nvme_bdev_io *bio = ref; 7679 struct iovec *iov; 7680 7681 bio->fused_iov_offset = sgl_offset; 
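	/* Walk the fused (write) iovec array until the iovec containing sgl_offset
	 * is found; the remaining offset within that iovec is kept in
	 * fused_iov_offset so the next_sge callback resumes from there.
	 */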
7682 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7683 iov = &bio->fused_iovs[bio->fused_iovpos]; 7684 if (bio->fused_iov_offset < iov->iov_len) { 7685 break; 7686 } 7687 7688 bio->fused_iov_offset -= iov->iov_len; 7689 } 7690 } 7691 7692 static int 7693 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7694 { 7695 struct nvme_bdev_io *bio = ref; 7696 struct iovec *iov; 7697 7698 assert(bio->fused_iovpos < bio->fused_iovcnt); 7699 7700 iov = &bio->fused_iovs[bio->fused_iovpos]; 7701 7702 *address = iov->iov_base; 7703 *length = iov->iov_len; 7704 7705 if (bio->fused_iov_offset) { 7706 assert(bio->fused_iov_offset <= iov->iov_len); 7707 *address += bio->fused_iov_offset; 7708 *length -= bio->fused_iov_offset; 7709 } 7710 7711 bio->fused_iov_offset += *length; 7712 if (bio->fused_iov_offset == iov->iov_len) { 7713 bio->fused_iovpos++; 7714 bio->fused_iov_offset = 0; 7715 } 7716 7717 return 0; 7718 } 7719 7720 static int 7721 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7722 void *md, uint64_t lba_count, uint64_t lba) 7723 { 7724 int rc; 7725 7726 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7727 lba_count, lba); 7728 7729 bio->iovs = iov; 7730 bio->iovcnt = iovcnt; 7731 bio->iovpos = 0; 7732 bio->iov_offset = 0; 7733 7734 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7735 bio->io_path->qpair->qpair, 7736 lba, lba_count, 7737 bdev_nvme_no_pi_readv_done, bio, 0, 7738 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7739 md, 0, 0); 7740 7741 if (rc != 0 && rc != -ENOMEM) { 7742 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7743 } 7744 return rc; 7745 } 7746 7747 static int 7748 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7749 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7750 struct spdk_memory_domain *domain, void *domain_ctx, 7751 struct spdk_accel_sequence *seq) 7752 { 7753 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7754 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7755 int rc; 7756 7757 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7758 lba_count, lba); 7759 7760 bio->iovs = iov; 7761 bio->iovcnt = iovcnt; 7762 bio->iovpos = 0; 7763 bio->iov_offset = 0; 7764 7765 if (domain != NULL || seq != NULL) { 7766 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7767 bio->ext_opts.memory_domain = domain; 7768 bio->ext_opts.memory_domain_ctx = domain_ctx; 7769 bio->ext_opts.io_flags = flags; 7770 bio->ext_opts.metadata = md; 7771 bio->ext_opts.accel_sequence = seq; 7772 7773 if (iovcnt == 1) { 7774 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7775 bio, &bio->ext_opts); 7776 } else { 7777 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7778 bdev_nvme_readv_done, bio, 7779 bdev_nvme_queued_reset_sgl, 7780 bdev_nvme_queued_next_sge, 7781 &bio->ext_opts); 7782 } 7783 } else if (iovcnt == 1) { 7784 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7785 md, lba, lba_count, bdev_nvme_readv_done, 7786 bio, flags, 0, 0); 7787 } else { 7788 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7789 bdev_nvme_readv_done, bio, flags, 7790 bdev_nvme_queued_reset_sgl, 7791 bdev_nvme_queued_next_sge, md, 0, 0); 7792 } 7793 7794 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7795 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7796 } 7797 
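	/* -ENOMEM is deliberately not logged above; it indicates a transient
	 * out-of-requests condition that is retried rather than treated as an
	 * I/O failure.
	 */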
return rc; 7798 } 7799 7800 static int 7801 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7802 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7803 struct spdk_memory_domain *domain, void *domain_ctx, 7804 struct spdk_accel_sequence *seq, 7805 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 7806 { 7807 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7808 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7809 int rc; 7810 7811 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7812 lba_count, lba); 7813 7814 bio->iovs = iov; 7815 bio->iovcnt = iovcnt; 7816 bio->iovpos = 0; 7817 bio->iov_offset = 0; 7818 7819 if (domain != NULL || seq != NULL) { 7820 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7821 bio->ext_opts.memory_domain = domain; 7822 bio->ext_opts.memory_domain_ctx = domain_ctx; 7823 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 7824 bio->ext_opts.cdw13 = cdw13.raw; 7825 bio->ext_opts.metadata = md; 7826 bio->ext_opts.accel_sequence = seq; 7827 7828 if (iovcnt == 1) { 7829 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7830 bio, &bio->ext_opts); 7831 } else { 7832 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7833 bdev_nvme_writev_done, bio, 7834 bdev_nvme_queued_reset_sgl, 7835 bdev_nvme_queued_next_sge, 7836 &bio->ext_opts); 7837 } 7838 } else if (iovcnt == 1) { 7839 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7840 md, lba, lba_count, bdev_nvme_writev_done, 7841 bio, flags, 0, 0); 7842 } else { 7843 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7844 bdev_nvme_writev_done, bio, flags, 7845 bdev_nvme_queued_reset_sgl, 7846 bdev_nvme_queued_next_sge, md, 0, 0); 7847 } 7848 7849 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7850 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7851 } 7852 return rc; 7853 } 7854 7855 static int 7856 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7857 void *md, uint64_t lba_count, uint64_t zslba, 7858 uint32_t flags) 7859 { 7860 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7861 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7862 int rc; 7863 7864 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7865 lba_count, zslba); 7866 7867 bio->iovs = iov; 7868 bio->iovcnt = iovcnt; 7869 bio->iovpos = 0; 7870 bio->iov_offset = 0; 7871 7872 if (iovcnt == 1) { 7873 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7874 lba_count, 7875 bdev_nvme_zone_appendv_done, bio, 7876 flags, 7877 0, 0); 7878 } else { 7879 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7880 bdev_nvme_zone_appendv_done, bio, flags, 7881 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7882 md, 0, 0); 7883 } 7884 7885 if (rc != 0 && rc != -ENOMEM) { 7886 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7887 } 7888 return rc; 7889 } 7890 7891 static int 7892 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7893 void *md, uint64_t lba_count, uint64_t lba, 7894 uint32_t flags) 7895 { 7896 int rc; 7897 7898 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7899 lba_count, lba); 7900 7901 bio->iovs = iov; 7902 bio->iovcnt = iovcnt; 7903 bio->iovpos = 0; 7904 bio->iov_offset = 0; 7905 7906 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7907 bio->io_path->qpair->qpair, 7908 lba, lba_count, 7909 bdev_nvme_comparev_done, bio, flags, 7910 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7911 md, 0, 0); 7912 7913 if (rc != 0 && rc != -ENOMEM) { 7914 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7915 } 7916 return rc; 7917 } 7918 7919 static int 7920 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7921 struct iovec *write_iov, int write_iovcnt, 7922 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7923 { 7924 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7925 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7926 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7927 int rc; 7928 7929 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7930 lba_count, lba); 7931 7932 bio->iovs = cmp_iov; 7933 bio->iovcnt = cmp_iovcnt; 7934 bio->iovpos = 0; 7935 bio->iov_offset = 0; 7936 bio->fused_iovs = write_iov; 7937 bio->fused_iovcnt = write_iovcnt; 7938 bio->fused_iovpos = 0; 7939 bio->fused_iov_offset = 0; 7940 7941 if (bdev_io->num_retries == 0) { 7942 bio->first_fused_submitted = false; 7943 bio->first_fused_completed = false; 7944 } 7945 7946 if (!bio->first_fused_submitted) { 7947 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7948 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7949 7950 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7951 bdev_nvme_comparev_and_writev_done, bio, flags, 7952 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7953 if (rc == 0) { 7954 bio->first_fused_submitted = true; 7955 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7956 } else { 7957 if (rc != -ENOMEM) { 7958 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7959 } 7960 return rc; 7961 } 7962 } 7963 7964 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7965 7966 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7967 bdev_nvme_comparev_and_writev_done, bio, flags, 7968 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7969 if (rc != 0 && rc != -ENOMEM) { 7970 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7971 rc = 0; 7972 } 7973 7974 return rc; 7975 } 7976 7977 static int 7978 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7979 { 7980 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7981 struct spdk_nvme_dsm_range *range; 7982 uint64_t offset, remaining; 7983 uint64_t num_ranges_u64; 7984 uint16_t num_ranges; 7985 int rc; 7986 7987 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7988 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7989 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7990 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7991 return -EINVAL; 7992 } 7993 num_ranges = (uint16_t)num_ranges_u64; 7994 7995 offset = offset_blocks; 7996 remaining = num_blocks; 7997 range = &dsm_ranges[0]; 7998 7999 /* Fill max-size ranges until the remaining blocks fit into one range */ 8000 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8001 range->attributes.raw = 0; 8002 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8003 range->starting_lba = offset; 8004 8005 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8006 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8007 range++; 8008 } 8009 8010 /* Final range describes the remaining 
blocks */ 8011 range->attributes.raw = 0; 8012 range->length = remaining; 8013 range->starting_lba = offset; 8014 8015 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8016 bio->io_path->qpair->qpair, 8017 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8018 dsm_ranges, num_ranges, 8019 bdev_nvme_queued_done, bio); 8020 8021 return rc; 8022 } 8023 8024 static int 8025 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8026 { 8027 if (num_blocks > UINT16_MAX + 1) { 8028 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8029 return -EINVAL; 8030 } 8031 8032 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8033 bio->io_path->qpair->qpair, 8034 offset_blocks, num_blocks, 8035 bdev_nvme_queued_done, bio, 8036 0); 8037 } 8038 8039 static int 8040 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8041 struct spdk_bdev_zone_info *info) 8042 { 8043 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8044 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8045 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8046 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8047 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8048 8049 if (zone_id % zone_size != 0) { 8050 return -EINVAL; 8051 } 8052 8053 if (num_zones > total_zones || !num_zones) { 8054 return -EINVAL; 8055 } 8056 8057 assert(!bio->zone_report_buf); 8058 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8059 if (!bio->zone_report_buf) { 8060 return -ENOMEM; 8061 } 8062 8063 bio->handled_zones = 0; 8064 8065 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8066 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8067 bdev_nvme_get_zone_info_done, bio); 8068 } 8069 8070 static int 8071 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8072 enum spdk_bdev_zone_action action) 8073 { 8074 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8075 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8076 8077 switch (action) { 8078 case SPDK_BDEV_ZONE_CLOSE: 8079 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8080 bdev_nvme_zone_management_done, bio); 8081 case SPDK_BDEV_ZONE_FINISH: 8082 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8083 bdev_nvme_zone_management_done, bio); 8084 case SPDK_BDEV_ZONE_OPEN: 8085 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8086 bdev_nvme_zone_management_done, bio); 8087 case SPDK_BDEV_ZONE_RESET: 8088 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8089 bdev_nvme_zone_management_done, bio); 8090 case SPDK_BDEV_ZONE_OFFLINE: 8091 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8092 bdev_nvme_zone_management_done, bio); 8093 default: 8094 return -EINVAL; 8095 } 8096 } 8097 8098 static void 8099 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8100 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8101 { 8102 struct nvme_io_path *io_path; 8103 struct nvme_ctrlr *nvme_ctrlr; 8104 uint32_t max_xfer_size; 8105 int rc = -ENXIO; 8106 8107 /* Choose the first ctrlr which is not failed. */ 8108 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8109 nvme_ctrlr = io_path->qpair->ctrlr; 8110 8111 /* We should skip any unavailable nvme_ctrlr rather than checking 8112 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8113 */ 8114 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8115 continue; 8116 } 8117 8118 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8119 8120 if (nbytes > max_xfer_size) { 8121 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8122 rc = -EINVAL; 8123 goto err; 8124 } 8125 8126 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8127 bdev_nvme_admin_passthru_done, bio); 8128 if (rc == 0) { 8129 return; 8130 } 8131 } 8132 8133 err: 8134 bdev_nvme_admin_complete(bio, rc); 8135 } 8136 8137 static int 8138 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8139 void *buf, size_t nbytes) 8140 { 8141 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8142 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8143 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8144 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8145 8146 if (nbytes > max_xfer_size) { 8147 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8148 return -EINVAL; 8149 } 8150 8151 /* 8152 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8153 * so fill it out automatically. 8154 */ 8155 cmd->nsid = spdk_nvme_ns_get_id(ns); 8156 8157 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8158 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8159 } 8160 8161 static int 8162 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8163 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8164 { 8165 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8166 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8167 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8168 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8169 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8170 8171 if (nbytes > max_xfer_size) { 8172 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8173 return -EINVAL; 8174 } 8175 8176 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8177 SPDK_ERRLOG("invalid meta data buffer size\n"); 8178 return -EINVAL; 8179 } 8180 8181 /* 8182 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8183 * so fill it out automatically. 
8184 */ 8185 cmd->nsid = spdk_nvme_ns_get_id(ns); 8186 8187 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8188 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8189 } 8190 8191 static int 8192 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8193 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8194 size_t nbytes, void *md_buf, size_t md_len) 8195 { 8196 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8197 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8198 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8199 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8200 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8201 8202 bio->iovs = iov; 8203 bio->iovcnt = iovcnt; 8204 bio->iovpos = 0; 8205 bio->iov_offset = 0; 8206 8207 if (nbytes > max_xfer_size) { 8208 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8209 return -EINVAL; 8210 } 8211 8212 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8213 SPDK_ERRLOG("invalid meta data buffer size\n"); 8214 return -EINVAL; 8215 } 8216 8217 /* 8218 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8219 * require a nsid, so fill it out automatically. 8220 */ 8221 cmd->nsid = spdk_nvme_ns_get_id(ns); 8222 8223 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8224 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8225 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8226 } 8227 8228 static void 8229 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8230 struct nvme_bdev_io *bio_to_abort) 8231 { 8232 struct nvme_io_path *io_path; 8233 int rc = 0; 8234 8235 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8236 if (rc == 0) { 8237 bdev_nvme_admin_complete(bio, 0); 8238 return; 8239 } 8240 8241 io_path = bio_to_abort->io_path; 8242 if (io_path != NULL) { 8243 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8244 io_path->qpair->qpair, 8245 bio_to_abort, 8246 bdev_nvme_abort_done, bio); 8247 } else { 8248 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8249 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8250 NULL, 8251 bio_to_abort, 8252 bdev_nvme_abort_done, bio); 8253 8254 if (rc != -ENOENT) { 8255 break; 8256 } 8257 } 8258 } 8259 8260 if (rc != 0) { 8261 /* If no command was found or there was any error, complete the abort 8262 * request with failure. 
8263 */ 8264 bdev_nvme_admin_complete(bio, rc); 8265 } 8266 } 8267 8268 static int 8269 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8270 uint64_t num_blocks) 8271 { 8272 struct spdk_nvme_scc_source_range range = { 8273 .slba = src_offset_blocks, 8274 .nlb = num_blocks - 1 8275 }; 8276 8277 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8278 bio->io_path->qpair->qpair, 8279 &range, 1, dst_offset_blocks, 8280 bdev_nvme_queued_done, bio); 8281 } 8282 8283 static void 8284 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8285 { 8286 const char *action; 8287 uint32_t i; 8288 8289 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8290 action = "reset"; 8291 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8292 action = "abort"; 8293 } else { 8294 action = "none"; 8295 } 8296 8297 spdk_json_write_object_begin(w); 8298 8299 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8300 8301 spdk_json_write_named_object_begin(w, "params"); 8302 spdk_json_write_named_string(w, "action_on_timeout", action); 8303 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8304 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8305 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8306 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8307 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8308 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8309 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8310 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8311 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8312 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8313 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8314 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8315 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8316 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8317 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8318 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8319 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8320 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8321 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8322 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8323 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8324 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8325 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8326 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8327 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8328 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8329 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8330 for (i = 0; i < 32; ++i) { 8331 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8332 
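/* Bit i of the mask corresponds to DH-HMAC-CHAP digest value i; emit the name of
 * each enabled digest so the generated bdev_nvme_set_options call can be replayed.
 */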
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8333 } 8334 } 8335 spdk_json_write_array_end(w); 8336 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8337 for (i = 0; i < 32; ++i) { 8338 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8339 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8340 } 8341 } 8342 8343 spdk_json_write_array_end(w); 8344 spdk_json_write_object_end(w); 8345 8346 spdk_json_write_object_end(w); 8347 } 8348 8349 static void 8350 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8351 { 8352 struct spdk_nvme_transport_id trid; 8353 8354 spdk_json_write_object_begin(w); 8355 8356 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8357 8358 spdk_json_write_named_object_begin(w, "params"); 8359 spdk_json_write_named_string(w, "name", ctx->name); 8360 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8361 8362 trid = ctx->trid; 8363 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8364 nvme_bdev_dump_trid_json(&trid, w); 8365 8366 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8367 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8368 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8369 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8370 ctx->bdev_opts.fast_io_fail_timeout_sec); 8371 spdk_json_write_object_end(w); 8372 8373 spdk_json_write_object_end(w); 8374 } 8375 8376 #ifdef SPDK_CONFIG_NVME_CUSE 8377 static void 8378 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8379 struct nvme_ctrlr *nvme_ctrlr) 8380 { 8381 size_t cuse_name_size = 128; 8382 char cuse_name[cuse_name_size]; 8383 8384 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8385 cuse_name, &cuse_name_size) != 0) { 8386 return; 8387 } 8388 8389 spdk_json_write_object_begin(w); 8390 8391 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8392 8393 spdk_json_write_named_object_begin(w, "params"); 8394 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8395 spdk_json_write_object_end(w); 8396 8397 spdk_json_write_object_end(w); 8398 } 8399 #endif 8400 8401 static void 8402 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8403 struct nvme_ctrlr *nvme_ctrlr) 8404 { 8405 struct spdk_nvme_transport_id *trid; 8406 const struct spdk_nvme_ctrlr_opts *opts; 8407 8408 if (nvme_ctrlr->opts.from_discovery_service) { 8409 /* Do not emit an RPC for this - it will be implicitly 8410 * covered by a separate bdev_nvme_start_discovery or 8411 * bdev_nvme_start_mdns_discovery RPC. 
8412 */ 8413 return; 8414 } 8415 8416 trid = &nvme_ctrlr->active_path_id->trid; 8417 8418 spdk_json_write_object_begin(w); 8419 8420 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8421 8422 spdk_json_write_named_object_begin(w, "params"); 8423 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8424 nvme_bdev_dump_trid_json(trid, w); 8425 spdk_json_write_named_bool(w, "prchk_reftag", 8426 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8427 spdk_json_write_named_bool(w, "prchk_guard", 8428 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8429 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8430 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8431 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8432 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8433 if (nvme_ctrlr->psk != NULL) { 8434 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8435 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8436 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8437 } 8438 8439 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8440 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8441 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8442 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8443 if (opts->src_addr[0] != '\0') { 8444 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8445 } 8446 if (opts->src_svcid[0] != '\0') { 8447 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8448 } 8449 8450 spdk_json_write_object_end(w); 8451 8452 spdk_json_write_object_end(w); 8453 } 8454 8455 static void 8456 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8457 { 8458 spdk_json_write_object_begin(w); 8459 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8460 8461 spdk_json_write_named_object_begin(w, "params"); 8462 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8463 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8464 spdk_json_write_object_end(w); 8465 8466 spdk_json_write_object_end(w); 8467 } 8468 8469 static int 8470 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8471 { 8472 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8473 struct nvme_ctrlr *nvme_ctrlr; 8474 struct discovery_ctx *ctx; 8475 8476 bdev_nvme_opts_config_json(w); 8477 8478 pthread_mutex_lock(&g_bdev_nvme_mutex); 8479 8480 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8481 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8482 nvme_ctrlr_config_json(w, nvme_ctrlr); 8483 8484 #ifdef SPDK_CONFIG_NVME_CUSE 8485 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8486 #endif 8487 } 8488 } 8489 8490 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8491 if (!ctx->from_mdns_discovery_service) { 8492 bdev_nvme_discovery_config_json(w, ctx); 8493 } 8494 } 8495 8496 bdev_nvme_mdns_discovery_config_json(w); 8497 8498 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8499 * before enabling hotplug poller. 
8500 */ 8501 bdev_nvme_hotplug_config_json(w); 8502 8503 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8504 return 0; 8505 } 8506 8507 struct spdk_nvme_ctrlr * 8508 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8509 { 8510 struct nvme_bdev *nbdev; 8511 struct nvme_ns *nvme_ns; 8512 8513 if (!bdev || bdev->module != &nvme_if) { 8514 return NULL; 8515 } 8516 8517 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8518 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8519 assert(nvme_ns != NULL); 8520 8521 return nvme_ns->ctrlr->ctrlr; 8522 } 8523 8524 void 8525 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8526 { 8527 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8528 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8529 const struct spdk_nvme_ctrlr_data *cdata; 8530 const struct spdk_nvme_transport_id *trid; 8531 const struct nvme_bdev_channel *nbdev_ch; 8532 const char *adrfam_str; 8533 bool current; 8534 8535 spdk_json_write_object_begin(w); 8536 8537 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8538 8539 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8540 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8541 8542 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8543 nbdev_ch = io_path->nbdev_ch; 8544 if (nbdev_ch == NULL) { 8545 current = false; 8546 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8547 struct nvme_io_path *optimized_io_path = NULL; 8548 8549 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8550 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8551 break; 8552 } 8553 } 8554 8555 current = nvme_io_path_is_available(io_path); 8556 if (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_NON_OPTIMIZED_STATE) { 8557 /* A non-optimized path is only current if there are no optimized paths. 
*/ 8558 current = current && (optimized_io_path == NULL); 8559 } 8560 } else { 8561 current = (io_path == nbdev_ch->current_io_path); 8562 } 8563 spdk_json_write_named_bool(w, "current", current); 8564 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8565 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8566 8567 spdk_json_write_named_object_begin(w, "transport"); 8568 spdk_json_write_named_string(w, "trtype", trid->trstring); 8569 spdk_json_write_named_string(w, "traddr", trid->traddr); 8570 if (trid->trsvcid[0] != '\0') { 8571 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8572 } 8573 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8574 if (adrfam_str) { 8575 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8576 } 8577 spdk_json_write_object_end(w); 8578 8579 spdk_json_write_object_end(w); 8580 } 8581 8582 void 8583 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8584 { 8585 struct discovery_ctx *ctx; 8586 struct discovery_entry_ctx *entry_ctx; 8587 8588 spdk_json_write_array_begin(w); 8589 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8590 spdk_json_write_object_begin(w); 8591 spdk_json_write_named_string(w, "name", ctx->name); 8592 8593 spdk_json_write_named_object_begin(w, "trid"); 8594 nvme_bdev_dump_trid_json(&ctx->trid, w); 8595 spdk_json_write_object_end(w); 8596 8597 spdk_json_write_named_array_begin(w, "referrals"); 8598 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8599 spdk_json_write_object_begin(w); 8600 spdk_json_write_named_object_begin(w, "trid"); 8601 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8602 spdk_json_write_object_end(w); 8603 spdk_json_write_object_end(w); 8604 } 8605 spdk_json_write_array_end(w); 8606 8607 spdk_json_write_object_end(w); 8608 } 8609 spdk_json_write_array_end(w); 8610 } 8611 8612 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8613 8614 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8615 { 8616 struct spdk_trace_tpoint_opts opts[] = { 8617 { 8618 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8619 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8620 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8621 }, 8622 { 8623 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8624 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8625 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8626 } 8627 }; 8628 8629 8630 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8631 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8632 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8633 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8634 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8635 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8636 } 8637
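/*
 * Illustrative sketch only (not compiled behavior): bdev_nvme_config_json() above is this
 * module's config-dump hook, typically invoked when the application saves its configuration.
 * The leading entry written by bdev_nvme_opts_config_json() would look roughly like the
 * following, with field names taken from the spdk_json_write_named_*() calls and values
 * depending entirely on the runtime options:
 *
 *   {
 *     "method": "bdev_nvme_set_options",
 *     "params": {
 *       "action_on_timeout": "none",
 *       "timeout_us": 0,
 *       "keep_alive_timeout_ms": 10000,
 *       ...
 *       "dhchap_digests": [ "sha256", ... ]
 *     }
 *   }
 */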