/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq,
			    union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	const struct spdk_nvme_ctrlr_opts *opts;
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
		    strcmp(hostnqn, opts->hostnqn) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

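	/* Walk every registered nvme_bdev_ctrlr and return the nvme_ctrlr whose active
	 * path matches both the transport ID and the host NQN, or NULL if none matches.
	 */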
	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);
	spdk_keyring_put_key(nvme_ctrlr->psk);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

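	/* A new I/O path is now available, so drop the cached path selection and let the
	 * next I/O re-evaluate the whole path list.
	 */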
	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bdev_io, &nbdev_ch->retry_io_list, module_link) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_active(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	if (spdk_unlikely(nvme_ns->ns == NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_available(io_path))) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				assert(false);
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is marked unfailed
 * when a reset starts and marked failed if that reset fails. Hence, if a ctrlr is
 * unfailed, it is likely either working fine or resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct spdk_bdev_io *bdev_io, *tmp_bdev_io;
	struct nvme_bdev_io *bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_bdev_io) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);

		bdev_nvme_retry_io(nbdev_ch, bdev_io);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bdev_io = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bdev_io != NULL) {
		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;

		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct spdk_bdev_io *tmp_bdev_io;
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bdev_io, &nbdev_ch->retry_io_list, retry_io_head, module_link) {
		tmp_bio = (struct nvme_bdev_io *)tmp_bdev_io->driver_ctx;

		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bdev_io, bdev_io,
					   module_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bdev_io, module_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct spdk_bdev_io *bdev_io, *tmp_io;

	TAILQ_FOREACH_SAFE(bdev_io, &nbdev_ch->retry_io_list, module_link, tmp_io) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io, module_link);
		__bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct spdk_bdev_io *bdev_io_to_abort;

	TAILQ_FOREACH(bdev_io_to_abort, &nbdev_ch->retry_io_list, module_link) {
		if ((struct nvme_bdev_io *)bdev_io_to_abort->driver_ctx == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bdev_io_to_abort, module_link);
			__bdev_nvme_io_complete(bdev_io_to_abort, SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
			nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

			bdev_nvme_clear_current_io_path(nbdev_ch);
			bio->io_path = NULL;

			if (any_io_path_may_become_available(nbdev_ch)) {
				bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
				return;
			}
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}
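
/* For any qpair in the poll group whose transport reports a failure, drop the cached
 * I/O path selection on the bdev channels that use it so the next I/O re-selects a path.
 */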
static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc = 0;
	struct spdk_bdev_io *bdev_io;
	struct nvme_bdev_io *bio;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		rc = -1;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);

		bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
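		/* Resume each queued reset I/O with the outcome of the reset sequence that just finished. */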
		bdev_nvme_reset_io_continue(bio, rc);
	}

	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then, if one exists, makes the next trid the controller's active trid.
 *
 * The purpose of the boolean return value is to request the caller to disconnect
 * the current trid now to try connecting the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. A trid is considered failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
		 * or used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds. */
*/ 1901 return false; 1902 } 1903 1904 static bool 1905 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1906 { 1907 int32_t elapsed; 1908 1909 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1910 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1911 return false; 1912 } 1913 1914 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1915 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1916 return true; 1917 } else { 1918 return false; 1919 } 1920 } 1921 1922 static bool 1923 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1924 { 1925 uint32_t elapsed; 1926 1927 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1928 return false; 1929 } 1930 1931 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1932 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1933 return true; 1934 } else { 1935 return false; 1936 } 1937 } 1938 1939 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1940 1941 static void 1942 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1943 { 1944 int rc; 1945 1946 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1947 if (rc != 0) { 1948 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1949 * fail the reset sequence immediately. 1950 */ 1951 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1952 return; 1953 } 1954 1955 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1956 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1957 */ 1958 assert(nvme_ctrlr->disconnected_cb == NULL); 1959 nvme_ctrlr->disconnected_cb = cb_fn; 1960 1961 /* During disconnection, reduce the period to poll adminq more often. */ 1962 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1963 } 1964 1965 enum bdev_nvme_op_after_reset { 1966 OP_NONE, 1967 OP_COMPLETE_PENDING_DESTRUCT, 1968 OP_DESTRUCT, 1969 OP_DELAYED_RECONNECT, 1970 OP_FAILOVER, 1971 }; 1972 1973 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1974 1975 static _bdev_nvme_op_after_reset 1976 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1977 { 1978 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1979 /* Complete pending destruct after reset completes. 
*/ 1980 return OP_COMPLETE_PENDING_DESTRUCT; 1981 } else if (nvme_ctrlr->pending_failover) { 1982 nvme_ctrlr->pending_failover = false; 1983 nvme_ctrlr->reset_start_tsc = 0; 1984 return OP_FAILOVER; 1985 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1986 nvme_ctrlr->reset_start_tsc = 0; 1987 return OP_NONE; 1988 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1989 return OP_DESTRUCT; 1990 } else { 1991 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1992 nvme_ctrlr->fast_io_fail_timedout = true; 1993 } 1994 return OP_DELAYED_RECONNECT; 1995 } 1996 } 1997 1998 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1999 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 2000 2001 static int 2002 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2003 { 2004 struct nvme_ctrlr *nvme_ctrlr = ctx; 2005 2006 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2007 pthread_mutex_lock(&nvme_ctrlr->mutex); 2008 2009 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2010 2011 if (!nvme_ctrlr->reconnect_is_delayed) { 2012 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2013 return SPDK_POLLER_BUSY; 2014 } 2015 2016 nvme_ctrlr->reconnect_is_delayed = false; 2017 2018 if (nvme_ctrlr->destruct) { 2019 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2020 return SPDK_POLLER_BUSY; 2021 } 2022 2023 assert(nvme_ctrlr->resetting == false); 2024 nvme_ctrlr->resetting = true; 2025 2026 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2027 2028 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2029 2030 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2031 return SPDK_POLLER_BUSY; 2032 } 2033 2034 static void 2035 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2036 { 2037 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2038 2039 assert(nvme_ctrlr->reconnect_is_delayed == false); 2040 nvme_ctrlr->reconnect_is_delayed = true; 2041 2042 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2043 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2044 nvme_ctrlr, 2045 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2046 } 2047 2048 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2049 2050 static void 2051 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2052 { 2053 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2054 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2055 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2056 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2057 enum bdev_nvme_op_after_reset op_after_reset; 2058 2059 assert(nvme_ctrlr->thread == spdk_get_thread()); 2060 2061 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2062 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2063 2064 if (!success) { 2065 SPDK_ERRLOG("Resetting controller failed.\n"); 2066 } else { 2067 SPDK_NOTICELOG("Resetting controller successful.\n"); 2068 } 2069 2070 pthread_mutex_lock(&nvme_ctrlr->mutex); 2071 nvme_ctrlr->resetting = false; 2072 nvme_ctrlr->dont_retry = false; 2073 nvme_ctrlr->in_failover = false; 2074 2075 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2076 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2077 2078 /* Delay callbacks when the next operation is a failover. */ 2079 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2080 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2081 } 2082 2083 switch (op_after_reset) { 2084 case OP_COMPLETE_PENDING_DESTRUCT: 2085 nvme_ctrlr_unregister(nvme_ctrlr); 2086 break; 2087 case OP_DESTRUCT: 2088 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2089 remove_discovery_entry(nvme_ctrlr); 2090 break; 2091 case OP_DELAYED_RECONNECT: 2092 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2093 break; 2094 case OP_FAILOVER: 2095 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2096 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2097 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2098 break; 2099 default: 2100 break; 2101 } 2102 } 2103 2104 static void 2105 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2106 { 2107 pthread_mutex_lock(&nvme_ctrlr->mutex); 2108 if (!success) { 2109 /* Connecting the active trid failed. Set the next alternate trid to the 2110 * active trid if it exists. 2111 */ 2112 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2113 /* The next alternate trid exists and is ready to try. Try it now. */ 2114 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2115 2116 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2117 return; 2118 } 2119 2120 /* We came here if there is no alternate trid or if the next trid exists but 2121 * is not ready to try. We will try the active trid after reconnect_delay_sec 2122 * seconds if it is non-zero or at the next reset call otherwise. 2123 */ 2124 } else { 2125 /* Connecting the active trid succeeded. Clear the last failed time because it 2126 * means the trid is failed if its last failed time is non-zero. 2127 */ 2128 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2129 } 2130 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2131 2132 /* Make sure we clear any pending resets before returning. */ 2133 spdk_for_each_channel(nvme_ctrlr, 2134 bdev_nvme_complete_pending_resets, 2135 success ? NULL : (void *)0x1, 2136 _bdev_nvme_reset_ctrlr_complete); 2137 } 2138 2139 static void 2140 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2141 { 2142 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2143 2144 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2145 } 2146 2147 static void 2148 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2149 { 2150 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2151 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2152 struct nvme_qpair *nvme_qpair; 2153 2154 nvme_qpair = ctrlr_ch->qpair; 2155 assert(nvme_qpair != NULL); 2156 2157 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2158 2159 if (nvme_qpair->qpair != NULL) { 2160 if (nvme_qpair->ctrlr->dont_retry) { 2161 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2162 } 2163 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2164 2165 /* The current full reset sequence will move to the next 2166 * ctrlr_channel after the qpair is actually disconnected. 2167 */ 2168 assert(ctrlr_ch->reset_iter == NULL); 2169 ctrlr_ch->reset_iter = i; 2170 } else { 2171 spdk_for_each_channel_continue(i, 0); 2172 } 2173 } 2174 2175 static void 2176 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2177 { 2178 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2179 2180 if (status == 0) { 2181 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2182 } else { 2183 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 2184 spdk_for_each_channel(nvme_ctrlr, 2185 bdev_nvme_reset_destroy_qpair, 2186 NULL, 2187 bdev_nvme_reset_create_qpairs_failed); 2188 } 2189 } 2190 2191 static int 2192 bdev_nvme_reset_check_qpair_connected(void *ctx) 2193 { 2194 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2195 2196 if (ctrlr_ch->reset_iter == NULL) { 2197 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2198 assert(ctrlr_ch->connect_poller == NULL); 2199 assert(ctrlr_ch->qpair->qpair == NULL); 2200 return SPDK_POLLER_BUSY; 2201 } 2202 2203 assert(ctrlr_ch->qpair->qpair != NULL); 2204 2205 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2206 return SPDK_POLLER_BUSY; 2207 } 2208 2209 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2210 2211 /* The qpair has finished connecting. Move to the next ctrlr_channel. */ 2212 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2213 ctrlr_ch->reset_iter = NULL; 2214 2215 if (!g_opts.disable_auto_failback) { 2216 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2217 } 2218 2219 return SPDK_POLLER_BUSY; 2220 } 2221 2222 static void 2223 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2224 { 2225 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2226 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2227 int rc; 2228 2229 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2230 if (rc == 0) { 2231 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2232 ctrlr_ch, 0); 2233 2234 /* The current full reset sequence will move to the next 2235 * ctrlr_channel after the qpair is actually connected. 2236 */ 2237 assert(ctrlr_ch->reset_iter == NULL); 2238 ctrlr_ch->reset_iter = i; 2239 } else { 2240 spdk_for_each_channel_continue(i, rc); 2241 } 2242 } 2243 2244 static void 2245 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2246 { 2247 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2248 struct nvme_ns *nvme_ns; 2249 2250 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2251 nvme_ns != NULL; 2252 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2253 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2254 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2255 /* NS can be added again. Just nullify nvme_ns->ns. */ 2256 nvme_ns->ns = NULL; 2257 } 2258 } 2259 } 2260 2261 2262 static int 2263 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2264 { 2265 struct nvme_ctrlr *nvme_ctrlr = arg; 2266 int rc = -ETIMEDOUT; 2267 2268 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2269 /* Mark the ctrlr as failed. The next call to 2270 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2271 * do the necessary cleanup and return failure.
2272 */ 2273 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2274 } 2275 2276 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2277 if (rc == -EAGAIN) { 2278 return SPDK_POLLER_BUSY; 2279 } 2280 2281 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2282 if (rc == 0) { 2283 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2284 2285 /* Recreate all of the I/O queue pairs */ 2286 spdk_for_each_channel(nvme_ctrlr, 2287 bdev_nvme_reset_create_qpair, 2288 NULL, 2289 bdev_nvme_reset_create_qpairs_done); 2290 } else { 2291 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2292 } 2293 return SPDK_POLLER_BUSY; 2294 } 2295 2296 static void 2297 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2298 { 2299 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2300 2301 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2302 assert(nvme_ctrlr->reset_detach_poller == NULL); 2303 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2304 nvme_ctrlr, 0); 2305 } 2306 2307 static void 2308 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2309 { 2310 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2311 2312 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2313 assert(status == 0); 2314 2315 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2316 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2317 } else { 2318 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2319 } 2320 } 2321 2322 static void 2323 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2324 { 2325 spdk_for_each_channel(nvme_ctrlr, 2326 bdev_nvme_reset_destroy_qpair, 2327 NULL, 2328 bdev_nvme_reset_destroy_qpair_done); 2329 } 2330 2331 static void 2332 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2333 { 2334 struct nvme_ctrlr *nvme_ctrlr = ctx; 2335 2336 assert(nvme_ctrlr->resetting == true); 2337 assert(nvme_ctrlr->thread == spdk_get_thread()); 2338 2339 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2340 2341 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2342 2343 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2344 } 2345 2346 static void 2347 _bdev_nvme_reset_ctrlr(void *ctx) 2348 { 2349 struct nvme_ctrlr *nvme_ctrlr = ctx; 2350 2351 assert(nvme_ctrlr->resetting == true); 2352 assert(nvme_ctrlr->thread == spdk_get_thread()); 2353 2354 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2355 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2356 } else { 2357 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2358 } 2359 } 2360 2361 static int 2362 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2363 { 2364 spdk_msg_fn msg_fn; 2365 2366 pthread_mutex_lock(&nvme_ctrlr->mutex); 2367 if (nvme_ctrlr->destruct) { 2368 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2369 return -ENXIO; 2370 } 2371 2372 if (nvme_ctrlr->resetting) { 2373 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2374 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2375 return -EBUSY; 2376 } 2377 2378 if (nvme_ctrlr->disabled) { 2379 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2380 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2381 return -EALREADY; 2382 } 2383 2384 nvme_ctrlr->resetting = true; 2385 nvme_ctrlr->dont_retry = true; 2386 2387 if (nvme_ctrlr->reconnect_is_delayed) { 2388 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2389 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2390 nvme_ctrlr->reconnect_is_delayed = false; 2391 } else { 2392 msg_fn = _bdev_nvme_reset_ctrlr; 2393 assert(nvme_ctrlr->reset_start_tsc == 0); 2394 } 2395 2396 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2397 2398 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2399 2400 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2401 return 0; 2402 } 2403 2404 static int 2405 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2406 { 2407 pthread_mutex_lock(&nvme_ctrlr->mutex); 2408 if (nvme_ctrlr->destruct) { 2409 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2410 return -ENXIO; 2411 } 2412 2413 if (nvme_ctrlr->resetting) { 2414 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2415 return -EBUSY; 2416 } 2417 2418 if (!nvme_ctrlr->disabled) { 2419 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2420 return -EALREADY; 2421 } 2422 2423 nvme_ctrlr->disabled = false; 2424 nvme_ctrlr->resetting = true; 2425 2426 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2427 2428 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2429 2430 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2431 return 0; 2432 } 2433 2434 static void 2435 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2436 { 2437 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2438 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2439 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2440 enum bdev_nvme_op_after_reset op_after_disable; 2441 2442 assert(nvme_ctrlr->thread == spdk_get_thread()); 2443 2444 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2445 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2446 2447 pthread_mutex_lock(&nvme_ctrlr->mutex); 2448 2449 nvme_ctrlr->resetting = false; 2450 nvme_ctrlr->dont_retry = false; 2451 2452 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2453 2454 nvme_ctrlr->disabled = true; 2455 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2456 2457 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2458 2459 if (ctrlr_op_cb_fn) { 2460 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2461 } 2462 2463 switch (op_after_disable) { 2464 case OP_COMPLETE_PENDING_DESTRUCT: 2465 nvme_ctrlr_unregister(nvme_ctrlr); 2466 break; 2467 default: 2468 break; 2469 } 2470 2471 } 2472 2473 static void 2474 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2475 { 2476 /* Make sure we clear any pending resets before returning. 
*/ 2477 spdk_for_each_channel(nvme_ctrlr, 2478 bdev_nvme_complete_pending_resets, 2479 NULL, 2480 _bdev_nvme_disable_ctrlr_complete); 2481 } 2482 2483 static void 2484 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2485 { 2486 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2487 2488 assert(status == 0); 2489 2490 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2491 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2492 } else { 2493 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2494 } 2495 } 2496 2497 static void 2498 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2499 { 2500 spdk_for_each_channel(nvme_ctrlr, 2501 bdev_nvme_reset_destroy_qpair, 2502 NULL, 2503 bdev_nvme_disable_destroy_qpairs_done); 2504 } 2505 2506 static void 2507 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2508 { 2509 struct nvme_ctrlr *nvme_ctrlr = ctx; 2510 2511 assert(nvme_ctrlr->resetting == true); 2512 assert(nvme_ctrlr->thread == spdk_get_thread()); 2513 2514 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2515 2516 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2517 } 2518 2519 static void 2520 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2521 { 2522 struct nvme_ctrlr *nvme_ctrlr = ctx; 2523 2524 assert(nvme_ctrlr->resetting == true); 2525 assert(nvme_ctrlr->thread == spdk_get_thread()); 2526 2527 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2528 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2529 } else { 2530 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2531 } 2532 } 2533 2534 static int 2535 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2536 { 2537 spdk_msg_fn msg_fn; 2538 2539 pthread_mutex_lock(&nvme_ctrlr->mutex); 2540 if (nvme_ctrlr->destruct) { 2541 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2542 return -ENXIO; 2543 } 2544 2545 if (nvme_ctrlr->resetting) { 2546 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2547 return -EBUSY; 2548 } 2549 2550 if (nvme_ctrlr->disabled) { 2551 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2552 return -EALREADY; 2553 } 2554 2555 nvme_ctrlr->resetting = true; 2556 nvme_ctrlr->dont_retry = true; 2557 2558 if (nvme_ctrlr->reconnect_is_delayed) { 2559 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2560 nvme_ctrlr->reconnect_is_delayed = false; 2561 } else { 2562 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2563 } 2564 2565 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2566 2567 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2568 2569 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2570 return 0; 2571 } 2572 2573 static int 2574 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2575 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2576 { 2577 int rc; 2578 2579 switch (op) { 2580 case NVME_CTRLR_OP_RESET: 2581 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2582 break; 2583 case NVME_CTRLR_OP_ENABLE: 2584 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2585 break; 2586 case NVME_CTRLR_OP_DISABLE: 2587 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2588 break; 2589 default: 2590 rc = -EINVAL; 2591 break; 2592 } 2593 2594 if (rc == 0) { 2595 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2596 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2597 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2598 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2599 } 2600 return rc; 2601 } 2602 2603 struct nvme_ctrlr_op_rpc_ctx { 2604 struct nvme_ctrlr *nvme_ctrlr; 2605 struct spdk_thread *orig_thread; 2606 enum nvme_ctrlr_op op; 2607 int rc; 2608 
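/* Completion callback and argument supplied by the RPC caller; invoked on orig_thread once the requested controller operation finishes. */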
bdev_nvme_ctrlr_op_cb cb_fn; 2609 void *cb_arg; 2610 }; 2611 2612 static void 2613 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2614 { 2615 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2616 2617 assert(ctx != NULL); 2618 assert(ctx->cb_fn != NULL); 2619 2620 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2621 2622 free(ctx); 2623 } 2624 2625 static void 2626 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2627 { 2628 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2629 2630 ctx->rc = rc; 2631 2632 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2633 } 2634 2635 void 2636 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2637 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2638 { 2639 struct nvme_ctrlr_op_rpc_ctx *ctx; 2640 int rc; 2641 2642 assert(cb_fn != NULL); 2643 2644 ctx = calloc(1, sizeof(*ctx)); 2645 if (ctx == NULL) { 2646 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2647 cb_fn(cb_arg, -ENOMEM); 2648 return; 2649 } 2650 2651 ctx->orig_thread = spdk_get_thread(); 2652 ctx->cb_fn = cb_fn; 2653 ctx->cb_arg = cb_arg; 2654 2655 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2656 if (rc == 0) { 2657 return; 2658 } else if (rc == -EALREADY) { 2659 rc = 0; 2660 } 2661 2662 nvme_ctrlr_op_rpc_complete(ctx, rc); 2663 } 2664 2665 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2666 2667 static void 2668 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2669 { 2670 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2671 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2672 int rc; 2673 2674 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2675 ctx->nvme_ctrlr = NULL; 2676 2677 if (ctx->rc != 0) { 2678 goto complete; 2679 } 2680 2681 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2682 if (next_nvme_ctrlr == NULL) { 2683 goto complete; 2684 } 2685 2686 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2687 if (rc == 0) { 2688 ctx->nvme_ctrlr = next_nvme_ctrlr; 2689 return; 2690 } else if (rc == -EALREADY) { 2691 ctx->nvme_ctrlr = next_nvme_ctrlr; 2692 rc = 0; 2693 } 2694 2695 ctx->rc = rc; 2696 2697 complete: 2698 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2699 free(ctx); 2700 } 2701 2702 static void 2703 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2704 { 2705 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2706 2707 ctx->rc = rc; 2708 2709 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2710 } 2711 2712 void 2713 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2714 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2715 { 2716 struct nvme_ctrlr_op_rpc_ctx *ctx; 2717 struct nvme_ctrlr *nvme_ctrlr; 2718 int rc; 2719 2720 assert(cb_fn != NULL); 2721 2722 ctx = calloc(1, sizeof(*ctx)); 2723 if (ctx == NULL) { 2724 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2725 cb_fn(cb_arg, -ENOMEM); 2726 return; 2727 } 2728 2729 ctx->orig_thread = spdk_get_thread(); 2730 ctx->op = op; 2731 ctx->cb_fn = cb_fn; 2732 ctx->cb_arg = cb_arg; 2733 2734 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2735 assert(nvme_ctrlr != NULL); 2736 2737 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2738 if (rc == 0) { 2739 ctx->nvme_ctrlr = nvme_ctrlr; 2740 return; 2741 } else if (rc == -EALREADY) { 2742 ctx->nvme_ctrlr = nvme_ctrlr; 2743 rc = 0; 2744 } 2745 2746 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2747 } 2748 2749 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2750 2751 static void 2752 
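/* Completion callback of the per-channel abort iteration started by bdev_nvme_reset_io_complete(); completes the reset bdev_io with the status saved in bio->cpl.cdw0. */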
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2753 { 2754 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2755 enum spdk_bdev_io_status io_status; 2756 2757 if (bio->cpl.cdw0 == 0) { 2758 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2759 } else { 2760 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2761 } 2762 2763 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2764 } 2765 2766 static void 2767 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2768 { 2769 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2770 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2771 2772 bdev_nvme_abort_retry_ios(nbdev_ch); 2773 2774 spdk_for_each_channel_continue(i, 0); 2775 } 2776 2777 static void 2778 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2779 { 2780 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2781 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2782 2783 /* Abort all queued I/Os for retry. */ 2784 spdk_for_each_channel(nbdev, 2785 bdev_nvme_abort_bdev_channel, 2786 bio, 2787 _bdev_nvme_reset_io_complete); 2788 } 2789 2790 static void 2791 _bdev_nvme_reset_io_continue(void *ctx) 2792 { 2793 struct nvme_bdev_io *bio = ctx; 2794 struct nvme_io_path *prev_io_path, *next_io_path; 2795 int rc; 2796 2797 prev_io_path = bio->io_path; 2798 bio->io_path = NULL; 2799 2800 if (bio->cpl.cdw0 != 0) { 2801 goto complete; 2802 } 2803 2804 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2805 if (next_io_path == NULL) { 2806 goto complete; 2807 } 2808 2809 rc = _bdev_nvme_reset_io(next_io_path, bio); 2810 if (rc == 0) { 2811 return; 2812 } 2813 2814 bio->cpl.cdw0 = 1; 2815 2816 complete: 2817 bdev_nvme_reset_io_complete(bio); 2818 } 2819 2820 static void 2821 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2822 { 2823 struct nvme_bdev_io *bio = cb_arg; 2824 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2825 2826 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2827 2828 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2829 } 2830 2831 static int 2832 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2833 { 2834 struct nvme_ctrlr_channel *ctrlr_ch; 2835 struct spdk_bdev_io *bdev_io; 2836 int rc; 2837 2838 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2839 bdev_nvme_reset_io_continue, bio); 2840 if (rc != 0 && rc != -EBUSY) { 2841 return rc; 2842 } 2843 2844 assert(bio->io_path == NULL); 2845 bio->io_path = io_path; 2846 2847 if (rc == -EBUSY) { 2848 ctrlr_ch = io_path->qpair->ctrlr_ch; 2849 assert(ctrlr_ch != NULL); 2850 /* 2851 * Reset call is queued only if it is from the app framework. This is on purpose so that 2852 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2853 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2854 */ 2855 bdev_io = spdk_bdev_io_from_ctx(bio); 2856 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link); 2857 } 2858 2859 return 0; 2860 } 2861 2862 static void 2863 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2864 { 2865 struct nvme_io_path *io_path; 2866 int rc; 2867 2868 bio->cpl.cdw0 = 0; 2869 2870 /* Reset all nvme_ctrlrs of a bdev controller sequentially. 
*/ 2871 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2872 assert(io_path != NULL); 2873 2874 rc = _bdev_nvme_reset_io(io_path, bio); 2875 if (rc != 0) { 2876 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2877 rc = (rc == -EALREADY) ? 0 : rc; 2878 2879 bdev_nvme_reset_io_continue(bio, rc); 2880 } 2881 } 2882 2883 static int 2884 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2885 { 2886 if (nvme_ctrlr->destruct) { 2887 /* Don't bother resetting if the controller is in the process of being destructed. */ 2888 return -ENXIO; 2889 } 2890 2891 if (nvme_ctrlr->resetting) { 2892 if (!nvme_ctrlr->in_failover) { 2893 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2894 2895 /* Defer failover until reset completes. */ 2896 nvme_ctrlr->pending_failover = true; 2897 return -EINPROGRESS; 2898 } else { 2899 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2900 return -EBUSY; 2901 } 2902 } 2903 2904 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2905 2906 if (nvme_ctrlr->reconnect_is_delayed) { 2907 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2908 2909 /* We rely on the next reconnect for the failover. */ 2910 return -EALREADY; 2911 } 2912 2913 if (nvme_ctrlr->disabled) { 2914 SPDK_NOTICELOG("Controller is disabled.\n"); 2915 2916 /* We rely on the enablement for the failover. */ 2917 return -EALREADY; 2918 } 2919 2920 nvme_ctrlr->resetting = true; 2921 nvme_ctrlr->in_failover = true; 2922 2923 assert(nvme_ctrlr->reset_start_tsc == 0); 2924 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2925 2926 return 0; 2927 } 2928 2929 static int 2930 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2931 { 2932 int rc; 2933 2934 pthread_mutex_lock(&nvme_ctrlr->mutex); 2935 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2936 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2937 2938 if (rc == 0) { 2939 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2940 } else if (rc == -EALREADY) { 2941 rc = 0; 2942 } 2943 2944 return rc; 2945 } 2946 2947 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2948 uint64_t num_blocks); 2949 2950 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2951 uint64_t num_blocks); 2952 2953 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2954 uint64_t src_offset_blocks, 2955 uint64_t num_blocks); 2956 2957 static void 2958 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2959 bool success) 2960 { 2961 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2962 int ret; 2963 2964 if (!success) { 2965 ret = -EINVAL; 2966 goto exit; 2967 } 2968 2969 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2970 ret = -ENXIO; 2971 goto exit; 2972 } 2973 2974 ret = bdev_nvme_readv(bio, 2975 bdev_io->u.bdev.iovs, 2976 bdev_io->u.bdev.iovcnt, 2977 bdev_io->u.bdev.md_buf, 2978 bdev_io->u.bdev.num_blocks, 2979 bdev_io->u.bdev.offset_blocks, 2980 bdev_io->u.bdev.dif_check_flags, 2981 bdev_io->u.bdev.memory_domain, 2982 bdev_io->u.bdev.memory_domain_ctx, 2983 bdev_io->u.bdev.accel_sequence); 2984 2985 exit: 2986 if (spdk_unlikely(ret != 0)) { 2987 bdev_nvme_io_complete(bio, ret); 2988 } 2989 } 2990 2991 static inline void 2992 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2993 { 2994 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io 
*)bdev_io->driver_ctx; 2995 struct spdk_bdev *bdev = bdev_io->bdev; 2996 struct nvme_bdev_io *nbdev_io_to_abort; 2997 int rc = 0; 2998 2999 switch (bdev_io->type) { 3000 case SPDK_BDEV_IO_TYPE_READ: 3001 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 3002 3003 rc = bdev_nvme_readv(nbdev_io, 3004 bdev_io->u.bdev.iovs, 3005 bdev_io->u.bdev.iovcnt, 3006 bdev_io->u.bdev.md_buf, 3007 bdev_io->u.bdev.num_blocks, 3008 bdev_io->u.bdev.offset_blocks, 3009 bdev_io->u.bdev.dif_check_flags, 3010 bdev_io->u.bdev.memory_domain, 3011 bdev_io->u.bdev.memory_domain_ctx, 3012 bdev_io->u.bdev.accel_sequence); 3013 } else { 3014 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3015 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3016 rc = 0; 3017 } 3018 break; 3019 case SPDK_BDEV_IO_TYPE_WRITE: 3020 rc = bdev_nvme_writev(nbdev_io, 3021 bdev_io->u.bdev.iovs, 3022 bdev_io->u.bdev.iovcnt, 3023 bdev_io->u.bdev.md_buf, 3024 bdev_io->u.bdev.num_blocks, 3025 bdev_io->u.bdev.offset_blocks, 3026 bdev_io->u.bdev.dif_check_flags, 3027 bdev_io->u.bdev.memory_domain, 3028 bdev_io->u.bdev.memory_domain_ctx, 3029 bdev_io->u.bdev.accel_sequence, 3030 bdev_io->u.bdev.nvme_cdw12, 3031 bdev_io->u.bdev.nvme_cdw13); 3032 break; 3033 case SPDK_BDEV_IO_TYPE_COMPARE: 3034 rc = bdev_nvme_comparev(nbdev_io, 3035 bdev_io->u.bdev.iovs, 3036 bdev_io->u.bdev.iovcnt, 3037 bdev_io->u.bdev.md_buf, 3038 bdev_io->u.bdev.num_blocks, 3039 bdev_io->u.bdev.offset_blocks, 3040 bdev_io->u.bdev.dif_check_flags); 3041 break; 3042 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3043 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3044 bdev_io->u.bdev.iovs, 3045 bdev_io->u.bdev.iovcnt, 3046 bdev_io->u.bdev.fused_iovs, 3047 bdev_io->u.bdev.fused_iovcnt, 3048 bdev_io->u.bdev.md_buf, 3049 bdev_io->u.bdev.num_blocks, 3050 bdev_io->u.bdev.offset_blocks, 3051 bdev_io->u.bdev.dif_check_flags); 3052 break; 3053 case SPDK_BDEV_IO_TYPE_UNMAP: 3054 rc = bdev_nvme_unmap(nbdev_io, 3055 bdev_io->u.bdev.offset_blocks, 3056 bdev_io->u.bdev.num_blocks); 3057 break; 3058 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3059 rc = bdev_nvme_write_zeroes(nbdev_io, 3060 bdev_io->u.bdev.offset_blocks, 3061 bdev_io->u.bdev.num_blocks); 3062 break; 3063 case SPDK_BDEV_IO_TYPE_RESET: 3064 nbdev_io->io_path = NULL; 3065 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3066 return; 3067 3068 case SPDK_BDEV_IO_TYPE_FLUSH: 3069 bdev_nvme_io_complete(nbdev_io, 0); 3070 return; 3071 3072 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3073 rc = bdev_nvme_zone_appendv(nbdev_io, 3074 bdev_io->u.bdev.iovs, 3075 bdev_io->u.bdev.iovcnt, 3076 bdev_io->u.bdev.md_buf, 3077 bdev_io->u.bdev.num_blocks, 3078 bdev_io->u.bdev.offset_blocks, 3079 bdev_io->u.bdev.dif_check_flags); 3080 break; 3081 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3082 rc = bdev_nvme_get_zone_info(nbdev_io, 3083 bdev_io->u.zone_mgmt.zone_id, 3084 bdev_io->u.zone_mgmt.num_zones, 3085 bdev_io->u.zone_mgmt.buf); 3086 break; 3087 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3088 rc = bdev_nvme_zone_management(nbdev_io, 3089 bdev_io->u.zone_mgmt.zone_id, 3090 bdev_io->u.zone_mgmt.zone_action); 3091 break; 3092 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3093 nbdev_io->io_path = NULL; 3094 bdev_nvme_admin_passthru(nbdev_ch, 3095 nbdev_io, 3096 &bdev_io->u.nvme_passthru.cmd, 3097 bdev_io->u.nvme_passthru.buf, 3098 bdev_io->u.nvme_passthru.nbytes); 3099 return; 3100 3101 case SPDK_BDEV_IO_TYPE_NVME_IO: 3102 rc = bdev_nvme_io_passthru(nbdev_io, 3103 &bdev_io->u.nvme_passthru.cmd, 3104 bdev_io->u.nvme_passthru.buf, 3105 bdev_io->u.nvme_passthru.nbytes); 3106 break; 3107 case 
SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3108 rc = bdev_nvme_io_passthru_md(nbdev_io, 3109 &bdev_io->u.nvme_passthru.cmd, 3110 bdev_io->u.nvme_passthru.buf, 3111 bdev_io->u.nvme_passthru.nbytes, 3112 bdev_io->u.nvme_passthru.md_buf, 3113 bdev_io->u.nvme_passthru.md_len); 3114 break; 3115 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3116 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3117 &bdev_io->u.nvme_passthru.cmd, 3118 bdev_io->u.nvme_passthru.iovs, 3119 bdev_io->u.nvme_passthru.iovcnt, 3120 bdev_io->u.nvme_passthru.nbytes, 3121 bdev_io->u.nvme_passthru.md_buf, 3122 bdev_io->u.nvme_passthru.md_len); 3123 break; 3124 case SPDK_BDEV_IO_TYPE_ABORT: 3125 nbdev_io->io_path = NULL; 3126 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3127 bdev_nvme_abort(nbdev_ch, 3128 nbdev_io, 3129 nbdev_io_to_abort); 3130 return; 3131 3132 case SPDK_BDEV_IO_TYPE_COPY: 3133 rc = bdev_nvme_copy(nbdev_io, 3134 bdev_io->u.bdev.offset_blocks, 3135 bdev_io->u.bdev.copy.src_offset_blocks, 3136 bdev_io->u.bdev.num_blocks); 3137 break; 3138 default: 3139 rc = -EINVAL; 3140 break; 3141 } 3142 3143 if (spdk_unlikely(rc != 0)) { 3144 bdev_nvme_io_complete(nbdev_io, rc); 3145 } 3146 } 3147 3148 static void 3149 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3150 { 3151 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3152 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3153 3154 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3155 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3156 } else { 3157 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3158 * We need to update submit_tsc here. 3159 */ 3160 nbdev_io->submit_tsc = spdk_get_ticks(); 3161 } 3162 3163 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3164 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3165 if (spdk_unlikely(!nbdev_io->io_path)) { 3166 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3167 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3168 return; 3169 } 3170 3171 /* Admin commands do not use the optimal I/O path. 3172 * Simply fall through even if it is not found. 3173 */ 3174 } 3175 3176 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3177 } 3178 3179 static bool 3180 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3181 { 3182 struct nvme_bdev *nbdev = ctx; 3183 struct nvme_ns *nvme_ns; 3184 struct spdk_nvme_ns *ns; 3185 struct spdk_nvme_ctrlr *ctrlr; 3186 const struct spdk_nvme_ctrlr_data *cdata; 3187 3188 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3189 assert(nvme_ns != NULL); 3190 ns = nvme_ns->ns; 3191 if (ns == NULL) { 3192 return false; 3193 } 3194 3195 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3196 3197 switch (io_type) { 3198 case SPDK_BDEV_IO_TYPE_READ: 3199 case SPDK_BDEV_IO_TYPE_WRITE: 3200 case SPDK_BDEV_IO_TYPE_RESET: 3201 case SPDK_BDEV_IO_TYPE_FLUSH: 3202 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3203 case SPDK_BDEV_IO_TYPE_NVME_IO: 3204 case SPDK_BDEV_IO_TYPE_ABORT: 3205 return true; 3206 3207 case SPDK_BDEV_IO_TYPE_COMPARE: 3208 return spdk_nvme_ns_supports_compare(ns); 3209 3210 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3211 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3212 3213 case SPDK_BDEV_IO_TYPE_UNMAP: 3214 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3215 return cdata->oncs.dsm; 3216 3217 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3218 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3219 return cdata->oncs.write_zeroes; 3220 3221 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3222 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3223 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3224 return true; 3225 } 3226 return false; 3227 3228 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3229 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3230 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3231 3232 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3233 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3234 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3235 3236 case SPDK_BDEV_IO_TYPE_COPY: 3237 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3238 return cdata->oncs.copy; 3239 3240 default: 3241 return false; 3242 } 3243 } 3244 3245 static int 3246 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3247 { 3248 struct nvme_qpair *nvme_qpair; 3249 struct spdk_io_channel *pg_ch; 3250 int rc; 3251 3252 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3253 if (!nvme_qpair) { 3254 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3255 return -1; 3256 } 3257 3258 TAILQ_INIT(&nvme_qpair->io_path_list); 3259 3260 nvme_qpair->ctrlr = nvme_ctrlr; 3261 nvme_qpair->ctrlr_ch = ctrlr_ch; 3262 3263 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3264 if (!pg_ch) { 3265 free(nvme_qpair); 3266 return -1; 3267 } 3268 3269 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3270 3271 #ifdef SPDK_CONFIG_VTUNE 3272 nvme_qpair->group->collect_spin_stat = true; 3273 #else 3274 nvme_qpair->group->collect_spin_stat = false; 3275 #endif 3276 3277 if (!nvme_ctrlr->disabled) { 3278 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3279 * be created when it's enabled. 3280 */ 3281 rc = bdev_nvme_create_qpair(nvme_qpair); 3282 if (rc != 0) { 3283 /* nvme_ctrlr can't create IO qpair if connection is down. 3284 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3285 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3286 * submitted IO will be queued until IO qpair is successfully created. 3287 * 3288 * Hence, if both are satisfied, ignore the failure. 
3289 */ 3290 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3291 spdk_put_io_channel(pg_ch); 3292 free(nvme_qpair); 3293 return rc; 3294 } 3295 } 3296 } 3297 3298 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3299 3300 ctrlr_ch->qpair = nvme_qpair; 3301 3302 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3303 nvme_qpair->ctrlr->ref++; 3304 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3305 3306 return 0; 3307 } 3308 3309 static int 3310 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3311 { 3312 struct nvme_ctrlr *nvme_ctrlr = io_device; 3313 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3314 3315 TAILQ_INIT(&ctrlr_ch->pending_resets); 3316 3317 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3318 } 3319 3320 static void 3321 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3322 { 3323 struct nvme_io_path *io_path, *next; 3324 3325 assert(nvme_qpair->group != NULL); 3326 3327 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3328 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3329 nvme_io_path_free(io_path); 3330 } 3331 3332 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3333 3334 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3335 3336 nvme_ctrlr_release(nvme_qpair->ctrlr); 3337 3338 free(nvme_qpair); 3339 } 3340 3341 static void 3342 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3343 { 3344 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3345 struct nvme_qpair *nvme_qpair; 3346 3347 nvme_qpair = ctrlr_ch->qpair; 3348 assert(nvme_qpair != NULL); 3349 3350 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3351 3352 if (nvme_qpair->qpair != NULL) { 3353 if (ctrlr_ch->reset_iter == NULL) { 3354 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3355 } else { 3356 /* Skip current ctrlr_channel in a full reset sequence because 3357 * it is being deleted now. The qpair is already being disconnected. 3358 * We do not have to restart disconnecting it. 3359 */ 3360 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3361 } 3362 3363 /* We cannot release a reference to the poll group now. 3364 * The qpair may be disconnected asynchronously later. 3365 * We need to poll it until it is actually disconnected. 3366 * Just detach the qpair from the deleting ctrlr_channel. 
3367 */ 3368 nvme_qpair->ctrlr_ch = NULL; 3369 } else { 3370 assert(ctrlr_ch->reset_iter == NULL); 3371 3372 nvme_qpair_delete(nvme_qpair); 3373 } 3374 } 3375 3376 static inline struct spdk_io_channel * 3377 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3378 { 3379 if (spdk_unlikely(!group->accel_channel)) { 3380 group->accel_channel = spdk_accel_get_io_channel(); 3381 if (!group->accel_channel) { 3382 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3383 group); 3384 return NULL; 3385 } 3386 } 3387 3388 return group->accel_channel; 3389 } 3390 3391 static void 3392 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3393 uint32_t iov_cnt, uint32_t seed, 3394 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3395 { 3396 struct spdk_io_channel *accel_ch; 3397 struct nvme_poll_group *group = ctx; 3398 int rc; 3399 3400 assert(cb_fn != NULL); 3401 3402 accel_ch = bdev_nvme_get_accel_channel(group); 3403 if (spdk_unlikely(accel_ch == NULL)) { 3404 cb_fn(cb_arg, -ENOMEM); 3405 return; 3406 } 3407 3408 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3409 if (rc) { 3410 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3411 if (rc == -ENOMEM || rc == -EINVAL) { 3412 cb_fn(cb_arg, rc); 3413 } 3414 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3415 } 3416 } 3417 3418 static void 3419 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3420 { 3421 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3422 } 3423 3424 static void 3425 bdev_nvme_abort_sequence(void *seq) 3426 { 3427 spdk_accel_sequence_abort(seq); 3428 } 3429 3430 static void 3431 bdev_nvme_reverse_sequence(void *seq) 3432 { 3433 spdk_accel_sequence_reverse(seq); 3434 } 3435 3436 static int 3437 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3438 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3439 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3440 { 3441 struct spdk_io_channel *ch; 3442 struct nvme_poll_group *group = ctx; 3443 3444 ch = bdev_nvme_get_accel_channel(group); 3445 if (spdk_unlikely(ch == NULL)) { 3446 return -ENOMEM; 3447 } 3448 3449 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3450 domain, domain_ctx, seed, cb_fn, cb_arg); 3451 } 3452 3453 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3454 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3455 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3456 .append_crc32c = bdev_nvme_append_crc32c, 3457 .finish_sequence = bdev_nvme_finish_sequence, 3458 .reverse_sequence = bdev_nvme_reverse_sequence, 3459 .abort_sequence = bdev_nvme_abort_sequence, 3460 }; 3461 3462 static int 3463 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3464 { 3465 struct nvme_poll_group *group = ctx_buf; 3466 3467 TAILQ_INIT(&group->qpair_list); 3468 3469 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3470 if (group->group == NULL) { 3471 return -1; 3472 } 3473 3474 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3475 3476 if (group->poller == NULL) { 3477 spdk_nvme_poll_group_destroy(group->group); 3478 return -1; 3479 } 3480 3481 return 0; 3482 } 3483 3484 static void 3485 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3486 { 3487 struct 
nvme_poll_group *group = ctx_buf; 3488 3489 assert(TAILQ_EMPTY(&group->qpair_list)); 3490 3491 if (group->accel_channel) { 3492 spdk_put_io_channel(group->accel_channel); 3493 } 3494 3495 spdk_poller_unregister(&group->poller); 3496 if (spdk_nvme_poll_group_destroy(group->group)) { 3497 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3498 assert(false); 3499 } 3500 } 3501 3502 static struct spdk_io_channel * 3503 bdev_nvme_get_io_channel(void *ctx) 3504 { 3505 struct nvme_bdev *nvme_bdev = ctx; 3506 3507 return spdk_get_io_channel(nvme_bdev); 3508 } 3509 3510 static void * 3511 bdev_nvme_get_module_ctx(void *ctx) 3512 { 3513 struct nvme_bdev *nvme_bdev = ctx; 3514 struct nvme_ns *nvme_ns; 3515 3516 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3517 return NULL; 3518 } 3519 3520 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3521 if (!nvme_ns) { 3522 return NULL; 3523 } 3524 3525 return nvme_ns->ns; 3526 } 3527 3528 static const char * 3529 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3530 { 3531 switch (ana_state) { 3532 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3533 return "optimized"; 3534 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3535 return "non_optimized"; 3536 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3537 return "inaccessible"; 3538 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3539 return "persistent_loss"; 3540 case SPDK_NVME_ANA_CHANGE_STATE: 3541 return "change"; 3542 default: 3543 return NULL; 3544 } 3545 } 3546 3547 static int 3548 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3549 { 3550 struct spdk_memory_domain **_domains = NULL; 3551 struct nvme_bdev *nbdev = ctx; 3552 struct nvme_ns *nvme_ns; 3553 int i = 0, _array_size = array_size; 3554 int rc = 0; 3555 3556 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3557 if (domains && array_size >= i) { 3558 _domains = &domains[i]; 3559 } else { 3560 _domains = NULL; 3561 } 3562 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3563 if (rc > 0) { 3564 i += rc; 3565 if (_array_size >= rc) { 3566 _array_size -= rc; 3567 } else { 3568 _array_size = 0; 3569 } 3570 } else if (rc < 0) { 3571 return rc; 3572 } 3573 } 3574 3575 return i; 3576 } 3577 3578 static const char * 3579 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3580 { 3581 if (nvme_ctrlr->destruct) { 3582 return "deleting"; 3583 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3584 return "failed"; 3585 } else if (nvme_ctrlr->resetting) { 3586 return "resetting"; 3587 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3588 return "reconnect_is_delayed"; 3589 } else if (nvme_ctrlr->disabled) { 3590 return "disabled"; 3591 } else { 3592 return "enabled"; 3593 } 3594 } 3595 3596 void 3597 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3598 { 3599 struct spdk_nvme_transport_id *trid; 3600 const struct spdk_nvme_ctrlr_opts *opts; 3601 const struct spdk_nvme_ctrlr_data *cdata; 3602 struct nvme_path_id *path_id; 3603 3604 spdk_json_write_object_begin(w); 3605 3606 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3607 3608 #ifdef SPDK_CONFIG_NVME_CUSE 3609 size_t cuse_name_size = 128; 3610 char cuse_name[cuse_name_size]; 3611 3612 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3613 if (rc == 0) { 3614 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3615 } 3616 #endif 3617 trid = &nvme_ctrlr->active_path_id->trid; 3618 
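/* Dump the currently active transport ID first; any remaining path IDs are reported below as "alternate_trids". */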
spdk_json_write_named_object_begin(w, "trid"); 3619 nvme_bdev_dump_trid_json(trid, w); 3620 spdk_json_write_object_end(w); 3621 3622 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3623 if (path_id != NULL) { 3624 spdk_json_write_named_array_begin(w, "alternate_trids"); 3625 do { 3626 trid = &path_id->trid; 3627 spdk_json_write_object_begin(w); 3628 nvme_bdev_dump_trid_json(trid, w); 3629 spdk_json_write_object_end(w); 3630 3631 path_id = TAILQ_NEXT(path_id, link); 3632 } while (path_id != NULL); 3633 spdk_json_write_array_end(w); 3634 } 3635 3636 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3637 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3638 3639 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3640 spdk_json_write_named_object_begin(w, "host"); 3641 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3642 spdk_json_write_named_string(w, "addr", opts->src_addr); 3643 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3644 spdk_json_write_object_end(w); 3645 3646 spdk_json_write_object_end(w); 3647 } 3648 3649 static void 3650 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3651 struct nvme_ns *nvme_ns) 3652 { 3653 struct spdk_nvme_ns *ns; 3654 struct spdk_nvme_ctrlr *ctrlr; 3655 const struct spdk_nvme_ctrlr_data *cdata; 3656 const struct spdk_nvme_transport_id *trid; 3657 union spdk_nvme_vs_register vs; 3658 const struct spdk_nvme_ns_data *nsdata; 3659 char buf[128]; 3660 3661 ns = nvme_ns->ns; 3662 if (ns == NULL) { 3663 return; 3664 } 3665 3666 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3667 3668 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3669 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3670 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3671 3672 spdk_json_write_object_begin(w); 3673 3674 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3675 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3676 } 3677 3678 spdk_json_write_named_object_begin(w, "trid"); 3679 3680 nvme_bdev_dump_trid_json(trid, w); 3681 3682 spdk_json_write_object_end(w); 3683 3684 #ifdef SPDK_CONFIG_NVME_CUSE 3685 size_t cuse_name_size = 128; 3686 char cuse_name[cuse_name_size]; 3687 3688 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3689 cuse_name, &cuse_name_size); 3690 if (rc == 0) { 3691 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3692 } 3693 #endif 3694 3695 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3696 3697 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3698 3699 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3700 3701 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3702 spdk_str_trim(buf); 3703 spdk_json_write_named_string(w, "model_number", buf); 3704 3705 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3706 spdk_str_trim(buf); 3707 spdk_json_write_named_string(w, "serial_number", buf); 3708 3709 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3710 spdk_str_trim(buf); 3711 spdk_json_write_named_string(w, "firmware_revision", buf); 3712 3713 if (cdata->subnqn[0] != '\0') { 3714 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3715 } 3716 3717 spdk_json_write_named_object_begin(w, "oacs"); 3718 3719 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3720 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3721 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3722 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3723 3724 spdk_json_write_object_end(w); 3725 3726 
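/* Multi-path related capabilities reported in the controller's CMIC field. */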
spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3727 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3728 3729 spdk_json_write_object_end(w); 3730 3731 spdk_json_write_named_object_begin(w, "vs"); 3732 3733 spdk_json_write_name(w, "nvme_version"); 3734 if (vs.bits.ter) { 3735 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3736 } else { 3737 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3738 } 3739 3740 spdk_json_write_object_end(w); 3741 3742 nsdata = spdk_nvme_ns_get_data(ns); 3743 3744 spdk_json_write_named_object_begin(w, "ns_data"); 3745 3746 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3747 3748 if (cdata->cmic.ana_reporting) { 3749 spdk_json_write_named_string(w, "ana_state", 3750 _nvme_ana_state_str(nvme_ns->ana_state)); 3751 } 3752 3753 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3754 3755 spdk_json_write_object_end(w); 3756 3757 if (cdata->oacs.security) { 3758 spdk_json_write_named_object_begin(w, "security"); 3759 3760 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3761 3762 spdk_json_write_object_end(w); 3763 } 3764 3765 spdk_json_write_object_end(w); 3766 } 3767 3768 static const char * 3769 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3770 { 3771 switch (nbdev->mp_policy) { 3772 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3773 return "active_passive"; 3774 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3775 return "active_active"; 3776 default: 3777 assert(false); 3778 return "invalid"; 3779 } 3780 } 3781 3782 static const char * 3783 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 3784 { 3785 switch (nbdev->mp_selector) { 3786 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 3787 return "round_robin"; 3788 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 3789 return "queue_depth"; 3790 default: 3791 assert(false); 3792 return "invalid"; 3793 } 3794 } 3795 3796 static int 3797 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3798 { 3799 struct nvme_bdev *nvme_bdev = ctx; 3800 struct nvme_ns *nvme_ns; 3801 3802 pthread_mutex_lock(&nvme_bdev->mutex); 3803 spdk_json_write_named_array_begin(w, "nvme"); 3804 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3805 nvme_namespace_info_json(w, nvme_ns); 3806 } 3807 spdk_json_write_array_end(w); 3808 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3809 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 3810 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 3811 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 3812 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 3813 } 3814 } 3815 pthread_mutex_unlock(&nvme_bdev->mutex); 3816 3817 return 0; 3818 } 3819 3820 static void 3821 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3822 { 3823 /* No config per bdev needed */ 3824 } 3825 3826 static uint64_t 3827 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3828 { 3829 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3830 struct nvme_io_path *io_path; 3831 struct nvme_poll_group *group; 3832 uint64_t spin_time = 0; 3833 3834 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3835 group = io_path->qpair->group; 3836 3837 if (!group || !group->collect_spin_stat) { 3838 continue; 3839 } 3840 3841 if (group->end_ticks != 0) { 3842 group->spin_ticks += (group->end_ticks - group->start_ticks); 
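/* Fold the finished busy interval into spin_ticks and clear end_ticks so it is not counted again. */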
3843 group->end_ticks = 0; 3844 } 3845 3846 spin_time += group->spin_ticks; 3847 group->start_ticks = 0; 3848 group->spin_ticks = 0; 3849 } 3850 3851 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3852 } 3853 3854 static void 3855 bdev_nvme_reset_device_stat(void *ctx) 3856 { 3857 struct nvme_bdev *nbdev = ctx; 3858 3859 if (nbdev->err_stat != NULL) { 3860 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3861 } 3862 } 3863 3864 /* JSON string should be lowercases and underscore delimited string. */ 3865 static void 3866 bdev_nvme_format_nvme_status(char *dst, const char *src) 3867 { 3868 char tmp[256]; 3869 3870 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3871 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3872 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3873 spdk_strlwr(dst); 3874 } 3875 3876 static void 3877 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3878 { 3879 struct nvme_bdev *nbdev = ctx; 3880 struct spdk_nvme_status status = {}; 3881 uint16_t sct, sc; 3882 char status_json[256]; 3883 const char *status_str; 3884 3885 if (nbdev->err_stat == NULL) { 3886 return; 3887 } 3888 3889 spdk_json_write_named_object_begin(w, "nvme_error"); 3890 3891 spdk_json_write_named_object_begin(w, "status_type"); 3892 for (sct = 0; sct < 8; sct++) { 3893 if (nbdev->err_stat->status_type[sct] == 0) { 3894 continue; 3895 } 3896 status.sct = sct; 3897 3898 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3899 assert(status_str != NULL); 3900 bdev_nvme_format_nvme_status(status_json, status_str); 3901 3902 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3903 } 3904 spdk_json_write_object_end(w); 3905 3906 spdk_json_write_named_object_begin(w, "status_code"); 3907 for (sct = 0; sct < 4; sct++) { 3908 status.sct = sct; 3909 for (sc = 0; sc < 256; sc++) { 3910 if (nbdev->err_stat->status[sct][sc] == 0) { 3911 continue; 3912 } 3913 status.sc = sc; 3914 3915 status_str = spdk_nvme_cpl_get_status_string(&status); 3916 assert(status_str != NULL); 3917 bdev_nvme_format_nvme_status(status_json, status_str); 3918 3919 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3920 } 3921 } 3922 spdk_json_write_object_end(w); 3923 3924 spdk_json_write_object_end(w); 3925 } 3926 3927 static bool 3928 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3929 { 3930 struct nvme_bdev *nbdev = ctx; 3931 struct spdk_nvme_ctrlr *ctrlr; 3932 3933 if (!g_opts.allow_accel_sequence) { 3934 return false; 3935 } 3936 3937 switch (type) { 3938 case SPDK_BDEV_IO_TYPE_WRITE: 3939 case SPDK_BDEV_IO_TYPE_READ: 3940 break; 3941 default: 3942 return false; 3943 } 3944 3945 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3946 assert(ctrlr != NULL); 3947 3948 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3949 } 3950 3951 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3952 .destruct = bdev_nvme_destruct, 3953 .submit_request = bdev_nvme_submit_request, 3954 .io_type_supported = bdev_nvme_io_type_supported, 3955 .get_io_channel = bdev_nvme_get_io_channel, 3956 .dump_info_json = bdev_nvme_dump_info_json, 3957 .write_config_json = bdev_nvme_write_config_json, 3958 .get_spin_time = bdev_nvme_get_spin_time, 3959 .get_module_ctx = bdev_nvme_get_module_ctx, 3960 .get_memory_domains = bdev_nvme_get_memory_domains, 3961 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3962 .reset_device_stat = bdev_nvme_reset_device_stat, 3963 .dump_device_stat_json = 
bdev_nvme_dump_device_stat_json, 3964 }; 3965 3966 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3967 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3968 3969 static int 3970 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3971 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3972 { 3973 struct spdk_nvme_ana_group_descriptor *copied_desc; 3974 uint8_t *orig_desc; 3975 uint32_t i, desc_size, copy_len; 3976 int rc = 0; 3977 3978 if (nvme_ctrlr->ana_log_page == NULL) { 3979 return -EINVAL; 3980 } 3981 3982 copied_desc = nvme_ctrlr->copied_ana_desc; 3983 3984 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3985 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3986 3987 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3988 memcpy(copied_desc, orig_desc, copy_len); 3989 3990 rc = cb_fn(copied_desc, cb_arg); 3991 if (rc != 0) { 3992 break; 3993 } 3994 3995 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3996 copied_desc->num_of_nsid * sizeof(uint32_t); 3997 orig_desc += desc_size; 3998 copy_len -= desc_size; 3999 } 4000 4001 return rc; 4002 } 4003 4004 static int 4005 nvme_ns_ana_transition_timedout(void *ctx) 4006 { 4007 struct nvme_ns *nvme_ns = ctx; 4008 4009 spdk_poller_unregister(&nvme_ns->anatt_timer); 4010 nvme_ns->ana_transition_timedout = true; 4011 4012 return SPDK_POLLER_BUSY; 4013 } 4014 4015 static void 4016 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4017 const struct spdk_nvme_ana_group_descriptor *desc) 4018 { 4019 const struct spdk_nvme_ctrlr_data *cdata; 4020 4021 nvme_ns->ana_group_id = desc->ana_group_id; 4022 nvme_ns->ana_state = desc->ana_state; 4023 nvme_ns->ana_state_updating = false; 4024 4025 switch (nvme_ns->ana_state) { 4026 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4027 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4028 nvme_ns->ana_transition_timedout = false; 4029 spdk_poller_unregister(&nvme_ns->anatt_timer); 4030 break; 4031 4032 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4033 case SPDK_NVME_ANA_CHANGE_STATE: 4034 if (nvme_ns->anatt_timer != NULL) { 4035 break; 4036 } 4037 4038 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4039 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4040 nvme_ns, 4041 cdata->anatt * SPDK_SEC_TO_USEC); 4042 break; 4043 default: 4044 break; 4045 } 4046 } 4047 4048 static int 4049 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4050 { 4051 struct nvme_ns *nvme_ns = cb_arg; 4052 uint32_t i; 4053 4054 assert(nvme_ns->ns != NULL); 4055 4056 for (i = 0; i < desc->num_of_nsid; i++) { 4057 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4058 continue; 4059 } 4060 4061 _nvme_ns_set_ana_state(nvme_ns, desc); 4062 return 1; 4063 } 4064 4065 return 0; 4066 } 4067 4068 static int 4069 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4070 { 4071 int rc = 0; 4072 struct spdk_uuid new_uuid, namespace_uuid; 4073 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4074 /* This namespace UUID was generated using uuid_generate() method. 
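 * It is used as the fixed namespace argument for spdk_uuid_generate_sha1() below, so a given serial number and NSID pair always maps to the same generated bdev UUID.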
*/ 4075 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4076 int size; 4077 4078 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4079 4080 spdk_uuid_set_null(&new_uuid); 4081 spdk_uuid_set_null(&namespace_uuid); 4082 4083 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4084 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4085 return -EINVAL; 4086 } 4087 4088 spdk_uuid_parse(&namespace_uuid, namespace_str); 4089 4090 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4091 if (rc == 0) { 4092 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4093 } 4094 4095 return rc; 4096 } 4097 4098 static int 4099 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4100 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4101 uint32_t prchk_flags, void *ctx) 4102 { 4103 const struct spdk_uuid *uuid; 4104 const uint8_t *nguid; 4105 const struct spdk_nvme_ctrlr_data *cdata; 4106 const struct spdk_nvme_ns_data *nsdata; 4107 const struct spdk_nvme_ctrlr_opts *opts; 4108 enum spdk_nvme_csi csi; 4109 uint32_t atomic_bs, phys_bs, bs; 4110 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4111 int rc; 4112 4113 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4114 csi = spdk_nvme_ns_get_csi(ns); 4115 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4116 4117 switch (csi) { 4118 case SPDK_NVME_CSI_NVM: 4119 disk->product_name = "NVMe disk"; 4120 break; 4121 case SPDK_NVME_CSI_ZNS: 4122 disk->product_name = "NVMe ZNS disk"; 4123 disk->zoned = true; 4124 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4125 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4126 spdk_nvme_ns_get_extended_sector_size(ns); 4127 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4128 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4129 break; 4130 default: 4131 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4132 return -ENOTSUP; 4133 } 4134 4135 nguid = spdk_nvme_ns_get_nguid(ns); 4136 if (!nguid) { 4137 uuid = spdk_nvme_ns_get_uuid(ns); 4138 if (uuid) { 4139 disk->uuid = *uuid; 4140 } else if (g_opts.generate_uuids) { 4141 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4142 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4143 if (rc < 0) { 4144 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4145 return rc; 4146 } 4147 } 4148 } else { 4149 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4150 } 4151 4152 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4153 if (!disk->name) { 4154 return -ENOMEM; 4155 } 4156 4157 disk->write_cache = 0; 4158 if (cdata->vwc.present) { 4159 /* Enable if the Volatile Write Cache exists */ 4160 disk->write_cache = 1; 4161 } 4162 if (cdata->oncs.write_zeroes) { 4163 disk->max_write_zeroes = UINT16_MAX + 1; 4164 } 4165 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4166 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4167 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4168 disk->ctratt.raw = cdata->ctratt.raw; 4169 /* NVMe driver will split one request into multiple requests 4170 * based on MDTS and stripe boundary, the bdev layer will use 4171 * max_segment_size and max_num_segments to split one big IO 4172 * into multiple requests, then small request can't run out 4173 * of NVMe internal requests data structure. 
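 * For example, with io_queue_requests of 1024 the cap below becomes 512 segments per I/O, so a single split I/O cannot consume every request object on the queue pair.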
4174 */ 4175 if (opts && opts->io_queue_requests) { 4176 disk->max_num_segments = opts->io_queue_requests / 2; 4177 } 4178 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4179 /* The nvme driver will try to split I/O that have too many 4180 * SGEs, but it doesn't work if that last SGE doesn't end on 4181 * an aggregate total that is block aligned. The bdev layer has 4182 * a more robust splitting framework, so use that instead for 4183 * this case. (See issue #3269.) 4184 */ 4185 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4186 4187 if (disk->max_num_segments == 0) { 4188 disk->max_num_segments = max_sges; 4189 } else { 4190 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4191 } 4192 } 4193 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4194 4195 nsdata = spdk_nvme_ns_get_data(ns); 4196 bs = spdk_nvme_ns_get_sector_size(ns); 4197 atomic_bs = bs; 4198 phys_bs = bs; 4199 if (nsdata->nabo == 0) { 4200 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4201 atomic_bs = bs * (1 + nsdata->nawupf); 4202 } else { 4203 atomic_bs = bs * (1 + cdata->awupf); 4204 } 4205 } 4206 if (nsdata->nsfeat.optperf) { 4207 phys_bs = bs * (1 + nsdata->npwg); 4208 } 4209 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4210 4211 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4212 if (disk->md_len != 0) { 4213 disk->md_interleave = nsdata->flbas.extended; 4214 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4215 if (disk->dif_type != SPDK_DIF_DISABLE) { 4216 disk->dif_is_head_of_md = nsdata->dps.md_start; 4217 disk->dif_check_flags = prchk_flags; 4218 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4219 } 4220 } 4221 4222 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4223 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4224 disk->acwu = 0; 4225 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4226 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4227 } else { 4228 disk->acwu = cdata->acwu + 1; /* 0-based */ 4229 } 4230 4231 if (cdata->oncs.copy) { 4232 /* For now bdev interface allows only single segment copy */ 4233 disk->max_copy = nsdata->mssrl; 4234 } 4235 4236 disk->ctxt = ctx; 4237 disk->fn_table = &nvmelib_fn_table; 4238 disk->module = &nvme_if; 4239 4240 return 0; 4241 } 4242 4243 static struct nvme_bdev * 4244 nvme_bdev_alloc(void) 4245 { 4246 struct nvme_bdev *bdev; 4247 int rc; 4248 4249 bdev = calloc(1, sizeof(*bdev)); 4250 if (!bdev) { 4251 SPDK_ERRLOG("bdev calloc() failed\n"); 4252 return NULL; 4253 } 4254 4255 if (g_opts.nvme_error_stat) { 4256 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4257 if (!bdev->err_stat) { 4258 SPDK_ERRLOG("err_stat calloc() failed\n"); 4259 free(bdev); 4260 return NULL; 4261 } 4262 } 4263 4264 rc = pthread_mutex_init(&bdev->mutex, NULL); 4265 if (rc != 0) { 4266 free(bdev->err_stat); 4267 free(bdev); 4268 return NULL; 4269 } 4270 4271 bdev->ref = 1; 4272 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4273 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4274 bdev->rr_min_io = UINT32_MAX; 4275 TAILQ_INIT(&bdev->nvme_ns_list); 4276 4277 return bdev; 4278 } 4279 4280 static int 4281 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4282 { 4283 struct nvme_bdev *bdev; 4284 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4285 int rc; 4286 4287 bdev = nvme_bdev_alloc(); 4288 if (bdev == NULL) { 4289 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4290 return -ENOMEM; 4291 } 4292 
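/* Remember whether an Opal device was constructed for this controller; nvme_namespace_info_json() reports it as the "opal" flag when the controller advertises security send/receive support. */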
4293 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4294 4295 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4296 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4297 if (rc != 0) { 4298 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4299 nvme_bdev_free(bdev); 4300 return rc; 4301 } 4302 4303 spdk_io_device_register(bdev, 4304 bdev_nvme_create_bdev_channel_cb, 4305 bdev_nvme_destroy_bdev_channel_cb, 4306 sizeof(struct nvme_bdev_channel), 4307 bdev->disk.name); 4308 4309 nvme_ns->bdev = bdev; 4310 bdev->nsid = nvme_ns->id; 4311 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4312 4313 bdev->nbdev_ctrlr = nbdev_ctrlr; 4314 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4315 4316 rc = spdk_bdev_register(&bdev->disk); 4317 if (rc != 0) { 4318 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4319 spdk_io_device_unregister(bdev, NULL); 4320 nvme_ns->bdev = NULL; 4321 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4322 nvme_bdev_free(bdev); 4323 return rc; 4324 } 4325 4326 return 0; 4327 } 4328 4329 static bool 4330 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4331 { 4332 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4333 const struct spdk_uuid *uuid1, *uuid2; 4334 4335 nsdata1 = spdk_nvme_ns_get_data(ns1); 4336 nsdata2 = spdk_nvme_ns_get_data(ns2); 4337 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4338 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4339 4340 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4341 nsdata1->eui64 == nsdata2->eui64 && 4342 ((uuid1 == NULL && uuid2 == NULL) || 4343 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4344 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4345 } 4346 4347 static bool 4348 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4349 struct spdk_nvme_ctrlr_opts *opts) 4350 { 4351 struct nvme_probe_skip_entry *entry; 4352 4353 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4354 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4355 return false; 4356 } 4357 } 4358 4359 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4360 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4361 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4362 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4363 opts->disable_read_ana_log_page = true; 4364 4365 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4366 4367 return true; 4368 } 4369 4370 static void 4371 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4372 { 4373 struct nvme_ctrlr *nvme_ctrlr = ctx; 4374 4375 if (spdk_nvme_cpl_is_error(cpl)) { 4376 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4377 cpl->status.sct); 4378 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4379 } else if (cpl->cdw0 & 0x1) { 4380 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4381 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4382 } 4383 } 4384 4385 static void 4386 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4387 struct spdk_nvme_qpair *qpair, uint16_t cid) 4388 { 4389 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4390 union spdk_nvme_csts_register csts; 4391 int rc; 4392 4393 assert(nvme_ctrlr->ctrlr == ctrlr); 4394 4395 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4396 4397 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4398 * queue. 
(Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4399 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4400 * completion recursively. 4401 */ 4402 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4403 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4404 if (csts.bits.cfs) { 4405 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4406 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4407 return; 4408 } 4409 } 4410 4411 switch (g_opts.action_on_timeout) { 4412 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4413 if (qpair) { 4414 /* Don't send abort to ctrlr when ctrlr is not available. */ 4415 pthread_mutex_lock(&nvme_ctrlr->mutex); 4416 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4417 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4418 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4419 return; 4420 } 4421 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4422 4423 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4424 nvme_abort_cpl, nvme_ctrlr); 4425 if (rc == 0) { 4426 return; 4427 } 4428 4429 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4430 } 4431 4432 /* FALLTHROUGH */ 4433 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4434 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4435 break; 4436 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4437 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4438 break; 4439 default: 4440 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4441 break; 4442 } 4443 } 4444 4445 static struct nvme_ns * 4446 nvme_ns_alloc(void) 4447 { 4448 struct nvme_ns *nvme_ns; 4449 4450 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4451 if (nvme_ns == NULL) { 4452 return NULL; 4453 } 4454 4455 if (g_opts.io_path_stat) { 4456 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4457 if (nvme_ns->stat == NULL) { 4458 free(nvme_ns); 4459 return NULL; 4460 } 4461 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4462 } 4463 4464 return nvme_ns; 4465 } 4466 4467 static void 4468 nvme_ns_free(struct nvme_ns *nvme_ns) 4469 { 4470 free(nvme_ns->stat); 4471 free(nvme_ns); 4472 } 4473 4474 static void 4475 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4476 { 4477 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4478 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4479 4480 if (rc == 0) { 4481 nvme_ns->probe_ctx = NULL; 4482 pthread_mutex_lock(&nvme_ctrlr->mutex); 4483 nvme_ctrlr->ref++; 4484 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4485 } else { 4486 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4487 nvme_ns_free(nvme_ns); 4488 } 4489 4490 if (ctx) { 4491 ctx->populates_in_progress--; 4492 if (ctx->populates_in_progress == 0) { 4493 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4494 } 4495 } 4496 } 4497 4498 static void 4499 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4500 { 4501 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4502 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4503 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4504 int rc; 4505 4506 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4507 if (rc != 0) { 4508 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4509 } 4510 4511 spdk_for_each_channel_continue(i, rc); 4512 } 4513 4514 static void 4515 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4516 { 4517 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4518 struct nvme_bdev_channel 
*nbdev_ch = spdk_io_channel_get_ctx(_ch); 4519 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4520 struct nvme_io_path *io_path; 4521 4522 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4523 if (io_path != NULL) { 4524 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4525 } 4526 4527 spdk_for_each_channel_continue(i, 0); 4528 } 4529 4530 static void 4531 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4532 { 4533 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4534 4535 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4536 } 4537 4538 static void 4539 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4540 { 4541 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4542 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4543 4544 if (status == 0) { 4545 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4546 } else { 4547 /* Delete the added io_paths and fail populating the namespace. */ 4548 spdk_for_each_channel(bdev, 4549 bdev_nvme_delete_io_path, 4550 nvme_ns, 4551 bdev_nvme_add_io_path_failed); 4552 } 4553 } 4554 4555 static int 4556 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4557 { 4558 struct nvme_ns *tmp_ns; 4559 const struct spdk_nvme_ns_data *nsdata; 4560 4561 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4562 if (!nsdata->nmic.can_share) { 4563 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4564 return -EINVAL; 4565 } 4566 4567 pthread_mutex_lock(&bdev->mutex); 4568 4569 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4570 assert(tmp_ns != NULL); 4571 4572 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4573 pthread_mutex_unlock(&bdev->mutex); 4574 SPDK_ERRLOG("Namespaces are not identical.\n"); 4575 return -EINVAL; 4576 } 4577 4578 bdev->ref++; 4579 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4580 nvme_ns->bdev = bdev; 4581 4582 pthread_mutex_unlock(&bdev->mutex); 4583 4584 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
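 * spdk_for_each_channel() runs bdev_nvme_add_io_path() on every existing channel of this bdev on its owning thread; bdev_nvme_add_io_path_done() then completes the namespace population, or rolls the added paths back if any channel failed.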
*/ 4585 spdk_for_each_channel(bdev, 4586 bdev_nvme_add_io_path, 4587 nvme_ns, 4588 bdev_nvme_add_io_path_done); 4589 4590 return 0; 4591 } 4592 4593 static void 4594 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4595 { 4596 struct spdk_nvme_ns *ns; 4597 struct nvme_bdev *bdev; 4598 int rc = 0; 4599 4600 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4601 if (!ns) { 4602 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4603 rc = -EINVAL; 4604 goto done; 4605 } 4606 4607 nvme_ns->ns = ns; 4608 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4609 4610 if (nvme_ctrlr->ana_log_page != NULL) { 4611 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4612 } 4613 4614 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4615 if (bdev == NULL) { 4616 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4617 } else { 4618 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4619 if (rc == 0) { 4620 return; 4621 } 4622 } 4623 done: 4624 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4625 } 4626 4627 static void 4628 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4629 { 4630 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4631 4632 assert(nvme_ctrlr != NULL); 4633 4634 pthread_mutex_lock(&nvme_ctrlr->mutex); 4635 4636 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4637 4638 if (nvme_ns->bdev != NULL) { 4639 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4640 return; 4641 } 4642 4643 nvme_ns_free(nvme_ns); 4644 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4645 4646 nvme_ctrlr_release(nvme_ctrlr); 4647 } 4648 4649 static void 4650 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4651 { 4652 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4653 4654 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4655 } 4656 4657 static void 4658 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4659 { 4660 struct nvme_bdev *bdev; 4661 4662 spdk_poller_unregister(&nvme_ns->anatt_timer); 4663 4664 bdev = nvme_ns->bdev; 4665 if (bdev != NULL) { 4666 pthread_mutex_lock(&bdev->mutex); 4667 4668 assert(bdev->ref > 0); 4669 bdev->ref--; 4670 if (bdev->ref == 0) { 4671 pthread_mutex_unlock(&bdev->mutex); 4672 4673 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4674 } else { 4675 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4676 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4677 * and clear nvme_ns->bdev here. 4678 */ 4679 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4680 nvme_ns->bdev = NULL; 4681 4682 pthread_mutex_unlock(&bdev->mutex); 4683 4684 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4685 * we call depopulate_namespace_done() to avoid use-after-free. 4686 */ 4687 spdk_for_each_channel(bdev, 4688 bdev_nvme_delete_io_path, 4689 nvme_ns, 4690 bdev_nvme_delete_io_path_done); 4691 return; 4692 } 4693 } 4694 4695 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4696 } 4697 4698 static void 4699 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4700 struct nvme_async_probe_ctx *ctx) 4701 { 4702 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4703 struct nvme_ns *nvme_ns, *next; 4704 struct spdk_nvme_ns *ns; 4705 struct nvme_bdev *bdev; 4706 uint32_t nsid; 4707 int rc; 4708 uint64_t num_sectors; 4709 4710 if (ctx) { 4711 /* Initialize this count to 1 to handle the populate functions 4712 * calling nvme_ctrlr_populate_namespace_done() immediately. 
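 * The matching decrement happens after the scan below, so nvme_ctrlr_populate_namespaces_done() runs exactly once: either there, when every populate completed synchronously, or from the last asynchronous completion.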
4713 */ 4714 ctx->populates_in_progress = 1; 4715 } 4716 4717 /* First loop over our existing namespaces and see if they have been 4718 * removed. */ 4719 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4720 while (nvme_ns != NULL) { 4721 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4722 4723 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4724 /* NS is still there or added again. Its attributes may have changed. */ 4725 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4726 if (nvme_ns->ns != ns) { 4727 assert(nvme_ns->ns == NULL); 4728 nvme_ns->ns = ns; 4729 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4730 } 4731 4732 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4733 bdev = nvme_ns->bdev; 4734 assert(bdev != NULL); 4735 if (bdev->disk.blockcnt != num_sectors) { 4736 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4737 nvme_ns->id, 4738 bdev->disk.name, 4739 bdev->disk.blockcnt, 4740 num_sectors); 4741 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4742 if (rc != 0) { 4743 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4744 bdev->disk.name, rc); 4745 } 4746 } 4747 } else { 4748 /* Namespace was removed */ 4749 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4750 } 4751 4752 nvme_ns = next; 4753 } 4754 4755 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4756 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4757 while (nsid != 0) { 4758 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4759 4760 if (nvme_ns == NULL) { 4761 /* Found a new one */ 4762 nvme_ns = nvme_ns_alloc(); 4763 if (nvme_ns == NULL) { 4764 SPDK_ERRLOG("Failed to allocate namespace\n"); 4765 /* This just fails to attach the namespace. It may work on a future attempt. Advance to the next active namespace first so this loop still terminates. */ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4766 continue; 4767 } 4768 4769 nvme_ns->id = nsid; 4770 nvme_ns->ctrlr = nvme_ctrlr; 4771 4772 nvme_ns->bdev = NULL; 4773 4774 if (ctx) { 4775 ctx->populates_in_progress++; 4776 } 4777 nvme_ns->probe_ctx = ctx; 4778 4779 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4780 4781 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4782 } 4783 4784 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4785 } 4786 4787 if (ctx) { 4788 /* Decrement this count now that the loop is over to account 4789 * for the one we started with. If the count is then 0, we 4790 * know any populate_namespace functions completed immediately, 4791 * so we'll kick the callback here.
4792 */ 4793 ctx->populates_in_progress--; 4794 if (ctx->populates_in_progress == 0) { 4795 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4796 } 4797 } 4798 4799 } 4800 4801 static void 4802 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4803 { 4804 struct nvme_ns *nvme_ns, *tmp; 4805 4806 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4807 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4808 } 4809 } 4810 4811 static uint32_t 4812 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4813 { 4814 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4815 const struct spdk_nvme_ctrlr_data *cdata; 4816 uint32_t nsid, ns_count = 0; 4817 4818 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4819 4820 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4821 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4822 ns_count++; 4823 } 4824 4825 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4826 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4827 sizeof(uint32_t); 4828 } 4829 4830 static int 4831 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4832 void *cb_arg) 4833 { 4834 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4835 struct nvme_ns *nvme_ns; 4836 uint32_t i, nsid; 4837 4838 for (i = 0; i < desc->num_of_nsid; i++) { 4839 nsid = desc->nsid[i]; 4840 if (nsid == 0) { 4841 continue; 4842 } 4843 4844 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4845 4846 if (nvme_ns == NULL) { 4847 /* Target told us that an inactive namespace had an ANA change */ 4848 continue; 4849 } 4850 4851 _nvme_ns_set_ana_state(nvme_ns, desc); 4852 } 4853 4854 return 0; 4855 } 4856 4857 static void 4858 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4859 { 4860 struct nvme_ns *nvme_ns; 4861 4862 spdk_free(nvme_ctrlr->ana_log_page); 4863 nvme_ctrlr->ana_log_page = NULL; 4864 4865 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4866 nvme_ns != NULL; 4867 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4868 nvme_ns->ana_state_updating = false; 4869 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4870 } 4871 } 4872 4873 static void 4874 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4875 { 4876 struct nvme_ctrlr *nvme_ctrlr = ctx; 4877 4878 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4879 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4880 nvme_ctrlr); 4881 } else { 4882 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4883 } 4884 4885 pthread_mutex_lock(&nvme_ctrlr->mutex); 4886 4887 assert(nvme_ctrlr->ana_log_page_updating == true); 4888 nvme_ctrlr->ana_log_page_updating = false; 4889 4890 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4891 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4892 4893 nvme_ctrlr_unregister(nvme_ctrlr); 4894 } else { 4895 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4896 4897 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4898 } 4899 } 4900 4901 static int 4902 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4903 { 4904 uint32_t ana_log_page_size; 4905 int rc; 4906 4907 if (nvme_ctrlr->ana_log_page == NULL) { 4908 return -EINVAL; 4909 } 4910 4911 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4912 4913 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4914 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4915 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4916 return -EINVAL; 4917 } 4918 
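/* Allow only one outstanding ANA log page read per controller, and skip the read entirely while the controller is not available (e.g. during a reset). */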
4919 pthread_mutex_lock(&nvme_ctrlr->mutex); 4920 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4921 nvme_ctrlr->ana_log_page_updating) { 4922 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4923 return -EBUSY; 4924 } 4925 4926 nvme_ctrlr->ana_log_page_updating = true; 4927 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4928 4929 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4930 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4931 SPDK_NVME_GLOBAL_NS_TAG, 4932 nvme_ctrlr->ana_log_page, 4933 ana_log_page_size, 0, 4934 nvme_ctrlr_read_ana_log_page_done, 4935 nvme_ctrlr); 4936 if (rc != 0) { 4937 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4938 } 4939 4940 return rc; 4941 } 4942 4943 static void 4944 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4945 { 4946 } 4947 4948 struct bdev_nvme_set_preferred_path_ctx { 4949 struct spdk_bdev_desc *desc; 4950 struct nvme_ns *nvme_ns; 4951 bdev_nvme_set_preferred_path_cb cb_fn; 4952 void *cb_arg; 4953 }; 4954 4955 static void 4956 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4957 { 4958 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4959 4960 assert(ctx != NULL); 4961 assert(ctx->desc != NULL); 4962 assert(ctx->cb_fn != NULL); 4963 4964 spdk_bdev_close(ctx->desc); 4965 4966 ctx->cb_fn(ctx->cb_arg, status); 4967 4968 free(ctx); 4969 } 4970 4971 static void 4972 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4973 { 4974 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4975 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4976 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4977 struct nvme_io_path *io_path, *prev; 4978 4979 prev = NULL; 4980 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4981 if (io_path->nvme_ns == ctx->nvme_ns) { 4982 break; 4983 } 4984 prev = io_path; 4985 } 4986 4987 if (io_path != NULL) { 4988 if (prev != NULL) { 4989 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4990 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4991 } 4992 4993 /* We can set io_path to nbdev_ch->current_io_path directly here. 4994 * However, it needs to be conditional. To simplify the code, 4995 * just clear nbdev_ch->current_io_path and let find_io_path() 4996 * fill it. 4997 * 4998 * Automatic failback may be disabled. Hence even if the io_path is 4999 * already at the head, clear nbdev_ch->current_io_path. 5000 */ 5001 bdev_nvme_clear_current_io_path(nbdev_ch); 5002 } 5003 5004 spdk_for_each_channel_continue(i, 0); 5005 } 5006 5007 static struct nvme_ns * 5008 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5009 { 5010 struct nvme_ns *nvme_ns, *prev; 5011 const struct spdk_nvme_ctrlr_data *cdata; 5012 5013 prev = NULL; 5014 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5015 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5016 5017 if (cdata->cntlid == cntlid) { 5018 break; 5019 } 5020 prev = nvme_ns; 5021 } 5022 5023 if (nvme_ns != NULL && prev != NULL) { 5024 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5025 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5026 } 5027 5028 return nvme_ns; 5029 } 5030 5031 /* This function supports only multipath mode. There is only a single I/O path 5032 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5033 * head of the I/O path list for each NVMe bdev channel. 
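 * The path selection logic scans io_path_list from the head, so moving the matched path to the front and clearing the cached current_io_path is enough to steer new I/O to it.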
5034 * 5035 * NVMe bdev channel may be acquired after completing this function. move the 5036 * matched namespace to the head of the namespace list for the NVMe bdev too. 5037 */ 5038 void 5039 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5040 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5041 { 5042 struct bdev_nvme_set_preferred_path_ctx *ctx; 5043 struct spdk_bdev *bdev; 5044 struct nvme_bdev *nbdev; 5045 int rc = 0; 5046 5047 assert(cb_fn != NULL); 5048 5049 ctx = calloc(1, sizeof(*ctx)); 5050 if (ctx == NULL) { 5051 SPDK_ERRLOG("Failed to alloc context.\n"); 5052 rc = -ENOMEM; 5053 goto err_alloc; 5054 } 5055 5056 ctx->cb_fn = cb_fn; 5057 ctx->cb_arg = cb_arg; 5058 5059 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5060 if (rc != 0) { 5061 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5062 goto err_open; 5063 } 5064 5065 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5066 5067 if (bdev->module != &nvme_if) { 5068 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5069 rc = -ENODEV; 5070 goto err_bdev; 5071 } 5072 5073 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5074 5075 pthread_mutex_lock(&nbdev->mutex); 5076 5077 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5078 if (ctx->nvme_ns == NULL) { 5079 pthread_mutex_unlock(&nbdev->mutex); 5080 5081 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5082 rc = -ENODEV; 5083 goto err_bdev; 5084 } 5085 5086 pthread_mutex_unlock(&nbdev->mutex); 5087 5088 spdk_for_each_channel(nbdev, 5089 _bdev_nvme_set_preferred_path, 5090 ctx, 5091 bdev_nvme_set_preferred_path_done); 5092 return; 5093 5094 err_bdev: 5095 spdk_bdev_close(ctx->desc); 5096 err_open: 5097 free(ctx); 5098 err_alloc: 5099 cb_fn(cb_arg, rc); 5100 } 5101 5102 struct bdev_nvme_set_multipath_policy_ctx { 5103 struct spdk_bdev_desc *desc; 5104 bdev_nvme_set_multipath_policy_cb cb_fn; 5105 void *cb_arg; 5106 }; 5107 5108 static void 5109 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5110 { 5111 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5112 5113 assert(ctx != NULL); 5114 assert(ctx->desc != NULL); 5115 assert(ctx->cb_fn != NULL); 5116 5117 spdk_bdev_close(ctx->desc); 5118 5119 ctx->cb_fn(ctx->cb_arg, status); 5120 5121 free(ctx); 5122 } 5123 5124 static void 5125 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5126 { 5127 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5128 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5129 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5130 5131 nbdev_ch->mp_policy = nbdev->mp_policy; 5132 nbdev_ch->mp_selector = nbdev->mp_selector; 5133 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5134 bdev_nvme_clear_current_io_path(nbdev_ch); 5135 5136 spdk_for_each_channel_continue(i, 0); 5137 } 5138 5139 void 5140 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5141 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5142 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5143 { 5144 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5145 struct spdk_bdev *bdev; 5146 struct nvme_bdev *nbdev; 5147 int rc; 5148 5149 assert(cb_fn != NULL); 5150 5151 switch (policy) { 5152 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5153 break; 5154 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5155 switch (selector) { 5156 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5157 if (rr_min_io 
== UINT32_MAX) { 5158 rr_min_io = 1; 5159 } else if (rr_min_io == 0) { 5160 rc = -EINVAL; 5161 goto exit; 5162 } 5163 break; 5164 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5165 break; 5166 default: 5167 rc = -EINVAL; 5168 goto exit; 5169 } 5170 break; 5171 default: 5172 rc = -EINVAL; 5173 goto exit; 5174 } 5175 5176 ctx = calloc(1, sizeof(*ctx)); 5177 if (ctx == NULL) { 5178 SPDK_ERRLOG("Failed to alloc context.\n"); 5179 rc = -ENOMEM; 5180 goto exit; 5181 } 5182 5183 ctx->cb_fn = cb_fn; 5184 ctx->cb_arg = cb_arg; 5185 5186 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5187 if (rc != 0) { 5188 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5189 rc = -ENODEV; 5190 goto err_open; 5191 } 5192 5193 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5194 if (bdev->module != &nvme_if) { 5195 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5196 rc = -ENODEV; 5197 goto err_module; 5198 } 5199 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5200 5201 pthread_mutex_lock(&nbdev->mutex); 5202 nbdev->mp_policy = policy; 5203 nbdev->mp_selector = selector; 5204 nbdev->rr_min_io = rr_min_io; 5205 pthread_mutex_unlock(&nbdev->mutex); 5206 5207 spdk_for_each_channel(nbdev, 5208 _bdev_nvme_set_multipath_policy, 5209 ctx, 5210 bdev_nvme_set_multipath_policy_done); 5211 return; 5212 5213 err_module: 5214 spdk_bdev_close(ctx->desc); 5215 err_open: 5216 free(ctx); 5217 exit: 5218 cb_fn(cb_arg, rc); 5219 } 5220 5221 static void 5222 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5223 { 5224 struct nvme_ctrlr *nvme_ctrlr = arg; 5225 union spdk_nvme_async_event_completion event; 5226 5227 if (spdk_nvme_cpl_is_error(cpl)) { 5228 SPDK_WARNLOG("AER request execute failed\n"); 5229 return; 5230 } 5231 5232 event.raw = cpl->cdw0; 5233 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5234 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5235 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5236 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5237 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5238 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5239 } 5240 } 5241 5242 static void 5243 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5244 { 5245 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5246 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5247 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5248 free(ctx); 5249 } 5250 5251 static void 5252 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5253 { 5254 if (ctx->cb_fn) { 5255 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5256 } 5257 5258 ctx->namespaces_populated = true; 5259 if (ctx->probe_done) { 5260 /* The probe was already completed, so we need to free the context 5261 * here. This can happen for cases like OCSSD, where we need to 5262 * send additional commands to the SSD after attach. 
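 * Otherwise the namespaces_populated flag set below signals the probe completion path that it is now responsible for freeing the context.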
5263 */ 5264 free_nvme_async_probe_ctx(ctx); 5265 } 5266 } 5267 5268 static void 5269 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5270 struct nvme_async_probe_ctx *ctx) 5271 { 5272 spdk_io_device_register(nvme_ctrlr, 5273 bdev_nvme_create_ctrlr_channel_cb, 5274 bdev_nvme_destroy_ctrlr_channel_cb, 5275 sizeof(struct nvme_ctrlr_channel), 5276 nvme_ctrlr->nbdev_ctrlr->name); 5277 5278 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5279 } 5280 5281 static void 5282 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5283 { 5284 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5285 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5286 5287 nvme_ctrlr->probe_ctx = NULL; 5288 5289 if (spdk_nvme_cpl_is_error(cpl)) { 5290 nvme_ctrlr_delete(nvme_ctrlr); 5291 5292 if (ctx != NULL) { 5293 ctx->reported_bdevs = 0; 5294 populate_namespaces_cb(ctx, -1); 5295 } 5296 return; 5297 } 5298 5299 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5300 } 5301 5302 static int 5303 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5304 struct nvme_async_probe_ctx *ctx) 5305 { 5306 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5307 const struct spdk_nvme_ctrlr_data *cdata; 5308 uint32_t ana_log_page_size; 5309 5310 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5311 5312 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5313 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5314 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5315 sizeof(uint32_t); 5316 5317 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5318 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5319 if (nvme_ctrlr->ana_log_page == NULL) { 5320 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5321 return -ENXIO; 5322 } 5323 5324 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5325 * Hence copy each descriptor to a temporary area when parsing it. 5326 * 5327 * Allocate a buffer whose size is as large as ANA log page buffer because 5328 * we do not know the size of a descriptor until actually reading it. 5329 */ 5330 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5331 if (nvme_ctrlr->copied_ana_desc == NULL) { 5332 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5333 return -ENOMEM; 5334 } 5335 5336 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5337 5338 nvme_ctrlr->probe_ctx = ctx; 5339 5340 /* Then, set the read size only to include the current active namespaces. */ 5341 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5342 5343 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5344 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5345 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5346 return -EINVAL; 5347 } 5348 5349 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5350 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5351 SPDK_NVME_GLOBAL_NS_TAG, 5352 nvme_ctrlr->ana_log_page, 5353 ana_log_page_size, 0, 5354 nvme_ctrlr_init_ana_log_page_done, 5355 nvme_ctrlr); 5356 } 5357 5358 /* hostnqn and subnqn were already verified before attaching a controller. 5359 * Hence check only the multipath capability and cntlid here. 
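 * A controller that reports CMIC.multi_ctrlr == 0, or whose cntlid collides with a controller already attached to the same nvme_bdev_ctrlr, is rejected.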
*/ 5361 static bool 5362 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5363 { 5364 struct nvme_ctrlr *tmp; 5365 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5366 5367 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5368 5369 if (!cdata->cmic.multi_ctrlr) { 5370 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5371 return false; 5372 } 5373 5374 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5375 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5376 5377 if (!tmp_cdata->cmic.multi_ctrlr) { 5378 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid); 5379 return false; 5380 } 5381 if (cdata->cntlid == tmp_cdata->cntlid) { 5382 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 5383 return false; 5384 } 5385 } 5386 5387 return true; 5388 } 5389 5390 static int 5391 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5392 { 5393 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5394 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5395 int rc = 0; 5396 5397 pthread_mutex_lock(&g_bdev_nvme_mutex); 5398 5399 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5400 if (nbdev_ctrlr != NULL) { 5401 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5402 rc = -EINVAL; 5403 goto exit; 5404 } 5405 } else { 5406 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5407 if (nbdev_ctrlr == NULL) { 5408 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5409 rc = -ENOMEM; 5410 goto exit; 5411 } 5412 nbdev_ctrlr->name = strdup(name); 5413 if (nbdev_ctrlr->name == NULL) { 5414 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); rc = -ENOMEM; 5415 free(nbdev_ctrlr); 5416 goto exit; 5417 } 5418 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5419 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5420 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5421 } 5422 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5423 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5424 exit: 5425 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5426 return rc; 5427 } 5428 5429 static int 5430 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5431 const char *name, 5432 const struct spdk_nvme_transport_id *trid, 5433 struct nvme_async_probe_ctx *ctx) 5434 { 5435 struct nvme_ctrlr *nvme_ctrlr; 5436 struct nvme_path_id *path_id; 5437 const struct spdk_nvme_ctrlr_data *cdata; 5438 int rc; 5439 5440 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5441 if (nvme_ctrlr == NULL) { 5442 SPDK_ERRLOG("Failed to allocate device struct\n"); 5443 return -ENOMEM; 5444 } 5445 5446 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5447 if (rc != 0) { 5448 free(nvme_ctrlr); 5449 return rc; 5450 } 5451 5452 TAILQ_INIT(&nvme_ctrlr->trids); 5453 RB_INIT(&nvme_ctrlr->namespaces); 5454 5455 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5456 if (ctx != NULL) { 5457 if (ctx->drv_opts.tls_psk != NULL) { 5458 nvme_ctrlr->psk = spdk_keyring_get_key( 5459 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5460 if (nvme_ctrlr->psk == NULL) { 5461 /* Could only happen if the key was removed in the meantime */ 5462 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5463 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5464 rc = -ENOKEY; 5465 goto err; 5466 } 5467 } 5468 5469 if (ctx->drv_opts.dhchap_key != NULL) { 5470 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5471 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5472 if (nvme_ctrlr->dhchap_key == NULL) { 5473 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5474
spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5475 rc = -ENOKEY; 5476 goto err; 5477 } 5478 } 5479 5480 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5481 nvme_ctrlr->dhchap_ctrlr_key = 5482 spdk_keyring_get_key( 5483 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5484 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5485 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5486 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5487 rc = -ENOKEY; 5488 goto err; 5489 } 5490 } 5491 } 5492 5493 path_id = calloc(1, sizeof(*path_id)); 5494 if (path_id == NULL) { 5495 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5496 rc = -ENOMEM; 5497 goto err; 5498 } 5499 5500 path_id->trid = *trid; 5501 if (ctx != NULL) { 5502 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5503 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5504 } 5505 nvme_ctrlr->active_path_id = path_id; 5506 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5507 5508 nvme_ctrlr->thread = spdk_get_thread(); 5509 nvme_ctrlr->ctrlr = ctrlr; 5510 nvme_ctrlr->ref = 1; 5511 5512 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5513 SPDK_ERRLOG("OCSSDs are not supported"); 5514 rc = -ENOTSUP; 5515 goto err; 5516 } 5517 5518 if (ctx != NULL) { 5519 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5520 } else { 5521 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5522 } 5523 5524 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5525 g_opts.nvme_adminq_poll_period_us); 5526 5527 if (g_opts.timeout_us > 0) { 5528 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5529 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5530 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5531 g_opts.timeout_us : g_opts.timeout_admin_us; 5532 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5533 adm_timeout_us, timeout_cb, nvme_ctrlr); 5534 } 5535 5536 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5537 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5538 5539 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5540 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5541 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5542 } 5543 5544 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5545 if (rc != 0) { 5546 goto err; 5547 } 5548 5549 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5550 5551 if (cdata->cmic.ana_reporting) { 5552 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5553 if (rc == 0) { 5554 return 0; 5555 } 5556 } else { 5557 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5558 return 0; 5559 } 5560 5561 err: 5562 nvme_ctrlr_delete(nvme_ctrlr); 5563 return rc; 5564 } 5565 5566 void 5567 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5568 { 5569 opts->prchk_flags = 0; 5570 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5571 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5572 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5573 } 5574 5575 static void 5576 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5577 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5578 { 5579 char *name; 5580 5581 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5582 if (!name) { 5583 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5584 return; 5585 } 5586 5587 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5588 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5589 } else { 5590 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5591 } 5592 5593 free(name); 5594 } 5595 5596 static void 5597 _nvme_ctrlr_destruct(void *ctx) 5598 { 5599 struct nvme_ctrlr *nvme_ctrlr = ctx; 5600 5601 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5602 nvme_ctrlr_release(nvme_ctrlr); 5603 } 5604 5605 static int 5606 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5607 { 5608 struct nvme_probe_skip_entry *entry; 5609 5610 /* The controller's destruction was already started */ 5611 if (nvme_ctrlr->destruct) { 5612 return -EALREADY; 5613 } 5614 5615 if (!hotplug && 5616 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5617 entry = calloc(1, sizeof(*entry)); 5618 if (!entry) { 5619 return -ENOMEM; 5620 } 5621 entry->trid = nvme_ctrlr->active_path_id->trid; 5622 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5623 } 5624 5625 nvme_ctrlr->destruct = true; 5626 return 0; 5627 } 5628 5629 static int 5630 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5631 { 5632 int rc; 5633 5634 pthread_mutex_lock(&nvme_ctrlr->mutex); 5635 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5636 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5637 5638 if (rc == 0) { 5639 _nvme_ctrlr_destruct(nvme_ctrlr); 5640 } else if (rc == -EALREADY) { 5641 rc = 0; 5642 } 5643 5644 return rc; 5645 } 5646 5647 static void 5648 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5649 { 5650 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5651 5652 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5653 } 5654 5655 static int 5656 bdev_nvme_hotplug_probe(void *arg) 5657 { 5658 if (g_hotplug_probe_ctx == NULL) { 5659 spdk_poller_unregister(&g_hotplug_probe_poller); 5660 return 
SPDK_POLLER_IDLE; 5661 } 5662 5663 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5664 g_hotplug_probe_ctx = NULL; 5665 spdk_poller_unregister(&g_hotplug_probe_poller); 5666 } 5667 5668 return SPDK_POLLER_BUSY; 5669 } 5670 5671 static int 5672 bdev_nvme_hotplug(void *arg) 5673 { 5674 struct spdk_nvme_transport_id trid_pcie; 5675 5676 if (g_hotplug_probe_ctx) { 5677 return SPDK_POLLER_BUSY; 5678 } 5679 5680 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5681 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5682 5683 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5684 hotplug_probe_cb, attach_cb, NULL); 5685 5686 if (g_hotplug_probe_ctx) { 5687 assert(g_hotplug_probe_poller == NULL); 5688 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5689 } 5690 5691 return SPDK_POLLER_BUSY; 5692 } 5693 5694 void 5695 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5696 { 5697 *opts = g_opts; 5698 } 5699 5700 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5701 uint32_t reconnect_delay_sec, 5702 uint32_t fast_io_fail_timeout_sec); 5703 5704 static int 5705 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5706 { 5707 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5708 /* Can't set timeout_admin_us without also setting timeout_us */ 5709 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5710 return -EINVAL; 5711 } 5712 5713 if (opts->bdev_retry_count < -1) { 5714 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5715 return -EINVAL; 5716 } 5717 5718 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5719 opts->reconnect_delay_sec, 5720 opts->fast_io_fail_timeout_sec)) { 5721 return -EINVAL; 5722 } 5723 5724 return 0; 5725 } 5726 5727 int 5728 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5729 { 5730 int ret; 5731 5732 ret = bdev_nvme_validate_opts(opts); 5733 if (ret) { 5734 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5735 return ret; 5736 } 5737 5738 if (g_bdev_nvme_init_thread != NULL) { 5739 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5740 return -EPERM; 5741 } 5742 } 5743 5744 if (opts->rdma_srq_size != 0 || 5745 opts->rdma_max_cq_size != 0 || 5746 opts->rdma_cm_event_timeout_ms != 0) { 5747 struct spdk_nvme_transport_opts drv_opts; 5748 5749 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5750 if (opts->rdma_srq_size != 0) { 5751 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5752 } 5753 if (opts->rdma_max_cq_size != 0) { 5754 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5755 } 5756 if (opts->rdma_cm_event_timeout_ms != 0) { 5757 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5758 } 5759 5760 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5761 if (ret) { 5762 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5763 return ret; 5764 } 5765 } 5766 5767 g_opts = *opts; 5768 5769 return 0; 5770 } 5771 5772 struct set_nvme_hotplug_ctx { 5773 uint64_t period_us; 5774 bool enabled; 5775 spdk_msg_fn fn; 5776 void *fn_ctx; 5777 }; 5778 5779 static void 5780 set_nvme_hotplug_period_cb(void *_ctx) 5781 { 5782 struct set_nvme_hotplug_ctx *ctx = _ctx; 5783 5784 spdk_poller_unregister(&g_hotplug_poller); 5785 if (ctx->enabled) { 5786 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5787 } 5788 5789 g_nvme_hotplug_poll_period_us = ctx->period_us; 5790 
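/* Keep the globals in sync with the poller state so later queries report the settings actually in effect. */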
g_nvme_hotplug_enabled = ctx->enabled; 5791 if (ctx->fn) { 5792 ctx->fn(ctx->fn_ctx); 5793 } 5794 5795 free(ctx); 5796 } 5797 5798 int 5799 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5800 { 5801 struct set_nvme_hotplug_ctx *ctx; 5802 5803 if (enabled == true && !spdk_process_is_primary()) { 5804 return -EPERM; 5805 } 5806 5807 ctx = calloc(1, sizeof(*ctx)); 5808 if (ctx == NULL) { 5809 return -ENOMEM; 5810 } 5811 5812 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5813 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5814 ctx->enabled = enabled; 5815 ctx->fn = cb; 5816 ctx->fn_ctx = cb_ctx; 5817 5818 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5819 return 0; 5820 } 5821 5822 static void 5823 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5824 struct nvme_async_probe_ctx *ctx) 5825 { 5826 struct nvme_ns *nvme_ns; 5827 struct nvme_bdev *nvme_bdev; 5828 size_t j; 5829 5830 assert(nvme_ctrlr != NULL); 5831 5832 if (ctx->names == NULL) { 5833 ctx->reported_bdevs = 0; 5834 populate_namespaces_cb(ctx, 0); 5835 return; 5836 } 5837 5838 /* 5839 * Report the new bdevs that were created in this call. 5840 * There can be more than one bdev per NVMe controller. 5841 */ 5842 j = 0; 5843 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5844 while (nvme_ns != NULL) { 5845 nvme_bdev = nvme_ns->bdev; 5846 if (j < ctx->max_bdevs) { 5847 ctx->names[j] = nvme_bdev->disk.name; 5848 j++; 5849 } else { 5850 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5851 ctx->max_bdevs); 5852 ctx->reported_bdevs = 0; 5853 populate_namespaces_cb(ctx, -ERANGE); 5854 return; 5855 } 5856 5857 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5858 } 5859 5860 ctx->reported_bdevs = j; 5861 populate_namespaces_cb(ctx, 0); 5862 } 5863 5864 static int 5865 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5866 struct spdk_nvme_ctrlr *new_ctrlr, 5867 struct spdk_nvme_transport_id *trid) 5868 { 5869 struct nvme_path_id *tmp_trid; 5870 5871 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5872 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5873 return -ENOTSUP; 5874 } 5875 5876 /* Currently we only support failover to the same transport type. */ 5877 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5878 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5879 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5880 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5881 return -EINVAL; 5882 } 5883 5884 5885 /* Currently we only support failover to the same NQN. */ 5886 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5887 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5888 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5889 return -EINVAL; 5890 } 5891 5892 /* Skip all the other checks if we've already registered this path. 
*/ 5893 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5894 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5895 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5896 trid->subnqn); 5897 return -EALREADY; 5898 } 5899 } 5900 5901 return 0; 5902 } 5903 5904 static int 5905 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5906 struct spdk_nvme_ctrlr *new_ctrlr) 5907 { 5908 struct nvme_ns *nvme_ns; 5909 struct spdk_nvme_ns *new_ns; 5910 5911 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5912 while (nvme_ns != NULL) { 5913 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5914 assert(new_ns != NULL); 5915 5916 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5917 return -EINVAL; 5918 } 5919 5920 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5921 } 5922 5923 return 0; 5924 } 5925 5926 static int 5927 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5928 struct spdk_nvme_transport_id *trid) 5929 { 5930 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5931 5932 new_trid = calloc(1, sizeof(*new_trid)); 5933 if (new_trid == NULL) { 5934 return -ENOMEM; 5935 } 5936 new_trid->trid = *trid; 5937 5938 active_id = nvme_ctrlr->active_path_id; 5939 assert(active_id != NULL); 5940 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5941 5942 /* Skip the active trid so that it is not replaced until it has failed. */ 5943 tmp_trid = TAILQ_NEXT(active_id, link); 5944 if (tmp_trid == NULL) { 5945 goto add_tail; 5946 } 5947 5948 /* A trid is considered failed if its last failed time is non-zero. 5949 * Insert the new alternate trid before any failed trid. 5950 */ 5951 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5952 if (tmp_trid->last_failed_tsc != 0) { 5953 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5954 return 0; 5955 } 5956 } 5957 5958 add_tail: 5959 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5960 return 0; 5961 } 5962 5963 /* This handles the case where a secondary path is added to an existing 5964 * nvme_ctrlr for failover. After checking that it can access the same 5965 * namespaces as the primary path, it is disconnected until failover occurs.
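 * The new trid is inserted ahead of any previously failed trid so that failover will
 * prefer it, and the temporarily connected controller handle is detached again because
 * only the active path keeps a live connection.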
5966 */ 5967 static int 5968 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5969 struct spdk_nvme_ctrlr *new_ctrlr, 5970 struct spdk_nvme_transport_id *trid) 5971 { 5972 int rc; 5973 5974 assert(nvme_ctrlr != NULL); 5975 5976 pthread_mutex_lock(&nvme_ctrlr->mutex); 5977 5978 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5979 if (rc != 0) { 5980 goto exit; 5981 } 5982 5983 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5984 if (rc != 0) { 5985 goto exit; 5986 } 5987 5988 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5989 5990 exit: 5991 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5992 5993 spdk_nvme_detach(new_ctrlr); 5994 5995 return rc; 5996 } 5997 5998 static void 5999 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6000 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6001 { 6002 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6003 struct nvme_async_probe_ctx *ctx; 6004 int rc; 6005 6006 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6007 ctx->ctrlr_attached = true; 6008 6009 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6010 if (rc != 0) { 6011 ctx->reported_bdevs = 0; 6012 populate_namespaces_cb(ctx, rc); 6013 } 6014 } 6015 6016 static void 6017 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6018 struct spdk_nvme_ctrlr *ctrlr, 6019 const struct spdk_nvme_ctrlr_opts *opts) 6020 { 6021 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6022 struct nvme_ctrlr *nvme_ctrlr; 6023 struct nvme_async_probe_ctx *ctx; 6024 int rc; 6025 6026 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6027 ctx->ctrlr_attached = true; 6028 6029 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6030 if (nvme_ctrlr) { 6031 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6032 } else { 6033 rc = -ENODEV; 6034 } 6035 6036 ctx->reported_bdevs = 0; 6037 populate_namespaces_cb(ctx, rc); 6038 } 6039 6040 static int 6041 bdev_nvme_async_poll(void *arg) 6042 { 6043 struct nvme_async_probe_ctx *ctx = arg; 6044 int rc; 6045 6046 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6047 if (spdk_unlikely(rc != -EAGAIN)) { 6048 ctx->probe_done = true; 6049 spdk_poller_unregister(&ctx->poller); 6050 if (!ctx->ctrlr_attached) { 6051 /* The probe is done, but no controller was attached. 6052 * That means we had a failure, so report -EIO back to 6053 * the caller (usually the RPC). populate_namespaces_cb() 6054 * will take care of freeing the nvme_async_probe_ctx. 6055 */ 6056 ctx->reported_bdevs = 0; 6057 populate_namespaces_cb(ctx, -EIO); 6058 } else if (ctx->namespaces_populated) { 6059 /* The namespaces for the attached controller were all 6060 * populated and the response was already sent to the 6061 * caller (usually the RPC). So free the context here. 
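 * If namespace population has not finished yet, the context is left alive and is
 * freed on that path instead, once population completes and sees the probe is done.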
6062 */ 6063 free_nvme_async_probe_ctx(ctx); 6064 } 6065 } 6066 6067 return SPDK_POLLER_BUSY; 6068 } 6069 6070 static bool 6071 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6072 uint32_t reconnect_delay_sec, 6073 uint32_t fast_io_fail_timeout_sec) 6074 { 6075 if (ctrlr_loss_timeout_sec < -1) { 6076 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6077 return false; 6078 } else if (ctrlr_loss_timeout_sec == -1) { 6079 if (reconnect_delay_sec == 0) { 6080 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6081 return false; 6082 } else if (fast_io_fail_timeout_sec != 0 && 6083 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6084 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6085 return false; 6086 } 6087 } else if (ctrlr_loss_timeout_sec != 0) { 6088 if (reconnect_delay_sec == 0) { 6089 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6090 return false; 6091 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6092 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6093 return false; 6094 } else if (fast_io_fail_timeout_sec != 0) { 6095 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6096 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6097 return false; 6098 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6099 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6100 return false; 6101 } 6102 } 6103 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6104 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6105 return false; 6106 } 6107 6108 return true; 6109 } 6110 6111 static int 6112 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 6113 { 6114 FILE *psk_file; 6115 struct stat statbuf; 6116 int rc; 6117 #define TCP_PSK_INVALID_PERMISSIONS 0177 6118 6119 if (stat(fname, &statbuf) != 0) { 6120 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 6121 return -EACCES; 6122 } 6123 6124 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 6125 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 6126 return -EPERM; 6127 } 6128 if ((size_t)statbuf.st_size >= bufsz) { 6129 SPDK_ERRLOG("Invalid PSK: too long\n"); 6130 return -EINVAL; 6131 } 6132 psk_file = fopen(fname, "r"); 6133 if (psk_file == NULL) { 6134 SPDK_ERRLOG("Could not open PSK file\n"); 6135 return -EINVAL; 6136 } 6137 6138 memset(buf, 0, bufsz); 6139 rc = fread(buf, 1, statbuf.st_size, psk_file); 6140 if (rc != statbuf.st_size) { 6141 SPDK_ERRLOG("Failed to read PSK\n"); 6142 fclose(psk_file); 6143 return -EINVAL; 6144 } 6145 6146 fclose(psk_file); 6147 return 0; 6148 } 6149 6150 int 6151 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6152 const char *base_name, 6153 const char **names, 6154 uint32_t count, 6155 spdk_bdev_create_nvme_fn cb_fn, 6156 void *cb_ctx, 6157 struct spdk_nvme_ctrlr_opts *drv_opts, 6158 struct nvme_ctrlr_opts *bdev_opts, 6159 bool multipath) 6160 { 6161 struct nvme_probe_skip_entry *entry, *tmp; 6162 struct nvme_async_probe_ctx *ctx; 6163 spdk_nvme_attach_cb attach_cb; 6164 int rc, len; 6165 6166 /* TODO expand this check to include both the host and target TRIDs. 6167 * Only if both are the same should we fail.
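 * (nvme_ctrlr_get() below currently matches on the target trid plus hostnqn only.)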
6168 */ 6169 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6170 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6171 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6172 return -EEXIST; 6173 } 6174 6175 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6176 6177 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6178 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6179 return -EINVAL; 6180 } 6181 6182 if (bdev_opts != NULL && 6183 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6184 bdev_opts->reconnect_delay_sec, 6185 bdev_opts->fast_io_fail_timeout_sec)) { 6186 return -EINVAL; 6187 } 6188 6189 ctx = calloc(1, sizeof(*ctx)); 6190 if (!ctx) { 6191 return -ENOMEM; 6192 } 6193 ctx->base_name = base_name; 6194 ctx->names = names; 6195 ctx->max_bdevs = count; 6196 ctx->cb_fn = cb_fn; 6197 ctx->cb_ctx = cb_ctx; 6198 ctx->trid = *trid; 6199 6200 if (bdev_opts) { 6201 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6202 } else { 6203 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6204 } 6205 6206 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6207 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6208 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6209 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6210 free(entry); 6211 break; 6212 } 6213 } 6214 } 6215 6216 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6217 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6218 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6219 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6220 ctx->drv_opts.disable_read_ana_log_page = true; 6221 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6222 6223 if (ctx->bdev_opts.psk[0] != '\0') { 6224 /* Try to use the keyring first */ 6225 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6226 if (ctx->drv_opts.tls_psk == NULL) { 6227 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6228 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6229 if (rc != 0) { 6230 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6231 free_nvme_async_probe_ctx(ctx); 6232 return rc; 6233 } 6234 } 6235 } 6236 6237 if (ctx->bdev_opts.dhchap_key != NULL) { 6238 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6239 if (ctx->drv_opts.dhchap_key == NULL) { 6240 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6241 ctx->bdev_opts.dhchap_key); 6242 free_nvme_async_probe_ctx(ctx); 6243 return -ENOKEY; 6244 } 6245 6246 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6247 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6248 } 6249 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6250 ctx->drv_opts.dhchap_ctrlr_key = 6251 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6252 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6253 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6254 ctx->bdev_opts.dhchap_ctrlr_key); 6255 free_nvme_async_probe_ctx(ctx); 6256 return -ENOKEY; 6257 } 6258 } 6259 6260 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6261 attach_cb = connect_attach_cb; 6262 } else { 6263 attach_cb = connect_set_failover_cb; 6264 } 6265 6266 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6267 if (ctx->probe_ctx == NULL) { 6268 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6269 
free_nvme_async_probe_ctx(ctx); 6270 return -ENODEV; 6271 } 6272 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6273 6274 return 0; 6275 } 6276 6277 struct bdev_nvme_delete_ctx { 6278 char *name; 6279 struct nvme_path_id path_id; 6280 bdev_nvme_delete_done_fn delete_done; 6281 void *delete_done_ctx; 6282 uint64_t timeout_ticks; 6283 struct spdk_poller *poller; 6284 }; 6285 6286 static void 6287 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6288 { 6289 if (ctx != NULL) { 6290 free(ctx->name); 6291 free(ctx); 6292 } 6293 } 6294 6295 static bool 6296 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6297 { 6298 if (path_id->trid.trtype != 0) { 6299 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6300 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6301 return false; 6302 } 6303 } else { 6304 if (path_id->trid.trtype != p->trid.trtype) { 6305 return false; 6306 } 6307 } 6308 } 6309 6310 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6311 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6312 return false; 6313 } 6314 } 6315 6316 if (path_id->trid.adrfam != 0) { 6317 if (path_id->trid.adrfam != p->trid.adrfam) { 6318 return false; 6319 } 6320 } 6321 6322 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6323 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6324 return false; 6325 } 6326 } 6327 6328 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6329 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6330 return false; 6331 } 6332 } 6333 6334 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6335 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6336 return false; 6337 } 6338 } 6339 6340 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6341 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6342 return false; 6343 } 6344 } 6345 6346 return true; 6347 } 6348 6349 static bool 6350 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6351 { 6352 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6353 struct nvme_ctrlr *ctrlr; 6354 struct nvme_path_id *p; 6355 6356 pthread_mutex_lock(&g_bdev_nvme_mutex); 6357 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6358 if (!nbdev_ctrlr) { 6359 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6360 return false; 6361 } 6362 6363 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6364 pthread_mutex_lock(&ctrlr->mutex); 6365 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6366 if (nvme_path_id_compare(p, path_id)) { 6367 pthread_mutex_unlock(&ctrlr->mutex); 6368 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6369 return true; 6370 } 6371 } 6372 pthread_mutex_unlock(&ctrlr->mutex); 6373 } 6374 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6375 6376 return false; 6377 } 6378 6379 static int 6380 bdev_nvme_delete_complete_poll(void *arg) 6381 { 6382 struct bdev_nvme_delete_ctx *ctx = arg; 6383 int rc = 0; 6384 6385 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6386 if (ctx->timeout_ticks > spdk_get_ticks()) { 6387 return SPDK_POLLER_BUSY; 6388 } 6389 6390 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6391 rc = -ETIMEDOUT; 6392 } 6393 6394 spdk_poller_unregister(&ctx->poller); 6395 6396 ctx->delete_done(ctx->delete_done_ctx, rc); 6397 free_bdev_nvme_delete_ctx(ctx); 6398 6399 return SPDK_POLLER_BUSY; 6400 } 6401 6402 static int 6403 
_bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6404 { 6405 struct nvme_path_id *p, *t; 6406 spdk_msg_fn msg_fn; 6407 int rc = -ENXIO; 6408 6409 pthread_mutex_lock(&nvme_ctrlr->mutex); 6410 6411 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6412 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6413 break; 6414 } 6415 6416 if (!nvme_path_id_compare(p, path_id)) { 6417 continue; 6418 } 6419 6420 /* We are not using the specified path. */ 6421 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6422 free(p); 6423 rc = 0; 6424 } 6425 6426 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6427 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6428 return rc; 6429 } 6430 6431 /* If we made it here, then this path is a match! Now we need to remove it. */ 6432 6433 /* This is the active path in use right now. The active path is always the first in the list. */ 6434 assert(p == nvme_ctrlr->active_path_id); 6435 6436 if (!TAILQ_NEXT(p, link)) { 6437 /* The current path is the only path. */ 6438 msg_fn = _nvme_ctrlr_destruct; 6439 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6440 } else { 6441 /* There is an alternative path. */ 6442 msg_fn = _bdev_nvme_reset_ctrlr; 6443 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6444 } 6445 6446 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6447 6448 if (rc == 0) { 6449 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6450 } else if (rc == -EALREADY) { 6451 rc = 0; 6452 } 6453 6454 return rc; 6455 } 6456 6457 int 6458 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6459 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6460 { 6461 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6462 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6463 struct bdev_nvme_delete_ctx *ctx = NULL; 6464 int rc = -ENXIO, _rc; 6465 6466 if (name == NULL || path_id == NULL) { 6467 rc = -EINVAL; 6468 goto exit; 6469 } 6470 6471 pthread_mutex_lock(&g_bdev_nvme_mutex); 6472 6473 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6474 if (nbdev_ctrlr == NULL) { 6475 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6476 6477 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6478 rc = -ENODEV; 6479 goto exit; 6480 } 6481 6482 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6483 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6484 if (_rc < 0 && _rc != -ENXIO) { 6485 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6486 rc = _rc; 6487 goto exit; 6488 } else if (_rc == 0) { 6489 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6490 * was deleted successfully. To remember the successful deletion, 6491 * overwrite rc only if _rc is zero. 
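 * An _rc of -ENXIO only means this particular nvme_ctrlr had no path matching
 * path_id, so it is skipped here rather than treated as a failure.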
6492 */ 6493 rc = 0; 6494 } 6495 } 6496 6497 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6498 6499 if (rc != 0 || delete_done == NULL) { 6500 goto exit; 6501 } 6502 6503 ctx = calloc(1, sizeof(*ctx)); 6504 if (ctx == NULL) { 6505 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6506 rc = -ENOMEM; 6507 goto exit; 6508 } 6509 6510 ctx->name = strdup(name); 6511 if (ctx->name == NULL) { 6512 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6513 rc = -ENOMEM; 6514 goto exit; 6515 } 6516 6517 ctx->delete_done = delete_done; 6518 ctx->delete_done_ctx = delete_done_ctx; 6519 ctx->path_id = *path_id; 6520 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6521 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6522 if (ctx->poller == NULL) { 6523 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6524 rc = -ENOMEM; 6525 goto exit; 6526 } 6527 6528 exit: 6529 if (rc != 0) { 6530 free_bdev_nvme_delete_ctx(ctx); 6531 } 6532 6533 return rc; 6534 } 6535 6536 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6537 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6538 6539 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6540 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6541 6542 struct discovery_entry_ctx { 6543 char name[128]; 6544 struct spdk_nvme_transport_id trid; 6545 struct spdk_nvme_ctrlr_opts drv_opts; 6546 struct spdk_nvmf_discovery_log_page_entry entry; 6547 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6548 struct discovery_ctx *ctx; 6549 }; 6550 6551 struct discovery_ctx { 6552 char *name; 6553 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6554 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6555 void *cb_ctx; 6556 struct spdk_nvme_probe_ctx *probe_ctx; 6557 struct spdk_nvme_detach_ctx *detach_ctx; 6558 struct spdk_nvme_ctrlr *ctrlr; 6559 struct spdk_nvme_transport_id trid; 6560 struct discovery_entry_ctx *entry_ctx_in_use; 6561 struct spdk_poller *poller; 6562 struct spdk_nvme_ctrlr_opts drv_opts; 6563 struct nvme_ctrlr_opts bdev_opts; 6564 struct spdk_nvmf_discovery_log_page *log_page; 6565 TAILQ_ENTRY(discovery_ctx) tailq; 6566 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6567 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6568 int rc; 6569 bool wait_for_attach; 6570 uint64_t timeout_ticks; 6571 /* Denotes that the discovery service is being started. We're waiting 6572 * for the initial connection to the discovery controller to be 6573 * established and attach discovered NVM ctrlrs. 6574 */ 6575 bool initializing; 6576 /* Denotes if a discovery is currently in progress for this context. 6577 * That includes connecting to newly discovered subsystems. Used to 6578 * ensure we do not start a new discovery until an existing one is 6579 * complete. 6580 */ 6581 bool in_progress; 6582 6583 /* Denotes if another discovery is needed after the one in progress 6584 * completes. Set when we receive an AER completion while a discovery 6585 * is already in progress. 6586 */ 6587 bool pending; 6588 6589 /* Signal to the discovery context poller that it should stop the 6590 * discovery service, including detaching from the current discovery 6591 * controller. 6592 */ 6593 bool stop; 6594 6595 struct spdk_thread *calling_thread; 6596 uint32_t index; 6597 uint32_t attach_in_progress; 6598 char *hostnqn; 6599 6600 /* Denotes if the discovery service was started by the mdns discovery. 
6601 */ 6602 bool from_mdns_discovery_service; 6603 }; 6604 6605 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6606 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6607 6608 static void get_discovery_log_page(struct discovery_ctx *ctx); 6609 6610 static void 6611 free_discovery_ctx(struct discovery_ctx *ctx) 6612 { 6613 free(ctx->log_page); 6614 free(ctx->hostnqn); 6615 free(ctx->name); 6616 free(ctx); 6617 } 6618 6619 static void 6620 discovery_complete(struct discovery_ctx *ctx) 6621 { 6622 ctx->initializing = false; 6623 ctx->in_progress = false; 6624 if (ctx->pending) { 6625 ctx->pending = false; 6626 get_discovery_log_page(ctx); 6627 } 6628 } 6629 6630 static void 6631 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6632 struct spdk_nvmf_discovery_log_page_entry *entry) 6633 { 6634 char *space; 6635 6636 trid->trtype = entry->trtype; 6637 trid->adrfam = entry->adrfam; 6638 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6639 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6640 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6641 * before call to this function trid->subnqn is zeroed out, we need 6642 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6643 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6644 */ 6645 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6646 6647 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6648 * But the log page entries typically pad them with spaces, not zeroes. 6649 * So add a NULL terminator to each of these fields at the appropriate 6650 * location. 6651 */ 6652 space = strchr(trid->traddr, ' '); 6653 if (space) { 6654 *space = 0; 6655 } 6656 space = strchr(trid->trsvcid, ' '); 6657 if (space) { 6658 *space = 0; 6659 } 6660 space = strchr(trid->subnqn, ' '); 6661 if (space) { 6662 *space = 0; 6663 } 6664 } 6665 6666 static void 6667 _stop_discovery(void *_ctx) 6668 { 6669 struct discovery_ctx *ctx = _ctx; 6670 6671 if (ctx->attach_in_progress > 0) { 6672 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6673 return; 6674 } 6675 6676 ctx->stop = true; 6677 6678 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6679 struct discovery_entry_ctx *entry_ctx; 6680 struct nvme_path_id path = {}; 6681 6682 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6683 path.trid = entry_ctx->trid; 6684 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6685 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6686 free(entry_ctx); 6687 } 6688 6689 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6690 struct discovery_entry_ctx *entry_ctx; 6691 6692 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6693 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6694 free(entry_ctx); 6695 } 6696 6697 free(ctx->entry_ctx_in_use); 6698 ctx->entry_ctx_in_use = NULL; 6699 } 6700 6701 static void 6702 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6703 { 6704 ctx->stop_cb_fn = cb_fn; 6705 ctx->cb_ctx = cb_ctx; 6706 6707 if (ctx->attach_in_progress > 0) { 6708 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6709 ctx->attach_in_progress); 6710 } 6711 6712 _stop_discovery(ctx); 6713 } 6714 6715 static void 6716 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6717 { 6718 struct discovery_ctx *d_ctx; 6719 struct nvme_path_id *path_id; 6720 struct spdk_nvme_transport_id 
trid = {}; 6721 struct discovery_entry_ctx *entry_ctx, *tmp; 6722 6723 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6724 6725 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6726 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6727 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6728 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6729 continue; 6730 } 6731 6732 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6733 free(entry_ctx); 6734 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6735 trid.subnqn, trid.traddr, trid.trsvcid); 6736 6737 /* Fail discovery ctrlr to force reattach attempt */ 6738 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6739 } 6740 } 6741 } 6742 6743 static void 6744 discovery_remove_controllers(struct discovery_ctx *ctx) 6745 { 6746 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6747 struct discovery_entry_ctx *entry_ctx, *tmp; 6748 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6749 struct spdk_nvme_transport_id old_trid = {}; 6750 uint64_t numrec, i; 6751 bool found; 6752 6753 numrec = from_le64(&log_page->numrec); 6754 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6755 found = false; 6756 old_entry = &entry_ctx->entry; 6757 build_trid_from_log_page_entry(&old_trid, old_entry); 6758 for (i = 0; i < numrec; i++) { 6759 new_entry = &log_page->entries[i]; 6760 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6761 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6762 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6763 found = true; 6764 break; 6765 } 6766 } 6767 if (!found) { 6768 struct nvme_path_id path = {}; 6769 6770 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6771 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6772 6773 path.trid = entry_ctx->trid; 6774 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6775 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6776 free(entry_ctx); 6777 } 6778 } 6779 free(log_page); 6780 ctx->log_page = NULL; 6781 discovery_complete(ctx); 6782 } 6783 6784 static void 6785 complete_discovery_start(struct discovery_ctx *ctx, int status) 6786 { 6787 ctx->timeout_ticks = 0; 6788 ctx->rc = status; 6789 if (ctx->start_cb_fn) { 6790 ctx->start_cb_fn(ctx->cb_ctx, status); 6791 ctx->start_cb_fn = NULL; 6792 ctx->cb_ctx = NULL; 6793 } 6794 } 6795 6796 static void 6797 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6798 { 6799 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6800 struct discovery_ctx *ctx = entry_ctx->ctx; 6801 6802 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6803 ctx->attach_in_progress--; 6804 if (ctx->attach_in_progress == 0) { 6805 complete_discovery_start(ctx, ctx->rc); 6806 if (ctx->initializing && ctx->rc != 0) { 6807 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6808 stop_discovery(ctx, NULL, ctx->cb_ctx); 6809 } else { 6810 discovery_remove_controllers(ctx); 6811 } 6812 } 6813 } 6814 6815 static struct discovery_entry_ctx * 6816 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6817 { 6818 struct discovery_entry_ctx *new_ctx; 6819 6820 new_ctx = calloc(1, sizeof(*new_ctx)); 6821 if (new_ctx == NULL) { 6822 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6823 return NULL; 6824 } 6825 6826 new_ctx->ctx = ctx; 6827 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6828 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
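/* Every entry created for this discovery context connects with the context's hostnqn,
 * so all controllers attached from the same discovery service share one host identity.
 */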
6829 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6830 return new_ctx; 6831 } 6832 6833 static void 6834 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6835 struct spdk_nvmf_discovery_log_page *log_page) 6836 { 6837 struct discovery_ctx *ctx = cb_arg; 6838 struct discovery_entry_ctx *entry_ctx, *tmp; 6839 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6840 uint64_t numrec, i; 6841 bool found; 6842 6843 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6844 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6845 return; 6846 } 6847 6848 ctx->log_page = log_page; 6849 assert(ctx->attach_in_progress == 0); 6850 numrec = from_le64(&log_page->numrec); 6851 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6852 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6853 free(entry_ctx); 6854 } 6855 for (i = 0; i < numrec; i++) { 6856 found = false; 6857 new_entry = &log_page->entries[i]; 6858 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6859 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6860 struct discovery_entry_ctx *new_ctx; 6861 struct spdk_nvme_transport_id trid = {}; 6862 6863 build_trid_from_log_page_entry(&trid, new_entry); 6864 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6865 if (new_ctx == NULL) { 6866 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6867 break; 6868 } 6869 6870 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6871 continue; 6872 } 6873 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6874 old_entry = &entry_ctx->entry; 6875 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6876 found = true; 6877 break; 6878 } 6879 } 6880 if (!found) { 6881 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6882 struct discovery_ctx *d_ctx; 6883 6884 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6885 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6886 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6887 sizeof(new_entry->subnqn))) { 6888 break; 6889 } 6890 } 6891 if (subnqn_ctx) { 6892 break; 6893 } 6894 } 6895 6896 new_ctx = calloc(1, sizeof(*new_ctx)); 6897 if (new_ctx == NULL) { 6898 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6899 break; 6900 } 6901 6902 new_ctx->ctx = ctx; 6903 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6904 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6905 if (subnqn_ctx) { 6906 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6907 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6908 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6909 new_ctx->name); 6910 } else { 6911 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6912 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6913 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6914 new_ctx->name); 6915 } 6916 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6917 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6918 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6919 discovery_attach_controller_done, new_ctx, 6920 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6921 if (rc == 0) { 6922 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6923 ctx->attach_in_progress++; 6924 } else { 6925 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6926 } 6927 } 6928 } 6929 6930 if (ctx->attach_in_progress == 0) { 6931 discovery_remove_controllers(ctx); 6932 } 6933 } 6934 6935 static void 6936 get_discovery_log_page(struct discovery_ctx *ctx) 6937 { 6938 int rc; 6939 6940 assert(ctx->in_progress == false); 6941 ctx->in_progress = true; 6942 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6943 if (rc != 0) { 6944 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6945 } 6946 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6947 } 6948 6949 static void 6950 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6951 { 6952 struct discovery_ctx *ctx = arg; 6953 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6954 6955 if (spdk_nvme_cpl_is_error(cpl)) { 6956 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6957 return; 6958 } 6959 6960 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6961 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6962 return; 6963 } 6964 6965 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6966 if (ctx->in_progress) { 6967 ctx->pending = true; 6968 return; 6969 } 6970 6971 get_discovery_log_page(ctx); 6972 } 6973 6974 static void 6975 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6976 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6977 { 6978 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6979 struct discovery_ctx *ctx; 6980 6981 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6982 6983 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6984 ctx->probe_ctx = NULL; 6985 ctx->ctrlr = ctrlr; 6986 6987 if (ctx->rc != 0) { 6988 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6989 ctx->rc); 6990 return; 6991 } 6992 6993 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6994 } 6995 6996 static int 6997 discovery_poller(void *arg) 6998 { 6999 struct discovery_ctx *ctx = arg; 7000 struct spdk_nvme_transport_id *trid; 7001 int rc; 7002 7003 if (ctx->detach_ctx) { 7004 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7005 if (rc != -EAGAIN) { 7006 ctx->detach_ctx = NULL; 7007 ctx->ctrlr = NULL; 7008 } 7009 } else if (ctx->stop) { 7010 if (ctx->ctrlr != NULL) { 7011 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7012 if (rc == 0) { 7013 return SPDK_POLLER_BUSY; 7014 } 7015 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7016 } 7017 spdk_poller_unregister(&ctx->poller); 7018 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7019 assert(ctx->start_cb_fn == NULL); 7020 if (ctx->stop_cb_fn != NULL) { 7021 ctx->stop_cb_fn(ctx->cb_ctx); 7022 } 7023 free_discovery_ctx(ctx); 7024 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7025 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7026 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7027 assert(ctx->initializing); 7028 spdk_poller_unregister(&ctx->poller); 7029 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7030 complete_discovery_start(ctx, -ETIMEDOUT); 7031 stop_discovery(ctx, NULL, NULL); 7032 free_discovery_ctx(ctx); 7033 return SPDK_POLLER_BUSY; 7034 } 7035 7036 assert(ctx->entry_ctx_in_use == NULL); 7037 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7038 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7039 trid = &ctx->entry_ctx_in_use->trid; 7040 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7041 if 
(ctx->probe_ctx) { 7042 spdk_poller_unregister(&ctx->poller); 7043 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7044 } else { 7045 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7046 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7047 ctx->entry_ctx_in_use = NULL; 7048 } 7049 } else if (ctx->probe_ctx) { 7050 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7051 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7052 complete_discovery_start(ctx, -ETIMEDOUT); 7053 return SPDK_POLLER_BUSY; 7054 } 7055 7056 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7057 if (rc != -EAGAIN) { 7058 if (ctx->rc != 0) { 7059 assert(ctx->initializing); 7060 stop_discovery(ctx, NULL, ctx->cb_ctx); 7061 } else { 7062 assert(rc == 0); 7063 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7064 ctx->rc = rc; 7065 get_discovery_log_page(ctx); 7066 } 7067 } 7068 } else { 7069 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7070 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7071 complete_discovery_start(ctx, -ETIMEDOUT); 7072 /* We need to wait until all NVM ctrlrs are attached before we stop the 7073 * discovery service to make sure we don't detach a ctrlr that is still 7074 * being attached. 7075 */ 7076 if (ctx->attach_in_progress == 0) { 7077 stop_discovery(ctx, NULL, ctx->cb_ctx); 7078 return SPDK_POLLER_BUSY; 7079 } 7080 } 7081 7082 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7083 if (rc < 0) { 7084 spdk_poller_unregister(&ctx->poller); 7085 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7086 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7087 ctx->entry_ctx_in_use = NULL; 7088 7089 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7090 if (rc != 0) { 7091 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7092 ctx->ctrlr = NULL; 7093 } 7094 } 7095 } 7096 7097 return SPDK_POLLER_BUSY; 7098 } 7099 7100 static void 7101 start_discovery_poller(void *arg) 7102 { 7103 struct discovery_ctx *ctx = arg; 7104 7105 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7106 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7107 } 7108 7109 int 7110 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7111 const char *base_name, 7112 struct spdk_nvme_ctrlr_opts *drv_opts, 7113 struct nvme_ctrlr_opts *bdev_opts, 7114 uint64_t attach_timeout, 7115 bool from_mdns, 7116 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7117 { 7118 struct discovery_ctx *ctx; 7119 struct discovery_entry_ctx *discovery_entry_ctx; 7120 7121 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7122 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7123 if (strcmp(ctx->name, base_name) == 0) { 7124 return -EEXIST; 7125 } 7126 7127 if (ctx->entry_ctx_in_use != NULL) { 7128 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7129 return -EEXIST; 7130 } 7131 } 7132 7133 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7134 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7135 return -EEXIST; 7136 } 7137 } 7138 } 7139 7140 ctx = calloc(1, sizeof(*ctx)); 7141 if (ctx == NULL) { 7142 return -ENOMEM; 7143 } 7144 7145 ctx->name = strdup(base_name); 7146 if (ctx->name == NULL) { 7147 free_discovery_ctx(ctx); 7148 return -ENOMEM; 7149 } 7150 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
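/* The discovery context keeps its own copies of the options: drv_opts is used to connect
 * to the discovery controller itself, while bdev_opts is reused for every NVM controller
 * attached from this discovery service.
 */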
7151 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7152 ctx->from_mdns_discovery_service = from_mdns; 7153 ctx->bdev_opts.from_discovery_service = true; 7154 ctx->calling_thread = spdk_get_thread(); 7155 ctx->start_cb_fn = cb_fn; 7156 ctx->cb_ctx = cb_ctx; 7157 ctx->initializing = true; 7158 if (ctx->start_cb_fn) { 7159 /* We can use this when dumping json to denote if this RPC parameter 7160 * was specified or not. 7161 */ 7162 ctx->wait_for_attach = true; 7163 } 7164 if (attach_timeout != 0) { 7165 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7166 spdk_get_ticks_hz() / 1000ull; 7167 } 7168 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7169 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7170 memcpy(&ctx->trid, trid, sizeof(*trid)); 7171 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7172 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7173 if (ctx->hostnqn == NULL) { 7174 free_discovery_ctx(ctx); 7175 return -ENOMEM; 7176 } 7177 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7178 if (discovery_entry_ctx == NULL) { 7179 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7180 free_discovery_ctx(ctx); 7181 return -ENOMEM; 7182 } 7183 7184 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7185 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7186 return 0; 7187 } 7188 7189 int 7190 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7191 { 7192 struct discovery_ctx *ctx; 7193 7194 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7195 if (strcmp(name, ctx->name) == 0) { 7196 if (ctx->stop) { 7197 return -EALREADY; 7198 } 7199 /* If we're still starting the discovery service and ->rc is non-zero, we're 7200 * going to stop it as soon as we can 7201 */ 7202 if (ctx->initializing && ctx->rc != 0) { 7203 return -EALREADY; 7204 } 7205 stop_discovery(ctx, cb_fn, cb_ctx); 7206 return 0; 7207 } 7208 } 7209 7210 return -ENOENT; 7211 } 7212 7213 static int 7214 bdev_nvme_library_init(void) 7215 { 7216 g_bdev_nvme_init_thread = spdk_get_thread(); 7217 7218 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7219 bdev_nvme_destroy_poll_group_cb, 7220 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7221 7222 return 0; 7223 } 7224 7225 static void 7226 bdev_nvme_fini_destruct_ctrlrs(void) 7227 { 7228 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7229 struct nvme_ctrlr *nvme_ctrlr; 7230 7231 pthread_mutex_lock(&g_bdev_nvme_mutex); 7232 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7233 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7234 pthread_mutex_lock(&nvme_ctrlr->mutex); 7235 if (nvme_ctrlr->destruct) { 7236 /* This controller's destruction was already started 7237 * before the application started shutting down 7238 */ 7239 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7240 continue; 7241 } 7242 nvme_ctrlr->destruct = true; 7243 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7244 7245 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7246 nvme_ctrlr); 7247 } 7248 } 7249 7250 g_bdev_nvme_module_finish = true; 7251 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7252 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7253 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7254 spdk_bdev_module_fini_done(); 7255 return; 7256 } 7257 7258 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7259 } 7260 7261 static void 7262 check_discovery_fini(void *arg) 7263 { 7264 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7265 bdev_nvme_fini_destruct_ctrlrs(); 
7266 } 7267 } 7268 7269 static void 7270 bdev_nvme_library_fini(void) 7271 { 7272 struct nvme_probe_skip_entry *entry, *entry_tmp; 7273 struct discovery_ctx *ctx; 7274 7275 spdk_poller_unregister(&g_hotplug_poller); 7276 free(g_hotplug_probe_ctx); 7277 g_hotplug_probe_ctx = NULL; 7278 7279 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7280 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7281 free(entry); 7282 } 7283 7284 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7285 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7286 bdev_nvme_fini_destruct_ctrlrs(); 7287 } else { 7288 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7289 stop_discovery(ctx, check_discovery_fini, NULL); 7290 } 7291 } 7292 } 7293 7294 static void 7295 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7296 { 7297 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7298 struct spdk_bdev *bdev = bdev_io->bdev; 7299 struct spdk_dif_ctx dif_ctx; 7300 struct spdk_dif_error err_blk = {}; 7301 int rc; 7302 struct spdk_dif_ctx_init_ext_opts dif_opts; 7303 7304 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7305 dif_opts.dif_pi_format = bdev->dif_pi_format; 7306 rc = spdk_dif_ctx_init(&dif_ctx, 7307 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7308 bdev->dif_is_head_of_md, bdev->dif_type, 7309 bdev_io->u.bdev.dif_check_flags, 7310 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7311 if (rc != 0) { 7312 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7313 return; 7314 } 7315 7316 if (bdev->md_interleave) { 7317 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7318 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7319 } else { 7320 struct iovec md_iov = { 7321 .iov_base = bdev_io->u.bdev.md_buf, 7322 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7323 }; 7324 7325 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7326 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7327 } 7328 7329 if (rc != 0) { 7330 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7331 err_blk.err_type, err_blk.err_offset); 7332 } else { 7333 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7334 } 7335 } 7336 7337 static void 7338 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7339 { 7340 struct nvme_bdev_io *bio = ref; 7341 7342 if (spdk_nvme_cpl_is_success(cpl)) { 7343 /* Run PI verification for read data buffer. */ 7344 bdev_nvme_verify_pi_error(bio); 7345 } 7346 7347 /* Return original completion status */ 7348 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7349 } 7350 7351 static void 7352 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7353 { 7354 struct nvme_bdev_io *bio = ref; 7355 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7356 int ret; 7357 7358 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7359 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7360 cpl->status.sct, cpl->status.sc); 7361 7362 /* Save completion status to use after verifying PI error. */ 7363 bio->cpl = *cpl; 7364 7365 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7366 /* Read without PI checking to verify PI error. 
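 * The PI-error completion was already saved in bio->cpl above; bdev_nvme_no_pi_readv_done()
 * verifies the data buffer and then completes the I/O with that saved status.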
*/ 7367 ret = bdev_nvme_no_pi_readv(bio, 7368 bdev_io->u.bdev.iovs, 7369 bdev_io->u.bdev.iovcnt, 7370 bdev_io->u.bdev.md_buf, 7371 bdev_io->u.bdev.num_blocks, 7372 bdev_io->u.bdev.offset_blocks); 7373 if (ret == 0) { 7374 return; 7375 } 7376 } 7377 } 7378 7379 bdev_nvme_io_complete_nvme_status(bio, cpl); 7380 } 7381 7382 static void 7383 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7384 { 7385 struct nvme_bdev_io *bio = ref; 7386 7387 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7388 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7389 cpl->status.sct, cpl->status.sc); 7390 /* Run PI verification for write data buffer if PI error is detected. */ 7391 bdev_nvme_verify_pi_error(bio); 7392 } 7393 7394 bdev_nvme_io_complete_nvme_status(bio, cpl); 7395 } 7396 7397 static void 7398 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7399 { 7400 struct nvme_bdev_io *bio = ref; 7401 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7402 7403 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7404 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7405 */ 7406 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7407 7408 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7409 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7410 cpl->status.sct, cpl->status.sc); 7411 /* Run PI verification for zone append data buffer if PI error is detected. */ 7412 bdev_nvme_verify_pi_error(bio); 7413 } 7414 7415 bdev_nvme_io_complete_nvme_status(bio, cpl); 7416 } 7417 7418 static void 7419 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7420 { 7421 struct nvme_bdev_io *bio = ref; 7422 7423 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7424 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7425 cpl->status.sct, cpl->status.sc); 7426 /* Run PI verification for compare data buffer if PI error is detected. */ 7427 bdev_nvme_verify_pi_error(bio); 7428 } 7429 7430 bdev_nvme_io_complete_nvme_status(bio, cpl); 7431 } 7432 7433 static void 7434 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7435 { 7436 struct nvme_bdev_io *bio = ref; 7437 7438 /* Compare operation completion */ 7439 if (!bio->first_fused_completed) { 7440 /* Save compare result for write callback */ 7441 bio->cpl = *cpl; 7442 bio->first_fused_completed = true; 7443 return; 7444 } 7445 7446 /* Write operation completion */ 7447 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7448 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7449 * complete the IO with the compare operation's status. 
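 * bio->cpl was stashed when the compare completion arrived above; once the compare
 * has failed, the fused write is expected to fail as well.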
7450 */ 7451 if (!spdk_nvme_cpl_is_error(cpl)) { 7452 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7453 } 7454 7455 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7456 } else { 7457 bdev_nvme_io_complete_nvme_status(bio, cpl); 7458 } 7459 } 7460 7461 static void 7462 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7463 { 7464 struct nvme_bdev_io *bio = ref; 7465 7466 bdev_nvme_io_complete_nvme_status(bio, cpl); 7467 } 7468 7469 static int 7470 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7471 { 7472 switch (desc->zt) { 7473 case SPDK_NVME_ZONE_TYPE_SEQWR: 7474 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7475 break; 7476 default: 7477 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7478 return -EIO; 7479 } 7480 7481 switch (desc->zs) { 7482 case SPDK_NVME_ZONE_STATE_EMPTY: 7483 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7484 break; 7485 case SPDK_NVME_ZONE_STATE_IOPEN: 7486 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7487 break; 7488 case SPDK_NVME_ZONE_STATE_EOPEN: 7489 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7490 break; 7491 case SPDK_NVME_ZONE_STATE_CLOSED: 7492 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7493 break; 7494 case SPDK_NVME_ZONE_STATE_RONLY: 7495 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7496 break; 7497 case SPDK_NVME_ZONE_STATE_FULL: 7498 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7499 break; 7500 case SPDK_NVME_ZONE_STATE_OFFLINE: 7501 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7502 break; 7503 default: 7504 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7505 return -EIO; 7506 } 7507 7508 info->zone_id = desc->zslba; 7509 info->write_pointer = desc->wp; 7510 info->capacity = desc->zcap; 7511 7512 return 0; 7513 } 7514 7515 static void 7516 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7517 { 7518 struct nvme_bdev_io *bio = ref; 7519 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7520 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7521 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7522 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7523 uint64_t max_zones_per_buf, i; 7524 uint32_t zone_report_bufsize; 7525 struct spdk_nvme_ns *ns; 7526 struct spdk_nvme_qpair *qpair; 7527 int ret; 7528 7529 if (spdk_nvme_cpl_is_error(cpl)) { 7530 goto out_complete_io_nvme_cpl; 7531 } 7532 7533 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7534 ret = -ENXIO; 7535 goto out_complete_io_ret; 7536 } 7537 7538 ns = bio->io_path->nvme_ns->ns; 7539 qpair = bio->io_path->qpair->qpair; 7540 7541 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7542 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7543 sizeof(bio->zone_report_buf->descs[0]); 7544 7545 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7546 ret = -EINVAL; 7547 goto out_complete_io_ret; 7548 } 7549 7550 if (!bio->zone_report_buf->nr_zones) { 7551 ret = -EINVAL; 7552 goto out_complete_io_ret; 7553 } 7554 7555 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7556 ret = fill_zone_from_report(&info[bio->handled_zones], 7557 &bio->zone_report_buf->descs[i]); 7558 if (ret) { 7559 goto out_complete_io_ret; 7560 } 7561 bio->handled_zones++; 7562 } 7563 7564 if (bio->handled_zones < zones_to_copy) { 7565 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7566 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7567 
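/* More zones were requested than fit into one report buffer: clear the buffer and issue
 * another Report Zones command starting at the next zone that has not been copied yet.
 */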
7568 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7569 ret = spdk_nvme_zns_report_zones(ns, qpair, 7570 bio->zone_report_buf, zone_report_bufsize, 7571 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7572 bdev_nvme_get_zone_info_done, bio); 7573 if (!ret) { 7574 return; 7575 } else { 7576 goto out_complete_io_ret; 7577 } 7578 } 7579 7580 out_complete_io_nvme_cpl: 7581 free(bio->zone_report_buf); 7582 bio->zone_report_buf = NULL; 7583 bdev_nvme_io_complete_nvme_status(bio, cpl); 7584 return; 7585 7586 out_complete_io_ret: 7587 free(bio->zone_report_buf); 7588 bio->zone_report_buf = NULL; 7589 bdev_nvme_io_complete(bio, ret); 7590 } 7591 7592 static void 7593 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7594 { 7595 struct nvme_bdev_io *bio = ref; 7596 7597 bdev_nvme_io_complete_nvme_status(bio, cpl); 7598 } 7599 7600 static void 7601 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7602 { 7603 struct nvme_bdev_io *bio = ctx; 7604 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7605 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7606 7607 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7608 7609 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7610 } 7611 7612 static void 7613 bdev_nvme_abort_complete(void *ctx) 7614 { 7615 struct nvme_bdev_io *bio = ctx; 7616 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7617 7618 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7619 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7620 } else { 7621 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7622 } 7623 } 7624 7625 static void 7626 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7627 { 7628 struct nvme_bdev_io *bio = ref; 7629 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7630 7631 bio->cpl = *cpl; 7632 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7633 } 7634 7635 static void 7636 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7637 { 7638 struct nvme_bdev_io *bio = ref; 7639 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7640 7641 bio->cpl = *cpl; 7642 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7643 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7644 } 7645 7646 static void 7647 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7648 { 7649 struct nvme_bdev_io *bio = ref; 7650 struct iovec *iov; 7651 7652 bio->iov_offset = sgl_offset; 7653 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7654 iov = &bio->iovs[bio->iovpos]; 7655 if (bio->iov_offset < iov->iov_len) { 7656 break; 7657 } 7658 7659 bio->iov_offset -= iov->iov_len; 7660 } 7661 } 7662 7663 static int 7664 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7665 { 7666 struct nvme_bdev_io *bio = ref; 7667 struct iovec *iov; 7668 7669 assert(bio->iovpos < bio->iovcnt); 7670 7671 iov = &bio->iovs[bio->iovpos]; 7672 7673 *address = iov->iov_base; 7674 *length = iov->iov_len; 7675 7676 if (bio->iov_offset) { 7677 assert(bio->iov_offset <= iov->iov_len); 7678 *address += bio->iov_offset; 7679 *length -= bio->iov_offset; 7680 } 7681 7682 bio->iov_offset += *length; 7683 if (bio->iov_offset == iov->iov_len) { 7684 bio->iovpos++; 7685 bio->iov_offset = 0; 7686 } 7687 7688 return 0; 7689 } 7690 7691 static void 7692 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7693 { 7694 struct nvme_bdev_io *bio = ref; 7695 struct iovec *iov; 7696 7697 bio->fused_iov_offset = sgl_offset; 
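/* Walk the fused iovec array to find the iovec and intra-iovec offset matching sgl_offset,
 * mirroring bdev_nvme_queued_reset_sgl() for the second command of a fused pair.
 */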
7698 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7699 iov = &bio->fused_iovs[bio->fused_iovpos]; 7700 if (bio->fused_iov_offset < iov->iov_len) { 7701 break; 7702 } 7703 7704 bio->fused_iov_offset -= iov->iov_len; 7705 } 7706 } 7707 7708 static int 7709 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7710 { 7711 struct nvme_bdev_io *bio = ref; 7712 struct iovec *iov; 7713 7714 assert(bio->fused_iovpos < bio->fused_iovcnt); 7715 7716 iov = &bio->fused_iovs[bio->fused_iovpos]; 7717 7718 *address = iov->iov_base; 7719 *length = iov->iov_len; 7720 7721 if (bio->fused_iov_offset) { 7722 assert(bio->fused_iov_offset <= iov->iov_len); 7723 *address += bio->fused_iov_offset; 7724 *length -= bio->fused_iov_offset; 7725 } 7726 7727 bio->fused_iov_offset += *length; 7728 if (bio->fused_iov_offset == iov->iov_len) { 7729 bio->fused_iovpos++; 7730 bio->fused_iov_offset = 0; 7731 } 7732 7733 return 0; 7734 } 7735 7736 static int 7737 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7738 void *md, uint64_t lba_count, uint64_t lba) 7739 { 7740 int rc; 7741 7742 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7743 lba_count, lba); 7744 7745 bio->iovs = iov; 7746 bio->iovcnt = iovcnt; 7747 bio->iovpos = 0; 7748 bio->iov_offset = 0; 7749 7750 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7751 bio->io_path->qpair->qpair, 7752 lba, lba_count, 7753 bdev_nvme_no_pi_readv_done, bio, 0, 7754 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7755 md, 0, 0); 7756 7757 if (rc != 0 && rc != -ENOMEM) { 7758 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7759 } 7760 return rc; 7761 } 7762 7763 static int 7764 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7765 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7766 struct spdk_memory_domain *domain, void *domain_ctx, 7767 struct spdk_accel_sequence *seq) 7768 { 7769 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7770 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7771 int rc; 7772 7773 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7774 lba_count, lba); 7775 7776 bio->iovs = iov; 7777 bio->iovcnt = iovcnt; 7778 bio->iovpos = 0; 7779 bio->iov_offset = 0; 7780 7781 if (domain != NULL || seq != NULL) { 7782 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7783 bio->ext_opts.memory_domain = domain; 7784 bio->ext_opts.memory_domain_ctx = domain_ctx; 7785 bio->ext_opts.io_flags = flags; 7786 bio->ext_opts.metadata = md; 7787 bio->ext_opts.accel_sequence = seq; 7788 7789 if (iovcnt == 1) { 7790 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7791 bio, &bio->ext_opts); 7792 } else { 7793 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7794 bdev_nvme_readv_done, bio, 7795 bdev_nvme_queued_reset_sgl, 7796 bdev_nvme_queued_next_sge, 7797 &bio->ext_opts); 7798 } 7799 } else if (iovcnt == 1) { 7800 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7801 md, lba, lba_count, bdev_nvme_readv_done, 7802 bio, flags, 0, 0); 7803 } else { 7804 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7805 bdev_nvme_readv_done, bio, flags, 7806 bdev_nvme_queued_reset_sgl, 7807 bdev_nvme_queued_next_sge, md, 0, 0); 7808 } 7809 7810 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7811 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7812 } 7813 
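	/* -ENOMEM is intentionally not logged: the submit path treats it as a
	 * transient queue-full condition and the I/O is retried, so only
	 * unexpected errors are reported here.
	 */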
return rc; 7814 } 7815 7816 static int 7817 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7818 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7819 struct spdk_memory_domain *domain, void *domain_ctx, 7820 struct spdk_accel_sequence *seq, 7821 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 7822 { 7823 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7824 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7825 int rc; 7826 7827 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7828 lba_count, lba); 7829 7830 bio->iovs = iov; 7831 bio->iovcnt = iovcnt; 7832 bio->iovpos = 0; 7833 bio->iov_offset = 0; 7834 7835 if (domain != NULL || seq != NULL) { 7836 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7837 bio->ext_opts.memory_domain = domain; 7838 bio->ext_opts.memory_domain_ctx = domain_ctx; 7839 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 7840 bio->ext_opts.cdw13 = cdw13.raw; 7841 bio->ext_opts.metadata = md; 7842 bio->ext_opts.accel_sequence = seq; 7843 7844 if (iovcnt == 1) { 7845 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7846 bio, &bio->ext_opts); 7847 } else { 7848 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7849 bdev_nvme_writev_done, bio, 7850 bdev_nvme_queued_reset_sgl, 7851 bdev_nvme_queued_next_sge, 7852 &bio->ext_opts); 7853 } 7854 } else if (iovcnt == 1) { 7855 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7856 md, lba, lba_count, bdev_nvme_writev_done, 7857 bio, flags, 0, 0); 7858 } else { 7859 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7860 bdev_nvme_writev_done, bio, flags, 7861 bdev_nvme_queued_reset_sgl, 7862 bdev_nvme_queued_next_sge, md, 0, 0); 7863 } 7864 7865 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7866 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7867 } 7868 return rc; 7869 } 7870 7871 static int 7872 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7873 void *md, uint64_t lba_count, uint64_t zslba, 7874 uint32_t flags) 7875 { 7876 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7877 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7878 int rc; 7879 7880 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7881 lba_count, zslba); 7882 7883 bio->iovs = iov; 7884 bio->iovcnt = iovcnt; 7885 bio->iovpos = 0; 7886 bio->iov_offset = 0; 7887 7888 if (iovcnt == 1) { 7889 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7890 lba_count, 7891 bdev_nvme_zone_appendv_done, bio, 7892 flags, 7893 0, 0); 7894 } else { 7895 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7896 bdev_nvme_zone_appendv_done, bio, flags, 7897 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7898 md, 0, 0); 7899 } 7900 7901 if (rc != 0 && rc != -ENOMEM) { 7902 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7903 } 7904 return rc; 7905 } 7906 7907 static int 7908 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7909 void *md, uint64_t lba_count, uint64_t lba, 7910 uint32_t flags) 7911 { 7912 int rc; 7913 7914 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7915 lba_count, lba); 7916 7917 bio->iovs = iov; 7918 bio->iovcnt = iovcnt; 7919 bio->iovpos = 0; 7920 bio->iov_offset = 0; 7921 7922 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7923 bio->io_path->qpair->qpair, 7924 lba, lba_count, 7925 bdev_nvme_comparev_done, bio, flags, 7926 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7927 md, 0, 0); 7928 7929 if (rc != 0 && rc != -ENOMEM) { 7930 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7931 } 7932 return rc; 7933 } 7934 7935 static int 7936 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7937 struct iovec *write_iov, int write_iovcnt, 7938 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7939 { 7940 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7941 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7942 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7943 int rc; 7944 7945 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7946 lba_count, lba); 7947 7948 bio->iovs = cmp_iov; 7949 bio->iovcnt = cmp_iovcnt; 7950 bio->iovpos = 0; 7951 bio->iov_offset = 0; 7952 bio->fused_iovs = write_iov; 7953 bio->fused_iovcnt = write_iovcnt; 7954 bio->fused_iovpos = 0; 7955 bio->fused_iov_offset = 0; 7956 7957 if (bdev_io->num_retries == 0) { 7958 bio->first_fused_submitted = false; 7959 bio->first_fused_completed = false; 7960 } 7961 7962 if (!bio->first_fused_submitted) { 7963 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7964 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7965 7966 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7967 bdev_nvme_comparev_and_writev_done, bio, flags, 7968 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7969 if (rc == 0) { 7970 bio->first_fused_submitted = true; 7971 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7972 } else { 7973 if (rc != -ENOMEM) { 7974 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7975 } 7976 return rc; 7977 } 7978 } 7979 7980 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7981 7982 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7983 bdev_nvme_comparev_and_writev_done, bio, flags, 7984 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7985 if (rc != 0 && rc != -ENOMEM) { 7986 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7987 rc = 0; 7988 } 7989 7990 return rc; 7991 } 7992 7993 static int 7994 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7995 { 7996 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7997 struct spdk_nvme_dsm_range *range; 7998 uint64_t offset, remaining; 7999 uint64_t num_ranges_u64; 8000 uint16_t num_ranges; 8001 int rc; 8002 8003 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8004 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8005 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8006 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8007 return -EINVAL; 8008 } 8009 num_ranges = (uint16_t)num_ranges_u64; 8010 8011 offset = offset_blocks; 8012 remaining = num_blocks; 8013 range = &dsm_ranges[0]; 8014 8015 /* Fill max-size ranges until the remaining blocks fit into one range */ 8016 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8017 range->attributes.raw = 0; 8018 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8019 range->starting_lba = offset; 8020 8021 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8022 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8023 range++; 8024 } 8025 8026 /* Final range describes the remaining 
blocks */ 8027 range->attributes.raw = 0; 8028 range->length = remaining; 8029 range->starting_lba = offset; 8030 8031 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8032 bio->io_path->qpair->qpair, 8033 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8034 dsm_ranges, num_ranges, 8035 bdev_nvme_queued_done, bio); 8036 8037 return rc; 8038 } 8039 8040 static int 8041 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8042 { 8043 if (num_blocks > UINT16_MAX + 1) { 8044 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8045 return -EINVAL; 8046 } 8047 8048 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8049 bio->io_path->qpair->qpair, 8050 offset_blocks, num_blocks, 8051 bdev_nvme_queued_done, bio, 8052 0); 8053 } 8054 8055 static int 8056 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8057 struct spdk_bdev_zone_info *info) 8058 { 8059 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8060 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8061 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8062 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8063 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8064 8065 if (zone_id % zone_size != 0) { 8066 return -EINVAL; 8067 } 8068 8069 if (num_zones > total_zones || !num_zones) { 8070 return -EINVAL; 8071 } 8072 8073 assert(!bio->zone_report_buf); 8074 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8075 if (!bio->zone_report_buf) { 8076 return -ENOMEM; 8077 } 8078 8079 bio->handled_zones = 0; 8080 8081 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8082 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8083 bdev_nvme_get_zone_info_done, bio); 8084 } 8085 8086 static int 8087 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8088 enum spdk_bdev_zone_action action) 8089 { 8090 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8091 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8092 8093 switch (action) { 8094 case SPDK_BDEV_ZONE_CLOSE: 8095 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8096 bdev_nvme_zone_management_done, bio); 8097 case SPDK_BDEV_ZONE_FINISH: 8098 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8099 bdev_nvme_zone_management_done, bio); 8100 case SPDK_BDEV_ZONE_OPEN: 8101 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8102 bdev_nvme_zone_management_done, bio); 8103 case SPDK_BDEV_ZONE_RESET: 8104 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8105 bdev_nvme_zone_management_done, bio); 8106 case SPDK_BDEV_ZONE_OFFLINE: 8107 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8108 bdev_nvme_zone_management_done, bio); 8109 default: 8110 return -EINVAL; 8111 } 8112 } 8113 8114 static void 8115 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8116 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8117 { 8118 struct nvme_io_path *io_path; 8119 struct nvme_ctrlr *nvme_ctrlr; 8120 uint32_t max_xfer_size; 8121 int rc = -ENXIO; 8122 8123 /* Choose the first ctrlr which is not failed. */ 8124 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8125 nvme_ctrlr = io_path->qpair->ctrlr; 8126 8127 /* We should skip any unavailable nvme_ctrlr rather than checking 8128 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
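		 * Skipping an unavailable controller lets the loop move on to the
		 * next I/O path, so the admin command is submitted on another
		 * controller when one is usable.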
8129 */ 8130 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8131 continue; 8132 } 8133 8134 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8135 8136 if (nbytes > max_xfer_size) { 8137 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8138 rc = -EINVAL; 8139 goto err; 8140 } 8141 8142 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8143 bdev_nvme_admin_passthru_done, bio); 8144 if (rc == 0) { 8145 return; 8146 } 8147 } 8148 8149 err: 8150 bdev_nvme_admin_complete(bio, rc); 8151 } 8152 8153 static int 8154 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8155 void *buf, size_t nbytes) 8156 { 8157 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8158 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8159 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8160 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8161 8162 if (nbytes > max_xfer_size) { 8163 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8164 return -EINVAL; 8165 } 8166 8167 /* 8168 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8169 * so fill it out automatically. 8170 */ 8171 cmd->nsid = spdk_nvme_ns_get_id(ns); 8172 8173 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8174 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8175 } 8176 8177 static int 8178 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8179 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8180 { 8181 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8182 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8183 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8184 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8185 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8186 8187 if (nbytes > max_xfer_size) { 8188 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8189 return -EINVAL; 8190 } 8191 8192 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8193 SPDK_ERRLOG("invalid meta data buffer size\n"); 8194 return -EINVAL; 8195 } 8196 8197 /* 8198 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8199 * so fill it out automatically. 
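	 * Any nsid supplied by the caller is overwritten so the command always
	 * targets the namespace backing this bdev.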
8200 */ 8201 cmd->nsid = spdk_nvme_ns_get_id(ns); 8202 8203 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8204 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8205 } 8206 8207 static int 8208 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8209 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8210 size_t nbytes, void *md_buf, size_t md_len) 8211 { 8212 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8213 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8214 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8215 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8216 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8217 8218 bio->iovs = iov; 8219 bio->iovcnt = iovcnt; 8220 bio->iovpos = 0; 8221 bio->iov_offset = 0; 8222 8223 if (nbytes > max_xfer_size) { 8224 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8225 return -EINVAL; 8226 } 8227 8228 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8229 SPDK_ERRLOG("invalid meta data buffer size\n"); 8230 return -EINVAL; 8231 } 8232 8233 /* 8234 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8235 * require a nsid, so fill it out automatically. 8236 */ 8237 cmd->nsid = spdk_nvme_ns_get_id(ns); 8238 8239 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8240 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8241 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8242 } 8243 8244 static void 8245 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8246 struct nvme_bdev_io *bio_to_abort) 8247 { 8248 struct nvme_io_path *io_path; 8249 int rc = 0; 8250 8251 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8252 if (rc == 0) { 8253 bdev_nvme_admin_complete(bio, 0); 8254 return; 8255 } 8256 8257 io_path = bio_to_abort->io_path; 8258 if (io_path != NULL) { 8259 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8260 io_path->qpair->qpair, 8261 bio_to_abort, 8262 bdev_nvme_abort_done, bio); 8263 } else { 8264 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8265 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8266 NULL, 8267 bio_to_abort, 8268 bdev_nvme_abort_done, bio); 8269 8270 if (rc != -ENOENT) { 8271 break; 8272 } 8273 } 8274 } 8275 8276 if (rc != 0) { 8277 /* If no command was found or there was any error, complete the abort 8278 * request with failure. 
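		 * rc is the last result seen while walking the I/O path list, e.g.
		 * -ENOENT when no outstanding command matched bio_to_abort on any path.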
8279 */ 8280 bdev_nvme_admin_complete(bio, rc); 8281 } 8282 } 8283 8284 static int 8285 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8286 uint64_t num_blocks) 8287 { 8288 struct spdk_nvme_scc_source_range range = { 8289 .slba = src_offset_blocks, 8290 .nlb = num_blocks - 1 8291 }; 8292 8293 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8294 bio->io_path->qpair->qpair, 8295 &range, 1, dst_offset_blocks, 8296 bdev_nvme_queued_done, bio); 8297 } 8298 8299 static void 8300 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8301 { 8302 const char *action; 8303 uint32_t i; 8304 8305 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8306 action = "reset"; 8307 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8308 action = "abort"; 8309 } else { 8310 action = "none"; 8311 } 8312 8313 spdk_json_write_object_begin(w); 8314 8315 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8316 8317 spdk_json_write_named_object_begin(w, "params"); 8318 spdk_json_write_named_string(w, "action_on_timeout", action); 8319 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8320 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8321 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8322 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8323 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8324 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8325 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8326 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8327 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8328 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8329 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8330 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8331 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8332 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8333 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8334 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8335 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8336 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8337 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8338 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8339 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8340 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8341 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8342 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8343 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8344 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8345 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8346 for (i = 0; i < 32; ++i) { 8347 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8348 
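			/* Emit the RPC string name for each digest enabled in the bitmask. */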
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8349 } 8350 } 8351 spdk_json_write_array_end(w); 8352 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8353 for (i = 0; i < 32; ++i) { 8354 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8355 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8356 } 8357 } 8358 8359 spdk_json_write_array_end(w); 8360 spdk_json_write_object_end(w); 8361 8362 spdk_json_write_object_end(w); 8363 } 8364 8365 static void 8366 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8367 { 8368 struct spdk_nvme_transport_id trid; 8369 8370 spdk_json_write_object_begin(w); 8371 8372 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8373 8374 spdk_json_write_named_object_begin(w, "params"); 8375 spdk_json_write_named_string(w, "name", ctx->name); 8376 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8377 8378 trid = ctx->trid; 8379 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8380 nvme_bdev_dump_trid_json(&trid, w); 8381 8382 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8383 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8384 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8385 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8386 ctx->bdev_opts.fast_io_fail_timeout_sec); 8387 spdk_json_write_object_end(w); 8388 8389 spdk_json_write_object_end(w); 8390 } 8391 8392 #ifdef SPDK_CONFIG_NVME_CUSE 8393 static void 8394 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8395 struct nvme_ctrlr *nvme_ctrlr) 8396 { 8397 size_t cuse_name_size = 128; 8398 char cuse_name[cuse_name_size]; 8399 8400 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8401 cuse_name, &cuse_name_size) != 0) { 8402 return; 8403 } 8404 8405 spdk_json_write_object_begin(w); 8406 8407 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8408 8409 spdk_json_write_named_object_begin(w, "params"); 8410 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8411 spdk_json_write_object_end(w); 8412 8413 spdk_json_write_object_end(w); 8414 } 8415 #endif 8416 8417 static void 8418 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8419 struct nvme_ctrlr *nvme_ctrlr) 8420 { 8421 struct spdk_nvme_transport_id *trid; 8422 const struct spdk_nvme_ctrlr_opts *opts; 8423 8424 if (nvme_ctrlr->opts.from_discovery_service) { 8425 /* Do not emit an RPC for this - it will be implicitly 8426 * covered by a separate bdev_nvme_start_discovery or 8427 * bdev_nvme_start_mdns_discovery RPC. 
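		 * Controllers attached through a discovery service are re-created when
		 * that discovery RPC is replayed, so emitting bdev_nvme_attach_controller
		 * here would duplicate them.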
8428 */ 8429 return; 8430 } 8431 8432 trid = &nvme_ctrlr->active_path_id->trid; 8433 8434 spdk_json_write_object_begin(w); 8435 8436 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8437 8438 spdk_json_write_named_object_begin(w, "params"); 8439 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8440 nvme_bdev_dump_trid_json(trid, w); 8441 spdk_json_write_named_bool(w, "prchk_reftag", 8442 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8443 spdk_json_write_named_bool(w, "prchk_guard", 8444 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8445 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8446 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8447 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8448 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8449 if (nvme_ctrlr->psk != NULL) { 8450 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8451 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8452 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8453 } 8454 8455 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8456 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8457 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8458 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8459 if (opts->src_addr[0] != '\0') { 8460 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8461 } 8462 if (opts->src_svcid[0] != '\0') { 8463 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8464 } 8465 8466 spdk_json_write_object_end(w); 8467 8468 spdk_json_write_object_end(w); 8469 } 8470 8471 static void 8472 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8473 { 8474 spdk_json_write_object_begin(w); 8475 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8476 8477 spdk_json_write_named_object_begin(w, "params"); 8478 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8479 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8480 spdk_json_write_object_end(w); 8481 8482 spdk_json_write_object_end(w); 8483 } 8484 8485 static int 8486 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8487 { 8488 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8489 struct nvme_ctrlr *nvme_ctrlr; 8490 struct discovery_ctx *ctx; 8491 8492 bdev_nvme_opts_config_json(w); 8493 8494 pthread_mutex_lock(&g_bdev_nvme_mutex); 8495 8496 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8497 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8498 nvme_ctrlr_config_json(w, nvme_ctrlr); 8499 8500 #ifdef SPDK_CONFIG_NVME_CUSE 8501 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8502 #endif 8503 } 8504 } 8505 8506 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8507 if (!ctx->from_mdns_discovery_service) { 8508 bdev_nvme_discovery_config_json(w, ctx); 8509 } 8510 } 8511 8512 bdev_nvme_mdns_discovery_config_json(w); 8513 8514 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8515 * before enabling hotplug poller. 
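	 * Otherwise the hotplug poller could begin probing devices before the
	 * attach and discovery RPCs above have been replayed.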
8516 */ 8517 bdev_nvme_hotplug_config_json(w); 8518 8519 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8520 return 0; 8521 } 8522 8523 struct spdk_nvme_ctrlr * 8524 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8525 { 8526 struct nvme_bdev *nbdev; 8527 struct nvme_ns *nvme_ns; 8528 8529 if (!bdev || bdev->module != &nvme_if) { 8530 return NULL; 8531 } 8532 8533 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8534 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8535 assert(nvme_ns != NULL); 8536 8537 return nvme_ns->ctrlr->ctrlr; 8538 } 8539 8540 static bool 8541 nvme_io_path_is_current(struct nvme_io_path *io_path) 8542 { 8543 const struct nvme_bdev_channel *nbdev_ch; 8544 bool current; 8545 8546 if (!nvme_io_path_is_available(io_path)) { 8547 return false; 8548 } 8549 8550 nbdev_ch = io_path->nbdev_ch; 8551 if (nbdev_ch == NULL) { 8552 current = false; 8553 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8554 struct nvme_io_path *optimized_io_path = NULL; 8555 8556 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8557 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8558 break; 8559 } 8560 } 8561 8562 /* A non-optimized path is only current if there are no optimized paths. */ 8563 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 8564 (optimized_io_path == NULL); 8565 } else { 8566 if (nbdev_ch->current_io_path) { 8567 current = (io_path == nbdev_ch->current_io_path); 8568 } else { 8569 struct nvme_io_path *first_path; 8570 8571 /* We arrived here as there are no optimized paths for active-passive 8572 * mode. Check if this io_path is the first one available on the list. 8573 */ 8574 current = false; 8575 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 8576 if (nvme_io_path_is_available(first_path)) { 8577 current = (io_path == first_path); 8578 break; 8579 } 8580 } 8581 } 8582 } 8583 8584 return current; 8585 } 8586 8587 void 8588 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8589 { 8590 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8591 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8592 const struct spdk_nvme_ctrlr_data *cdata; 8593 const struct spdk_nvme_transport_id *trid; 8594 const char *adrfam_str; 8595 8596 spdk_json_write_object_begin(w); 8597 8598 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8599 8600 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8601 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8602 8603 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8604 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 8605 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8606 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8607 8608 spdk_json_write_named_object_begin(w, "transport"); 8609 spdk_json_write_named_string(w, "trtype", trid->trstring); 8610 spdk_json_write_named_string(w, "traddr", trid->traddr); 8611 if (trid->trsvcid[0] != '\0') { 8612 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8613 } 8614 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8615 if (adrfam_str) { 8616 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8617 } 8618 spdk_json_write_object_end(w); 8619 8620 spdk_json_write_object_end(w); 8621 } 8622 8623 void 8624 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8625 { 8626 struct discovery_ctx *ctx; 8627 struct 
discovery_entry_ctx *entry_ctx; 8628 8629 spdk_json_write_array_begin(w); 8630 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8631 spdk_json_write_object_begin(w); 8632 spdk_json_write_named_string(w, "name", ctx->name); 8633 8634 spdk_json_write_named_object_begin(w, "trid"); 8635 nvme_bdev_dump_trid_json(&ctx->trid, w); 8636 spdk_json_write_object_end(w); 8637 8638 spdk_json_write_named_array_begin(w, "referrals"); 8639 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8640 spdk_json_write_object_begin(w); 8641 spdk_json_write_named_object_begin(w, "trid"); 8642 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8643 spdk_json_write_object_end(w); 8644 spdk_json_write_object_end(w); 8645 } 8646 spdk_json_write_array_end(w); 8647 8648 spdk_json_write_object_end(w); 8649 } 8650 spdk_json_write_array_end(w); 8651 } 8652 8653 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8654 8655 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8656 { 8657 struct spdk_trace_tpoint_opts opts[] = { 8658 { 8659 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8660 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8661 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8662 }, 8663 { 8664 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8665 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8666 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8667 } 8668 }; 8669 8670 8671 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8672 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8673 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8674 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8675 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8676 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8677 } 8678
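/* The relations registered above let trace tooling correlate a bdev_nvme I/O
 * object with the transport-level NVMe submit/complete tracepoints (PCIe and
 * TCP), so a single I/O can be followed from the bdev layer to the transport.
 */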