/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;

	/* Used to put nvme_bdev_io into the list */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq,
			    union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	const struct spdk_nvme_ctrlr_opts *opts;
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
		    strcmp(hostnqn, opts->hostnqn) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

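	/* Walk every aggregated controller under the global lock and return the first
	 * nvme_ctrlr whose active path matches both the transport ID and the host NQN.
	 */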
	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);
	spdk_keyring_put_key(nvme_ctrlr->psk);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		SPDK_ERRLOG("Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

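	/* Invalidate the channel's cached I/O path so the newly added path is
	 * considered on the next path selection.
	 */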
	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_active(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	if (spdk_unlikely(nvme_ns->ns == NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_available(io_path))) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				assert(false);
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct nvme_bdev_io *bio, *tmp_bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);

		bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bio != NULL) {
		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
					   retry_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_bdev_io *bio, *tmp_bio;

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
		__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
			__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
			nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

			bdev_nvme_clear_current_io_path(nbdev_ch);
			bio->io_path = NULL;

			if (any_io_path_may_become_available(nbdev_ch)) {
				bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
				return;
			}
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

static void
bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_clear_io_path_cache,
			      NULL,
			      bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* qpair was failed to connect. Abort the reset sequence. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
					      qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* qpair was completed to disconnect. Just move to the next ctrlr_channel. */
				SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
					      qpair);
				status = 0;
			}
			spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* qpair was disconnected unexpectedly. Reset controller for recovery. */
			SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair);
			bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc = 0;
	struct nvme_bdev_io *bio;

	if (spdk_io_channel_iter_get_ctx(i) != NULL) {
		rc = -1;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bio = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link);

		bdev_nvme_reset_io_continue(bio, rc);
	}

	spdk_for_each_channel_continue(i, 0);
}

/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid to the active trid within a controller if exists.
 *
 * The purpose of the boolean return value is to request the caller to disconnect
 * the current trid now to try connecting the next trid.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. It means the trid is failed if its last
	 * failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within a controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
		 */
		return false;
	}

	assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

	SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr,
		       path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid);

	spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
	nvme_ctrlr->active_path_id = next_path;
	rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
	assert(rc == 0);
	TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
	if (!remove) {
		/** Shuffle the old trid to the end of the list and use the new one.
		 * Allows for round robin through multiple connections.
		 */
		TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
	} else {
		free(path_id);
	}

	if (start || next_path->last_failed_tsc == 0) {
		/* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
		 * or used yet. Try the next trid now.
		 */
		return true;
	}

	if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
	    nvme_ctrlr->opts.reconnect_delay_sec) {
		/* Enough backoff passed since the next trid failed. Try the next trid now. */
		return true;
	}

	/* The next trid will be tried after reconnect_delay_sec seconds. */
	return false;
}

static bool
bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	int32_t elapsed;

	if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
	    nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static bool
bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t elapsed;

	if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
		return false;
	}

	elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
	if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
		return true;
	} else {
		return false;
	}
}

static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);

static void
nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
{
	int rc;

	rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
	if (rc != 0) {
		/* Disconnect fails if ctrlr is already resetting or removed. In this case,
		 * fail the reset sequence immediately.
		 */
		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
		return;
	}

	/* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
	 * Set callback here to execute the specified operation after ctrlr is really disconnected.
	 */
	assert(nvme_ctrlr->disconnected_cb == NULL);
	nvme_ctrlr->disconnected_cb = cb_fn;

	/* During disconnection, reduce the period to poll adminq more often. */
	bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
}

enum bdev_nvme_op_after_reset {
	OP_NONE,
	OP_COMPLETE_PENDING_DESTRUCT,
	OP_DESTRUCT,
	OP_DELAYED_RECONNECT,
	OP_FAILOVER,
};

typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset;

static _bdev_nvme_op_after_reset
bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		/* Complete pending destruct after reset completes. */
*/ 1971 return OP_COMPLETE_PENDING_DESTRUCT; 1972 } else if (nvme_ctrlr->pending_failover) { 1973 nvme_ctrlr->pending_failover = false; 1974 nvme_ctrlr->reset_start_tsc = 0; 1975 return OP_FAILOVER; 1976 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1977 nvme_ctrlr->reset_start_tsc = 0; 1978 return OP_NONE; 1979 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1980 return OP_DESTRUCT; 1981 } else { 1982 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1983 nvme_ctrlr->fast_io_fail_timedout = true; 1984 } 1985 return OP_DELAYED_RECONNECT; 1986 } 1987 } 1988 1989 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1990 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1991 1992 static int 1993 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1994 { 1995 struct nvme_ctrlr *nvme_ctrlr = ctx; 1996 1997 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1998 pthread_mutex_lock(&nvme_ctrlr->mutex); 1999 2000 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2001 2002 if (!nvme_ctrlr->reconnect_is_delayed) { 2003 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2004 return SPDK_POLLER_BUSY; 2005 } 2006 2007 nvme_ctrlr->reconnect_is_delayed = false; 2008 2009 if (nvme_ctrlr->destruct) { 2010 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2011 return SPDK_POLLER_BUSY; 2012 } 2013 2014 assert(nvme_ctrlr->resetting == false); 2015 nvme_ctrlr->resetting = true; 2016 2017 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2018 2019 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2020 2021 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2022 return SPDK_POLLER_BUSY; 2023 } 2024 2025 static void 2026 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2027 { 2028 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2029 2030 assert(nvme_ctrlr->reconnect_is_delayed == false); 2031 nvme_ctrlr->reconnect_is_delayed = true; 2032 2033 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2034 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2035 nvme_ctrlr, 2036 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2037 } 2038 2039 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2040 2041 static void 2042 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2043 { 2044 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2045 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2046 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2047 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2048 enum bdev_nvme_op_after_reset op_after_reset; 2049 2050 assert(nvme_ctrlr->thread == spdk_get_thread()); 2051 2052 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2053 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2054 2055 if (!success) { 2056 SPDK_ERRLOG("Resetting controller failed.\n"); 2057 } else { 2058 SPDK_NOTICELOG("Resetting controller successful.\n"); 2059 } 2060 2061 pthread_mutex_lock(&nvme_ctrlr->mutex); 2062 nvme_ctrlr->resetting = false; 2063 nvme_ctrlr->dont_retry = false; 2064 nvme_ctrlr->in_failover = false; 2065 2066 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2067 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2068 2069 /* Delay callbacks when the next operation is a failover. */ 2070 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2071 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2072 } 2073 2074 switch (op_after_reset) { 2075 case OP_COMPLETE_PENDING_DESTRUCT: 2076 nvme_ctrlr_unregister(nvme_ctrlr); 2077 break; 2078 case OP_DESTRUCT: 2079 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2080 remove_discovery_entry(nvme_ctrlr); 2081 break; 2082 case OP_DELAYED_RECONNECT: 2083 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2084 break; 2085 case OP_FAILOVER: 2086 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2087 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2088 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2089 break; 2090 default: 2091 break; 2092 } 2093 } 2094 2095 static void 2096 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2097 { 2098 pthread_mutex_lock(&nvme_ctrlr->mutex); 2099 if (!success) { 2100 /* Connecting the active trid failed. Set the next alternate trid to the 2101 * active trid if it exists. 2102 */ 2103 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2104 /* The next alternate trid exists and is ready to try. Try it now. */ 2105 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2106 2107 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2108 return; 2109 } 2110 2111 /* We came here if there is no alternate trid or if the next trid exists but 2112 * is not ready to try. We will try the active trid after reconnect_delay_sec 2113 * seconds if it is non-zero or at the next reset call otherwise. 2114 */ 2115 } else { 2116 /* Connecting the active trid succeeded. Clear the last failed time because it 2117 * means the trid is failed if its last failed time is non-zero. 2118 */ 2119 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2120 } 2121 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2122 2123 /* Make sure we clear any pending resets before returning. */ 2124 spdk_for_each_channel(nvme_ctrlr, 2125 bdev_nvme_complete_pending_resets, 2126 success ? NULL : (void *)0x1, 2127 _bdev_nvme_reset_ctrlr_complete); 2128 } 2129 2130 static void 2131 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2132 { 2133 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2134 2135 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2136 } 2137 2138 static void 2139 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2140 { 2141 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2142 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2143 struct nvme_qpair *nvme_qpair; 2144 2145 nvme_qpair = ctrlr_ch->qpair; 2146 assert(nvme_qpair != NULL); 2147 2148 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2149 2150 if (nvme_qpair->qpair != NULL) { 2151 if (nvme_qpair->ctrlr->dont_retry) { 2152 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2153 } 2154 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2155 2156 /* The current full reset sequence will move to the next 2157 * ctrlr_channel after the qpair is actually disconnected. 2158 */ 2159 assert(ctrlr_ch->reset_iter == NULL); 2160 ctrlr_ch->reset_iter = i; 2161 } else { 2162 spdk_for_each_channel_continue(i, 0); 2163 } 2164 } 2165 2166 static void 2167 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2168 { 2169 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2170 2171 if (status == 0) { 2172 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2173 } else { 2174 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
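 * This reuses bdev_nvme_reset_destroy_qpair() so that any qpairs created
 * before the failure are disconnected again, and the reset is then reported
 * as failed from bdev_nvme_reset_create_qpairs_failed().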
*/ 2175 spdk_for_each_channel(nvme_ctrlr, 2176 bdev_nvme_reset_destroy_qpair, 2177 NULL, 2178 bdev_nvme_reset_create_qpairs_failed); 2179 } 2180 } 2181 2182 static int 2183 bdev_nvme_reset_check_qpair_connected(void *ctx) 2184 { 2185 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2186 2187 if (ctrlr_ch->reset_iter == NULL) { 2188 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2189 assert(ctrlr_ch->connect_poller == NULL); 2190 assert(ctrlr_ch->qpair->qpair == NULL); 2191 return SPDK_POLLER_BUSY; 2192 } 2193 2194 assert(ctrlr_ch->qpair->qpair != NULL); 2195 2196 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2197 return SPDK_POLLER_BUSY; 2198 } 2199 2200 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2201 2202 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2203 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2204 ctrlr_ch->reset_iter = NULL; 2205 2206 if (!g_opts.disable_auto_failback) { 2207 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2208 } 2209 2210 return SPDK_POLLER_BUSY; 2211 } 2212 2213 static void 2214 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2215 { 2216 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2217 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2218 int rc; 2219 2220 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2221 if (rc == 0) { 2222 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2223 ctrlr_ch, 0); 2224 2225 /* The current full reset sequence will move to the next 2226 * ctrlr_channel after the qpair is actually connected. 2227 */ 2228 assert(ctrlr_ch->reset_iter == NULL); 2229 ctrlr_ch->reset_iter = i; 2230 } else { 2231 spdk_for_each_channel_continue(i, rc); 2232 } 2233 } 2234 2235 static void 2236 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2237 { 2238 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2239 struct nvme_ns *nvme_ns; 2240 2241 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2242 nvme_ns != NULL; 2243 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2244 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2245 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2246 /* NS can be added again. Just nullify nvme_ns->ns. */ 2247 nvme_ns->ns = NULL; 2248 } 2249 } 2250 } 2251 2252 2253 static int 2254 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2255 { 2256 struct nvme_ctrlr *nvme_ctrlr = arg; 2257 int rc = -ETIMEDOUT; 2258 2259 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2260 /* Mark the ctrlr as failed. The next call to 2261 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2262 * do the necessary cleanup and return failure. 
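 * This is how ctrlr_loss_timeout_sec is enforced while a reconnect is in
 * progress: after the timeout, the poll below stops returning -EAGAIN and the
 * reset sequence completes with failure.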
2263 */ 2264 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2265 } 2266 2267 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2268 if (rc == -EAGAIN) { 2269 return SPDK_POLLER_BUSY; 2270 } 2271 2272 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2273 if (rc == 0) { 2274 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2275 2276 /* Recreate all of the I/O queue pairs */ 2277 spdk_for_each_channel(nvme_ctrlr, 2278 bdev_nvme_reset_create_qpair, 2279 NULL, 2280 bdev_nvme_reset_create_qpairs_done); 2281 } else { 2282 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2283 } 2284 return SPDK_POLLER_BUSY; 2285 } 2286 2287 static void 2288 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2289 { 2290 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2291 2292 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2293 assert(nvme_ctrlr->reset_detach_poller == NULL); 2294 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2295 nvme_ctrlr, 0); 2296 } 2297 2298 static void 2299 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2300 { 2301 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2302 2303 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2304 assert(status == 0); 2305 2306 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2307 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2308 } else { 2309 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2310 } 2311 } 2312 2313 static void 2314 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2315 { 2316 spdk_for_each_channel(nvme_ctrlr, 2317 bdev_nvme_reset_destroy_qpair, 2318 NULL, 2319 bdev_nvme_reset_destroy_qpair_done); 2320 } 2321 2322 static void 2323 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2324 { 2325 struct nvme_ctrlr *nvme_ctrlr = ctx; 2326 2327 assert(nvme_ctrlr->resetting == true); 2328 assert(nvme_ctrlr->thread == spdk_get_thread()); 2329 2330 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2331 2332 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2333 2334 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2335 } 2336 2337 static void 2338 _bdev_nvme_reset_ctrlr(void *ctx) 2339 { 2340 struct nvme_ctrlr *nvme_ctrlr = ctx; 2341 2342 assert(nvme_ctrlr->resetting == true); 2343 assert(nvme_ctrlr->thread == spdk_get_thread()); 2344 2345 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2346 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2347 } else { 2348 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2349 } 2350 } 2351 2352 static int 2353 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2354 { 2355 spdk_msg_fn msg_fn; 2356 2357 pthread_mutex_lock(&nvme_ctrlr->mutex); 2358 if (nvme_ctrlr->destruct) { 2359 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2360 return -ENXIO; 2361 } 2362 2363 if (nvme_ctrlr->resetting) { 2364 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2365 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2366 return -EBUSY; 2367 } 2368 2369 if (nvme_ctrlr->disabled) { 2370 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2371 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2372 return -EALREADY; 2373 } 2374 2375 nvme_ctrlr->resetting = true; 2376 nvme_ctrlr->dont_retry = true; 2377 2378 if (nvme_ctrlr->reconnect_is_delayed) { 2379 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2380 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2381 nvme_ctrlr->reconnect_is_delayed = false; 2382 } else { 2383 msg_fn = _bdev_nvme_reset_ctrlr; 2384 assert(nvme_ctrlr->reset_start_tsc == 0); 2385 } 2386 2387 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2388 2389 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2390 2391 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2392 return 0; 2393 } 2394 2395 static int 2396 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2397 { 2398 pthread_mutex_lock(&nvme_ctrlr->mutex); 2399 if (nvme_ctrlr->destruct) { 2400 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2401 return -ENXIO; 2402 } 2403 2404 if (nvme_ctrlr->resetting) { 2405 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2406 return -EBUSY; 2407 } 2408 2409 if (!nvme_ctrlr->disabled) { 2410 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2411 return -EALREADY; 2412 } 2413 2414 nvme_ctrlr->disabled = false; 2415 nvme_ctrlr->resetting = true; 2416 2417 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2418 2419 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2420 2421 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2422 return 0; 2423 } 2424 2425 static void 2426 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2427 { 2428 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2429 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2430 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2431 enum bdev_nvme_op_after_reset op_after_disable; 2432 2433 assert(nvme_ctrlr->thread == spdk_get_thread()); 2434 2435 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2436 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2437 2438 pthread_mutex_lock(&nvme_ctrlr->mutex); 2439 2440 nvme_ctrlr->resetting = false; 2441 nvme_ctrlr->dont_retry = false; 2442 2443 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2444 2445 nvme_ctrlr->disabled = true; 2446 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2447 2448 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2449 2450 if (ctrlr_op_cb_fn) { 2451 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2452 } 2453 2454 switch (op_after_disable) { 2455 case OP_COMPLETE_PENDING_DESTRUCT: 2456 nvme_ctrlr_unregister(nvme_ctrlr); 2457 break; 2458 default: 2459 break; 2460 } 2461 2462 } 2463 2464 static void 2465 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2466 { 2467 /* Make sure we clear any pending resets before returning. 
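 * As in the reset-completion path above, resets queued on each ctrlr_channel
 * are completed first, and _bdev_nvme_disable_ctrlr_complete() then finishes
 * disabling on the ctrlr thread.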
*/ 2468 spdk_for_each_channel(nvme_ctrlr, 2469 bdev_nvme_complete_pending_resets, 2470 NULL, 2471 _bdev_nvme_disable_ctrlr_complete); 2472 } 2473 2474 static void 2475 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2476 { 2477 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2478 2479 assert(status == 0); 2480 2481 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2482 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2483 } else { 2484 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2485 } 2486 } 2487 2488 static void 2489 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2490 { 2491 spdk_for_each_channel(nvme_ctrlr, 2492 bdev_nvme_reset_destroy_qpair, 2493 NULL, 2494 bdev_nvme_disable_destroy_qpairs_done); 2495 } 2496 2497 static void 2498 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2499 { 2500 struct nvme_ctrlr *nvme_ctrlr = ctx; 2501 2502 assert(nvme_ctrlr->resetting == true); 2503 assert(nvme_ctrlr->thread == spdk_get_thread()); 2504 2505 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2506 2507 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2508 } 2509 2510 static void 2511 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2512 { 2513 struct nvme_ctrlr *nvme_ctrlr = ctx; 2514 2515 assert(nvme_ctrlr->resetting == true); 2516 assert(nvme_ctrlr->thread == spdk_get_thread()); 2517 2518 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2519 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2520 } else { 2521 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2522 } 2523 } 2524 2525 static int 2526 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2527 { 2528 spdk_msg_fn msg_fn; 2529 2530 pthread_mutex_lock(&nvme_ctrlr->mutex); 2531 if (nvme_ctrlr->destruct) { 2532 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2533 return -ENXIO; 2534 } 2535 2536 if (nvme_ctrlr->resetting) { 2537 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2538 return -EBUSY; 2539 } 2540 2541 if (nvme_ctrlr->disabled) { 2542 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2543 return -EALREADY; 2544 } 2545 2546 nvme_ctrlr->resetting = true; 2547 nvme_ctrlr->dont_retry = true; 2548 2549 if (nvme_ctrlr->reconnect_is_delayed) { 2550 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2551 nvme_ctrlr->reconnect_is_delayed = false; 2552 } else { 2553 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2554 } 2555 2556 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2557 2558 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2559 2560 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2561 return 0; 2562 } 2563 2564 static int 2565 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2566 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2567 { 2568 int rc; 2569 2570 switch (op) { 2571 case NVME_CTRLR_OP_RESET: 2572 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2573 break; 2574 case NVME_CTRLR_OP_ENABLE: 2575 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2576 break; 2577 case NVME_CTRLR_OP_DISABLE: 2578 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2579 break; 2580 default: 2581 rc = -EINVAL; 2582 break; 2583 } 2584 2585 if (rc == 0) { 2586 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2587 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2588 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2589 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2590 } 2591 return rc; 2592 } 2593 2594 struct nvme_ctrlr_op_rpc_ctx { 2595 struct nvme_ctrlr *nvme_ctrlr; 2596 struct spdk_thread *orig_thread; 2597 enum nvme_ctrlr_op op; 2598 int rc; 2599 
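/* Completion callback and its argument; invoked on orig_thread once the
 * requested controller operation finishes.
 */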
bdev_nvme_ctrlr_op_cb cb_fn; 2600 void *cb_arg; 2601 }; 2602 2603 static void 2604 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2605 { 2606 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2607 2608 assert(ctx != NULL); 2609 assert(ctx->cb_fn != NULL); 2610 2611 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2612 2613 free(ctx); 2614 } 2615 2616 static void 2617 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2618 { 2619 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2620 2621 ctx->rc = rc; 2622 2623 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2624 } 2625 2626 void 2627 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2628 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2629 { 2630 struct nvme_ctrlr_op_rpc_ctx *ctx; 2631 int rc; 2632 2633 assert(cb_fn != NULL); 2634 2635 ctx = calloc(1, sizeof(*ctx)); 2636 if (ctx == NULL) { 2637 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2638 cb_fn(cb_arg, -ENOMEM); 2639 return; 2640 } 2641 2642 ctx->orig_thread = spdk_get_thread(); 2643 ctx->cb_fn = cb_fn; 2644 ctx->cb_arg = cb_arg; 2645 2646 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2647 if (rc == 0) { 2648 return; 2649 } else if (rc == -EALREADY) { 2650 rc = 0; 2651 } 2652 2653 nvme_ctrlr_op_rpc_complete(ctx, rc); 2654 } 2655 2656 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2657 2658 static void 2659 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2660 { 2661 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2662 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2663 int rc; 2664 2665 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2666 ctx->nvme_ctrlr = NULL; 2667 2668 if (ctx->rc != 0) { 2669 goto complete; 2670 } 2671 2672 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2673 if (next_nvme_ctrlr == NULL) { 2674 goto complete; 2675 } 2676 2677 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2678 if (rc == 0) { 2679 ctx->nvme_ctrlr = next_nvme_ctrlr; 2680 return; 2681 } else if (rc == -EALREADY) { 2682 ctx->nvme_ctrlr = next_nvme_ctrlr; 2683 rc = 0; 2684 } 2685 2686 ctx->rc = rc; 2687 2688 complete: 2689 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2690 free(ctx); 2691 } 2692 2693 static void 2694 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2695 { 2696 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2697 2698 ctx->rc = rc; 2699 2700 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2701 } 2702 2703 void 2704 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2705 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2706 { 2707 struct nvme_ctrlr_op_rpc_ctx *ctx; 2708 struct nvme_ctrlr *nvme_ctrlr; 2709 int rc; 2710 2711 assert(cb_fn != NULL); 2712 2713 ctx = calloc(1, sizeof(*ctx)); 2714 if (ctx == NULL) { 2715 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2716 cb_fn(cb_arg, -ENOMEM); 2717 return; 2718 } 2719 2720 ctx->orig_thread = spdk_get_thread(); 2721 ctx->op = op; 2722 ctx->cb_fn = cb_fn; 2723 ctx->cb_arg = cb_arg; 2724 2725 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2726 assert(nvme_ctrlr != NULL); 2727 2728 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2729 if (rc == 0) { 2730 ctx->nvme_ctrlr = nvme_ctrlr; 2731 return; 2732 } else if (rc == -EALREADY) { 2733 ctx->nvme_ctrlr = nvme_ctrlr; 2734 rc = 0; 2735 } 2736 2737 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2738 } 2739 2740 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2741 2742 static void 2743 
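/* Completion of a bdev-level reset I/O. bio->cpl.cdw0 is reused as a result
 * flag: 0 if the sequential per-ctrlr resets all succeeded (or were skipped),
 * non-zero if one failed (see bdev_nvme_reset_io_continue()).
 */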
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2744 { 2745 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2746 enum spdk_bdev_io_status io_status; 2747 2748 if (bio->cpl.cdw0 == 0) { 2749 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2750 } else { 2751 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2752 } 2753 2754 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2755 } 2756 2757 static void 2758 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2759 { 2760 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2761 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2762 2763 bdev_nvme_abort_retry_ios(nbdev_ch); 2764 2765 spdk_for_each_channel_continue(i, 0); 2766 } 2767 2768 static void 2769 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2770 { 2771 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2772 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2773 2774 /* Abort all queued I/Os for retry. */ 2775 spdk_for_each_channel(nbdev, 2776 bdev_nvme_abort_bdev_channel, 2777 bio, 2778 _bdev_nvme_reset_io_complete); 2779 } 2780 2781 static void 2782 _bdev_nvme_reset_io_continue(void *ctx) 2783 { 2784 struct nvme_bdev_io *bio = ctx; 2785 struct nvme_io_path *prev_io_path, *next_io_path; 2786 int rc; 2787 2788 prev_io_path = bio->io_path; 2789 bio->io_path = NULL; 2790 2791 if (bio->cpl.cdw0 != 0) { 2792 goto complete; 2793 } 2794 2795 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2796 if (next_io_path == NULL) { 2797 goto complete; 2798 } 2799 2800 rc = _bdev_nvme_reset_io(next_io_path, bio); 2801 if (rc == 0) { 2802 return; 2803 } 2804 2805 bio->cpl.cdw0 = 1; 2806 2807 complete: 2808 bdev_nvme_reset_io_complete(bio); 2809 } 2810 2811 static void 2812 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2813 { 2814 struct nvme_bdev_io *bio = cb_arg; 2815 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2816 2817 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2818 2819 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2820 } 2821 2822 static int 2823 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2824 { 2825 struct nvme_ctrlr_channel *ctrlr_ch; 2826 int rc; 2827 2828 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2829 bdev_nvme_reset_io_continue, bio); 2830 if (rc != 0 && rc != -EBUSY) { 2831 return rc; 2832 } 2833 2834 assert(bio->io_path == NULL); 2835 bio->io_path = io_path; 2836 2837 if (rc == -EBUSY) { 2838 ctrlr_ch = io_path->qpair->ctrlr_ch; 2839 assert(ctrlr_ch != NULL); 2840 /* 2841 * Reset call is queued only if it is from the app framework. This is on purpose so that 2842 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2843 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2844 */ 2845 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2846 } 2847 2848 return 0; 2849 } 2850 2851 static void 2852 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2853 { 2854 struct nvme_io_path *io_path; 2855 int rc; 2856 2857 bio->cpl.cdw0 = 0; 2858 2859 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2860 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2861 assert(io_path != NULL); 2862 2863 rc = _bdev_nvme_reset_io(io_path, bio); 2864 if (rc != 0) { 2865 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. 
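 * nvme_ctrlr_op(NVME_CTRLR_OP_RESET) returns -EALREADY for a disabled
 * controller, so map that to success and let bdev_nvme_reset_io_continue()
 * advance to the next io_path.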
*/ 2866 rc = (rc == -EALREADY) ? 0 : rc; 2867 2868 bdev_nvme_reset_io_continue(bio, rc); 2869 } 2870 } 2871 2872 static int 2873 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2874 { 2875 if (nvme_ctrlr->destruct) { 2876 /* Don't bother resetting if the controller is in the process of being destructed. */ 2877 return -ENXIO; 2878 } 2879 2880 if (nvme_ctrlr->resetting) { 2881 if (!nvme_ctrlr->in_failover) { 2882 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2883 2884 /* Defer failover until reset completes. */ 2885 nvme_ctrlr->pending_failover = true; 2886 return -EINPROGRESS; 2887 } else { 2888 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2889 return -EBUSY; 2890 } 2891 } 2892 2893 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2894 2895 if (nvme_ctrlr->reconnect_is_delayed) { 2896 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2897 2898 /* We rely on the next reconnect for the failover. */ 2899 return -EALREADY; 2900 } 2901 2902 if (nvme_ctrlr->disabled) { 2903 SPDK_NOTICELOG("Controller is disabled.\n"); 2904 2905 /* We rely on the enablement for the failover. */ 2906 return -EALREADY; 2907 } 2908 2909 nvme_ctrlr->resetting = true; 2910 nvme_ctrlr->in_failover = true; 2911 2912 assert(nvme_ctrlr->reset_start_tsc == 0); 2913 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2914 2915 return 0; 2916 } 2917 2918 static int 2919 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2920 { 2921 int rc; 2922 2923 pthread_mutex_lock(&nvme_ctrlr->mutex); 2924 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2925 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2926 2927 if (rc == 0) { 2928 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2929 } else if (rc == -EALREADY) { 2930 rc = 0; 2931 } 2932 2933 return rc; 2934 } 2935 2936 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2937 uint64_t num_blocks); 2938 2939 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2940 uint64_t num_blocks); 2941 2942 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2943 uint64_t src_offset_blocks, 2944 uint64_t num_blocks); 2945 2946 static void 2947 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2948 bool success) 2949 { 2950 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2951 int ret; 2952 2953 if (!success) { 2954 ret = -EINVAL; 2955 goto exit; 2956 } 2957 2958 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2959 ret = -ENXIO; 2960 goto exit; 2961 } 2962 2963 ret = bdev_nvme_readv(bio, 2964 bdev_io->u.bdev.iovs, 2965 bdev_io->u.bdev.iovcnt, 2966 bdev_io->u.bdev.md_buf, 2967 bdev_io->u.bdev.num_blocks, 2968 bdev_io->u.bdev.offset_blocks, 2969 bdev_io->u.bdev.dif_check_flags, 2970 bdev_io->u.bdev.memory_domain, 2971 bdev_io->u.bdev.memory_domain_ctx, 2972 bdev_io->u.bdev.accel_sequence); 2973 2974 exit: 2975 if (spdk_unlikely(ret != 0)) { 2976 bdev_nvme_io_complete(bio, ret); 2977 } 2978 } 2979 2980 static inline void 2981 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2982 { 2983 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2984 struct spdk_bdev *bdev = bdev_io->bdev; 2985 struct nvme_bdev_io *nbdev_io_to_abort; 2986 int rc = 0; 2987 2988 switch (bdev_io->type) { 2989 case SPDK_BDEV_IO_TYPE_READ: 2990 if (bdev_io->u.bdev.iovs && 
bdev_io->u.bdev.iovs[0].iov_base) { 2991 2992 rc = bdev_nvme_readv(nbdev_io, 2993 bdev_io->u.bdev.iovs, 2994 bdev_io->u.bdev.iovcnt, 2995 bdev_io->u.bdev.md_buf, 2996 bdev_io->u.bdev.num_blocks, 2997 bdev_io->u.bdev.offset_blocks, 2998 bdev_io->u.bdev.dif_check_flags, 2999 bdev_io->u.bdev.memory_domain, 3000 bdev_io->u.bdev.memory_domain_ctx, 3001 bdev_io->u.bdev.accel_sequence); 3002 } else { 3003 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3004 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3005 rc = 0; 3006 } 3007 break; 3008 case SPDK_BDEV_IO_TYPE_WRITE: 3009 rc = bdev_nvme_writev(nbdev_io, 3010 bdev_io->u.bdev.iovs, 3011 bdev_io->u.bdev.iovcnt, 3012 bdev_io->u.bdev.md_buf, 3013 bdev_io->u.bdev.num_blocks, 3014 bdev_io->u.bdev.offset_blocks, 3015 bdev_io->u.bdev.dif_check_flags, 3016 bdev_io->u.bdev.memory_domain, 3017 bdev_io->u.bdev.memory_domain_ctx, 3018 bdev_io->u.bdev.accel_sequence, 3019 bdev_io->u.bdev.nvme_cdw12, 3020 bdev_io->u.bdev.nvme_cdw13); 3021 break; 3022 case SPDK_BDEV_IO_TYPE_COMPARE: 3023 rc = bdev_nvme_comparev(nbdev_io, 3024 bdev_io->u.bdev.iovs, 3025 bdev_io->u.bdev.iovcnt, 3026 bdev_io->u.bdev.md_buf, 3027 bdev_io->u.bdev.num_blocks, 3028 bdev_io->u.bdev.offset_blocks, 3029 bdev_io->u.bdev.dif_check_flags); 3030 break; 3031 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3032 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3033 bdev_io->u.bdev.iovs, 3034 bdev_io->u.bdev.iovcnt, 3035 bdev_io->u.bdev.fused_iovs, 3036 bdev_io->u.bdev.fused_iovcnt, 3037 bdev_io->u.bdev.md_buf, 3038 bdev_io->u.bdev.num_blocks, 3039 bdev_io->u.bdev.offset_blocks, 3040 bdev_io->u.bdev.dif_check_flags); 3041 break; 3042 case SPDK_BDEV_IO_TYPE_UNMAP: 3043 rc = bdev_nvme_unmap(nbdev_io, 3044 bdev_io->u.bdev.offset_blocks, 3045 bdev_io->u.bdev.num_blocks); 3046 break; 3047 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3048 rc = bdev_nvme_write_zeroes(nbdev_io, 3049 bdev_io->u.bdev.offset_blocks, 3050 bdev_io->u.bdev.num_blocks); 3051 break; 3052 case SPDK_BDEV_IO_TYPE_RESET: 3053 nbdev_io->io_path = NULL; 3054 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3055 return; 3056 3057 case SPDK_BDEV_IO_TYPE_FLUSH: 3058 bdev_nvme_io_complete(nbdev_io, 0); 3059 return; 3060 3061 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3062 rc = bdev_nvme_zone_appendv(nbdev_io, 3063 bdev_io->u.bdev.iovs, 3064 bdev_io->u.bdev.iovcnt, 3065 bdev_io->u.bdev.md_buf, 3066 bdev_io->u.bdev.num_blocks, 3067 bdev_io->u.bdev.offset_blocks, 3068 bdev_io->u.bdev.dif_check_flags); 3069 break; 3070 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3071 rc = bdev_nvme_get_zone_info(nbdev_io, 3072 bdev_io->u.zone_mgmt.zone_id, 3073 bdev_io->u.zone_mgmt.num_zones, 3074 bdev_io->u.zone_mgmt.buf); 3075 break; 3076 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3077 rc = bdev_nvme_zone_management(nbdev_io, 3078 bdev_io->u.zone_mgmt.zone_id, 3079 bdev_io->u.zone_mgmt.zone_action); 3080 break; 3081 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3082 nbdev_io->io_path = NULL; 3083 bdev_nvme_admin_passthru(nbdev_ch, 3084 nbdev_io, 3085 &bdev_io->u.nvme_passthru.cmd, 3086 bdev_io->u.nvme_passthru.buf, 3087 bdev_io->u.nvme_passthru.nbytes); 3088 return; 3089 3090 case SPDK_BDEV_IO_TYPE_NVME_IO: 3091 rc = bdev_nvme_io_passthru(nbdev_io, 3092 &bdev_io->u.nvme_passthru.cmd, 3093 bdev_io->u.nvme_passthru.buf, 3094 bdev_io->u.nvme_passthru.nbytes); 3095 break; 3096 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3097 rc = bdev_nvme_io_passthru_md(nbdev_io, 3098 &bdev_io->u.nvme_passthru.cmd, 3099 bdev_io->u.nvme_passthru.buf, 3100 bdev_io->u.nvme_passthru.nbytes, 3101 bdev_io->u.nvme_passthru.md_buf, 3102 
bdev_io->u.nvme_passthru.md_len); 3103 break; 3104 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3105 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3106 &bdev_io->u.nvme_passthru.cmd, 3107 bdev_io->u.nvme_passthru.iovs, 3108 bdev_io->u.nvme_passthru.iovcnt, 3109 bdev_io->u.nvme_passthru.nbytes, 3110 bdev_io->u.nvme_passthru.md_buf, 3111 bdev_io->u.nvme_passthru.md_len); 3112 break; 3113 case SPDK_BDEV_IO_TYPE_ABORT: 3114 nbdev_io->io_path = NULL; 3115 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3116 bdev_nvme_abort(nbdev_ch, 3117 nbdev_io, 3118 nbdev_io_to_abort); 3119 return; 3120 3121 case SPDK_BDEV_IO_TYPE_COPY: 3122 rc = bdev_nvme_copy(nbdev_io, 3123 bdev_io->u.bdev.offset_blocks, 3124 bdev_io->u.bdev.copy.src_offset_blocks, 3125 bdev_io->u.bdev.num_blocks); 3126 break; 3127 default: 3128 rc = -EINVAL; 3129 break; 3130 } 3131 3132 if (spdk_unlikely(rc != 0)) { 3133 bdev_nvme_io_complete(nbdev_io, rc); 3134 } 3135 } 3136 3137 static void 3138 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3139 { 3140 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3141 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3142 3143 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3144 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3145 } else { 3146 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3147 * We need to update submit_tsc here. 3148 */ 3149 nbdev_io->submit_tsc = spdk_get_ticks(); 3150 } 3151 3152 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3153 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3154 if (spdk_unlikely(!nbdev_io->io_path)) { 3155 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3156 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3157 return; 3158 } 3159 3160 /* Admin commands do not use the optimal I/O path. 3161 * Simply fall through even if it is not found. 3162 */ 3163 } 3164 3165 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3166 } 3167 3168 static bool 3169 bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi) 3170 { 3171 switch (csi) { 3172 case SPDK_NVME_CSI_NVM: 3173 return true; 3174 case SPDK_NVME_CSI_ZNS: 3175 return true; 3176 default: 3177 return false; 3178 } 3179 } 3180 3181 static bool 3182 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3183 { 3184 struct nvme_bdev *nbdev = ctx; 3185 struct nvme_ns *nvme_ns; 3186 struct spdk_nvme_ns *ns; 3187 struct spdk_nvme_ctrlr *ctrlr; 3188 const struct spdk_nvme_ctrlr_data *cdata; 3189 3190 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3191 assert(nvme_ns != NULL); 3192 ns = nvme_ns->ns; 3193 if (ns == NULL) { 3194 return false; 3195 } 3196 3197 if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) { 3198 switch (io_type) { 3199 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3200 case SPDK_BDEV_IO_TYPE_NVME_IO: 3201 return true; 3202 3203 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3204 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3205 3206 default: 3207 return false; 3208 } 3209 } 3210 3211 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3212 3213 switch (io_type) { 3214 case SPDK_BDEV_IO_TYPE_READ: 3215 case SPDK_BDEV_IO_TYPE_WRITE: 3216 case SPDK_BDEV_IO_TYPE_RESET: 3217 case SPDK_BDEV_IO_TYPE_FLUSH: 3218 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3219 case SPDK_BDEV_IO_TYPE_NVME_IO: 3220 case SPDK_BDEV_IO_TYPE_ABORT: 3221 return true; 3222 3223 case SPDK_BDEV_IO_TYPE_COMPARE: 3224 return spdk_nvme_ns_supports_compare(ns); 3225 3226 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3227 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3228 3229 case SPDK_BDEV_IO_TYPE_UNMAP: 3230 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3231 return cdata->oncs.dsm; 3232 3233 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3234 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3235 return cdata->oncs.write_zeroes; 3236 3237 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3238 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3239 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3240 return true; 3241 } 3242 return false; 3243 3244 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3245 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3246 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3247 3248 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3249 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3250 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3251 3252 case SPDK_BDEV_IO_TYPE_COPY: 3253 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3254 return cdata->oncs.copy; 3255 3256 default: 3257 return false; 3258 } 3259 } 3260 3261 static int 3262 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3263 { 3264 struct nvme_qpair *nvme_qpair; 3265 struct spdk_io_channel *pg_ch; 3266 int rc; 3267 3268 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3269 if (!nvme_qpair) { 3270 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3271 return -1; 3272 } 3273 3274 TAILQ_INIT(&nvme_qpair->io_path_list); 3275 3276 nvme_qpair->ctrlr = nvme_ctrlr; 3277 nvme_qpair->ctrlr_ch = ctrlr_ch; 3278 3279 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3280 if (!pg_ch) { 3281 free(nvme_qpair); 3282 return -1; 3283 } 3284 3285 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3286 3287 #ifdef SPDK_CONFIG_VTUNE 3288 nvme_qpair->group->collect_spin_stat = true; 3289 #else 3290 nvme_qpair->group->collect_spin_stat = false; 3291 #endif 3292 3293 if (!nvme_ctrlr->disabled) { 3294 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3295 * be created when it's enabled. 3296 */ 3297 rc = bdev_nvme_create_qpair(nvme_qpair); 3298 if (rc != 0) { 3299 /* nvme_ctrlr can't create IO qpair if connection is down. 3300 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3301 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3302 * submitted IO will be queued until IO qpair is successfully created. 3303 * 3304 * Hence, if both are satisfied, ignore the failure. 
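 * For example (illustrative values only), with reconnect_delay_sec = 5 and
 * bdev_retry_count = 3, qpair creation is re-attempted after the reconnect
 * delay and I/O submitted in the meantime is queued for retry, so this
 * channel can still be created even though the first connect failed.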
3305 */ 3306 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3307 spdk_put_io_channel(pg_ch); 3308 free(nvme_qpair); 3309 return rc; 3310 } 3311 } 3312 } 3313 3314 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3315 3316 ctrlr_ch->qpair = nvme_qpair; 3317 3318 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3319 nvme_qpair->ctrlr->ref++; 3320 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3321 3322 return 0; 3323 } 3324 3325 static int 3326 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3327 { 3328 struct nvme_ctrlr *nvme_ctrlr = io_device; 3329 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3330 3331 TAILQ_INIT(&ctrlr_ch->pending_resets); 3332 3333 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3334 } 3335 3336 static void 3337 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3338 { 3339 struct nvme_io_path *io_path, *next; 3340 3341 assert(nvme_qpair->group != NULL); 3342 3343 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3344 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3345 nvme_io_path_free(io_path); 3346 } 3347 3348 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3349 3350 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3351 3352 nvme_ctrlr_release(nvme_qpair->ctrlr); 3353 3354 free(nvme_qpair); 3355 } 3356 3357 static void 3358 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3359 { 3360 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3361 struct nvme_qpair *nvme_qpair; 3362 3363 nvme_qpair = ctrlr_ch->qpair; 3364 assert(nvme_qpair != NULL); 3365 3366 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3367 3368 if (nvme_qpair->qpair != NULL) { 3369 if (ctrlr_ch->reset_iter == NULL) { 3370 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3371 } else { 3372 /* Skip current ctrlr_channel in a full reset sequence because 3373 * it is being deleted now. The qpair is already being disconnected. 3374 * We do not have to restart disconnecting it. 3375 */ 3376 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3377 } 3378 3379 /* We cannot release a reference to the poll group now. 3380 * The qpair may be disconnected asynchronously later. 3381 * We need to poll it until it is actually disconnected. 3382 * Just detach the qpair from the deleting ctrlr_channel. 
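 * Once the poll group later observes that the qpair is disconnected, the
 * detached nvme_qpair (ctrlr_ch == NULL) is expected to be freed via
 * nvme_qpair_delete().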
3383 */ 3384 nvme_qpair->ctrlr_ch = NULL; 3385 } else { 3386 assert(ctrlr_ch->reset_iter == NULL); 3387 3388 nvme_qpair_delete(nvme_qpair); 3389 } 3390 } 3391 3392 static inline struct spdk_io_channel * 3393 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3394 { 3395 if (spdk_unlikely(!group->accel_channel)) { 3396 group->accel_channel = spdk_accel_get_io_channel(); 3397 if (!group->accel_channel) { 3398 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3399 group); 3400 return NULL; 3401 } 3402 } 3403 3404 return group->accel_channel; 3405 } 3406 3407 static void 3408 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3409 uint32_t iov_cnt, uint32_t seed, 3410 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3411 { 3412 struct spdk_io_channel *accel_ch; 3413 struct nvme_poll_group *group = ctx; 3414 int rc; 3415 3416 assert(cb_fn != NULL); 3417 3418 accel_ch = bdev_nvme_get_accel_channel(group); 3419 if (spdk_unlikely(accel_ch == NULL)) { 3420 cb_fn(cb_arg, -ENOMEM); 3421 return; 3422 } 3423 3424 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3425 if (rc) { 3426 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3427 if (rc == -ENOMEM || rc == -EINVAL) { 3428 cb_fn(cb_arg, rc); 3429 } 3430 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3431 } 3432 } 3433 3434 static void 3435 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3436 { 3437 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3438 } 3439 3440 static void 3441 bdev_nvme_abort_sequence(void *seq) 3442 { 3443 spdk_accel_sequence_abort(seq); 3444 } 3445 3446 static void 3447 bdev_nvme_reverse_sequence(void *seq) 3448 { 3449 spdk_accel_sequence_reverse(seq); 3450 } 3451 3452 static int 3453 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3454 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3455 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3456 { 3457 struct spdk_io_channel *ch; 3458 struct nvme_poll_group *group = ctx; 3459 3460 ch = bdev_nvme_get_accel_channel(group); 3461 if (spdk_unlikely(ch == NULL)) { 3462 return -ENOMEM; 3463 } 3464 3465 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3466 domain, domain_ctx, seed, cb_fn, cb_arg); 3467 } 3468 3469 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3470 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3471 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3472 .append_crc32c = bdev_nvme_append_crc32c, 3473 .finish_sequence = bdev_nvme_finish_sequence, 3474 .reverse_sequence = bdev_nvme_reverse_sequence, 3475 .abort_sequence = bdev_nvme_abort_sequence, 3476 }; 3477 3478 static int 3479 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3480 { 3481 struct nvme_poll_group *group = ctx_buf; 3482 3483 TAILQ_INIT(&group->qpair_list); 3484 3485 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3486 if (group->group == NULL) { 3487 return -1; 3488 } 3489 3490 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3491 3492 if (group->poller == NULL) { 3493 spdk_nvme_poll_group_destroy(group->group); 3494 return -1; 3495 } 3496 3497 return 0; 3498 } 3499 3500 static void 3501 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3502 { 3503 struct 
nvme_poll_group *group = ctx_buf; 3504 3505 assert(TAILQ_EMPTY(&group->qpair_list)); 3506 3507 if (group->accel_channel) { 3508 spdk_put_io_channel(group->accel_channel); 3509 } 3510 3511 spdk_poller_unregister(&group->poller); 3512 if (spdk_nvme_poll_group_destroy(group->group)) { 3513 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3514 assert(false); 3515 } 3516 } 3517 3518 static struct spdk_io_channel * 3519 bdev_nvme_get_io_channel(void *ctx) 3520 { 3521 struct nvme_bdev *nvme_bdev = ctx; 3522 3523 return spdk_get_io_channel(nvme_bdev); 3524 } 3525 3526 static void * 3527 bdev_nvme_get_module_ctx(void *ctx) 3528 { 3529 struct nvme_bdev *nvme_bdev = ctx; 3530 struct nvme_ns *nvme_ns; 3531 3532 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3533 return NULL; 3534 } 3535 3536 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3537 if (!nvme_ns) { 3538 return NULL; 3539 } 3540 3541 return nvme_ns->ns; 3542 } 3543 3544 static const char * 3545 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3546 { 3547 switch (ana_state) { 3548 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3549 return "optimized"; 3550 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3551 return "non_optimized"; 3552 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3553 return "inaccessible"; 3554 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3555 return "persistent_loss"; 3556 case SPDK_NVME_ANA_CHANGE_STATE: 3557 return "change"; 3558 default: 3559 return NULL; 3560 } 3561 } 3562 3563 static int 3564 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3565 { 3566 struct spdk_memory_domain **_domains = NULL; 3567 struct nvme_bdev *nbdev = ctx; 3568 struct nvme_ns *nvme_ns; 3569 int i = 0, _array_size = array_size; 3570 int rc = 0; 3571 3572 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3573 if (domains && array_size >= i) { 3574 _domains = &domains[i]; 3575 } else { 3576 _domains = NULL; 3577 } 3578 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3579 if (rc > 0) { 3580 i += rc; 3581 if (_array_size >= rc) { 3582 _array_size -= rc; 3583 } else { 3584 _array_size = 0; 3585 } 3586 } else if (rc < 0) { 3587 return rc; 3588 } 3589 } 3590 3591 return i; 3592 } 3593 3594 static const char * 3595 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3596 { 3597 if (nvme_ctrlr->destruct) { 3598 return "deleting"; 3599 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3600 return "failed"; 3601 } else if (nvme_ctrlr->resetting) { 3602 return "resetting"; 3603 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3604 return "reconnect_is_delayed"; 3605 } else if (nvme_ctrlr->disabled) { 3606 return "disabled"; 3607 } else { 3608 return "enabled"; 3609 } 3610 } 3611 3612 void 3613 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3614 { 3615 struct spdk_nvme_transport_id *trid; 3616 const struct spdk_nvme_ctrlr_opts *opts; 3617 const struct spdk_nvme_ctrlr_data *cdata; 3618 struct nvme_path_id *path_id; 3619 int32_t numa_id; 3620 3621 spdk_json_write_object_begin(w); 3622 3623 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3624 3625 #ifdef SPDK_CONFIG_NVME_CUSE 3626 size_t cuse_name_size = 128; 3627 char cuse_name[cuse_name_size]; 3628 3629 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3630 if (rc == 0) { 3631 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3632 } 3633 #endif 3634 trid = 
&nvme_ctrlr->active_path_id->trid; 3635 spdk_json_write_named_object_begin(w, "trid"); 3636 nvme_bdev_dump_trid_json(trid, w); 3637 spdk_json_write_object_end(w); 3638 3639 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3640 if (path_id != NULL) { 3641 spdk_json_write_named_array_begin(w, "alternate_trids"); 3642 do { 3643 trid = &path_id->trid; 3644 spdk_json_write_object_begin(w); 3645 nvme_bdev_dump_trid_json(trid, w); 3646 spdk_json_write_object_end(w); 3647 3648 path_id = TAILQ_NEXT(path_id, link); 3649 } while (path_id != NULL); 3650 spdk_json_write_array_end(w); 3651 } 3652 3653 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3654 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3655 3656 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3657 spdk_json_write_named_object_begin(w, "host"); 3658 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3659 spdk_json_write_named_string(w, "addr", opts->src_addr); 3660 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3661 spdk_json_write_object_end(w); 3662 3663 numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr); 3664 if (numa_id != SPDK_ENV_NUMA_ID_ANY) { 3665 spdk_json_write_named_uint32(w, "numa_id", numa_id); 3666 } 3667 spdk_json_write_object_end(w); 3668 } 3669 3670 static void 3671 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3672 struct nvme_ns *nvme_ns) 3673 { 3674 struct spdk_nvme_ns *ns; 3675 struct spdk_nvme_ctrlr *ctrlr; 3676 const struct spdk_nvme_ctrlr_data *cdata; 3677 const struct spdk_nvme_transport_id *trid; 3678 union spdk_nvme_vs_register vs; 3679 const struct spdk_nvme_ns_data *nsdata; 3680 char buf[128]; 3681 3682 ns = nvme_ns->ns; 3683 if (ns == NULL) { 3684 return; 3685 } 3686 3687 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3688 3689 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3690 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3691 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3692 3693 spdk_json_write_object_begin(w); 3694 3695 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3696 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3697 } 3698 3699 spdk_json_write_named_object_begin(w, "trid"); 3700 3701 nvme_bdev_dump_trid_json(trid, w); 3702 3703 spdk_json_write_object_end(w); 3704 3705 #ifdef SPDK_CONFIG_NVME_CUSE 3706 size_t cuse_name_size = 128; 3707 char cuse_name[cuse_name_size]; 3708 3709 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3710 cuse_name, &cuse_name_size); 3711 if (rc == 0) { 3712 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3713 } 3714 #endif 3715 3716 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3717 3718 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3719 3720 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3721 3722 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3723 spdk_str_trim(buf); 3724 spdk_json_write_named_string(w, "model_number", buf); 3725 3726 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3727 spdk_str_trim(buf); 3728 spdk_json_write_named_string(w, "serial_number", buf); 3729 3730 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3731 spdk_str_trim(buf); 3732 spdk_json_write_named_string(w, "firmware_revision", buf); 3733 3734 if (cdata->subnqn[0] != '\0') { 3735 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3736 } 3737 3738 spdk_json_write_named_object_begin(w, "oacs"); 3739 3740 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3741 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3742 
spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3743 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3744 3745 spdk_json_write_object_end(w); 3746 3747 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3748 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3749 3750 spdk_json_write_object_end(w); 3751 3752 spdk_json_write_named_object_begin(w, "vs"); 3753 3754 spdk_json_write_name(w, "nvme_version"); 3755 if (vs.bits.ter) { 3756 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3757 } else { 3758 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3759 } 3760 3761 spdk_json_write_object_end(w); 3762 3763 nsdata = spdk_nvme_ns_get_data(ns); 3764 3765 spdk_json_write_named_object_begin(w, "ns_data"); 3766 3767 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3768 3769 if (cdata->cmic.ana_reporting) { 3770 spdk_json_write_named_string(w, "ana_state", 3771 _nvme_ana_state_str(nvme_ns->ana_state)); 3772 } 3773 3774 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3775 3776 spdk_json_write_object_end(w); 3777 3778 if (cdata->oacs.security) { 3779 spdk_json_write_named_object_begin(w, "security"); 3780 3781 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3782 3783 spdk_json_write_object_end(w); 3784 } 3785 3786 spdk_json_write_object_end(w); 3787 } 3788 3789 static const char * 3790 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3791 { 3792 switch (nbdev->mp_policy) { 3793 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3794 return "active_passive"; 3795 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3796 return "active_active"; 3797 default: 3798 assert(false); 3799 return "invalid"; 3800 } 3801 } 3802 3803 static const char * 3804 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 3805 { 3806 switch (nbdev->mp_selector) { 3807 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 3808 return "round_robin"; 3809 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 3810 return "queue_depth"; 3811 default: 3812 assert(false); 3813 return "invalid"; 3814 } 3815 } 3816 3817 static int 3818 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3819 { 3820 struct nvme_bdev *nvme_bdev = ctx; 3821 struct nvme_ns *nvme_ns; 3822 3823 pthread_mutex_lock(&nvme_bdev->mutex); 3824 spdk_json_write_named_array_begin(w, "nvme"); 3825 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3826 nvme_namespace_info_json(w, nvme_ns); 3827 } 3828 spdk_json_write_array_end(w); 3829 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3830 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 3831 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 3832 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 3833 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 3834 } 3835 } 3836 pthread_mutex_unlock(&nvme_bdev->mutex); 3837 3838 return 0; 3839 } 3840 3841 static void 3842 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3843 { 3844 /* No config per bdev needed */ 3845 } 3846 3847 static uint64_t 3848 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3849 { 3850 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3851 struct nvme_io_path *io_path; 3852 struct nvme_poll_group *group; 3853 uint64_t spin_time = 0; 3854 3855 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3856 group = 
io_path->qpair->group; 3857 3858 if (!group || !group->collect_spin_stat) { 3859 continue; 3860 } 3861 3862 if (group->end_ticks != 0) { 3863 group->spin_ticks += (group->end_ticks - group->start_ticks); 3864 group->end_ticks = 0; 3865 } 3866 3867 spin_time += group->spin_ticks; 3868 group->start_ticks = 0; 3869 group->spin_ticks = 0; 3870 } 3871 3872 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3873 } 3874 3875 static void 3876 bdev_nvme_reset_device_stat(void *ctx) 3877 { 3878 struct nvme_bdev *nbdev = ctx; 3879 3880 if (nbdev->err_stat != NULL) { 3881 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3882 } 3883 } 3884 3885 /* JSON string should be lowercases and underscore delimited string. */ 3886 static void 3887 bdev_nvme_format_nvme_status(char *dst, const char *src) 3888 { 3889 char tmp[256]; 3890 3891 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3892 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3893 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3894 spdk_strlwr(dst); 3895 } 3896 3897 static void 3898 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3899 { 3900 struct nvme_bdev *nbdev = ctx; 3901 struct spdk_nvme_status status = {}; 3902 uint16_t sct, sc; 3903 char status_json[256]; 3904 const char *status_str; 3905 3906 if (nbdev->err_stat == NULL) { 3907 return; 3908 } 3909 3910 spdk_json_write_named_object_begin(w, "nvme_error"); 3911 3912 spdk_json_write_named_object_begin(w, "status_type"); 3913 for (sct = 0; sct < 8; sct++) { 3914 if (nbdev->err_stat->status_type[sct] == 0) { 3915 continue; 3916 } 3917 status.sct = sct; 3918 3919 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3920 assert(status_str != NULL); 3921 bdev_nvme_format_nvme_status(status_json, status_str); 3922 3923 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3924 } 3925 spdk_json_write_object_end(w); 3926 3927 spdk_json_write_named_object_begin(w, "status_code"); 3928 for (sct = 0; sct < 4; sct++) { 3929 status.sct = sct; 3930 for (sc = 0; sc < 256; sc++) { 3931 if (nbdev->err_stat->status[sct][sc] == 0) { 3932 continue; 3933 } 3934 status.sc = sc; 3935 3936 status_str = spdk_nvme_cpl_get_status_string(&status); 3937 assert(status_str != NULL); 3938 bdev_nvme_format_nvme_status(status_json, status_str); 3939 3940 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3941 } 3942 } 3943 spdk_json_write_object_end(w); 3944 3945 spdk_json_write_object_end(w); 3946 } 3947 3948 static bool 3949 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3950 { 3951 struct nvme_bdev *nbdev = ctx; 3952 struct spdk_nvme_ctrlr *ctrlr; 3953 3954 if (!g_opts.allow_accel_sequence) { 3955 return false; 3956 } 3957 3958 switch (type) { 3959 case SPDK_BDEV_IO_TYPE_WRITE: 3960 case SPDK_BDEV_IO_TYPE_READ: 3961 break; 3962 default: 3963 return false; 3964 } 3965 3966 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3967 assert(ctrlr != NULL); 3968 3969 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3970 } 3971 3972 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3973 .destruct = bdev_nvme_destruct, 3974 .submit_request = bdev_nvme_submit_request, 3975 .io_type_supported = bdev_nvme_io_type_supported, 3976 .get_io_channel = bdev_nvme_get_io_channel, 3977 .dump_info_json = bdev_nvme_dump_info_json, 3978 .write_config_json = bdev_nvme_write_config_json, 3979 .get_spin_time = bdev_nvme_get_spin_time, 3980 .get_module_ctx = bdev_nvme_get_module_ctx, 3981 
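/* The entries below are optional hooks: memory-domain discovery,
 * accel-sequence capability reporting, and per-bdev NVMe error statistics.
 */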
.get_memory_domains = bdev_nvme_get_memory_domains, 3982 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3983 .reset_device_stat = bdev_nvme_reset_device_stat, 3984 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3985 }; 3986 3987 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3988 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3989 3990 static int 3991 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3992 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3993 { 3994 struct spdk_nvme_ana_group_descriptor *copied_desc; 3995 uint8_t *orig_desc; 3996 uint32_t i, desc_size, copy_len; 3997 int rc = 0; 3998 3999 if (nvme_ctrlr->ana_log_page == NULL) { 4000 return -EINVAL; 4001 } 4002 4003 copied_desc = nvme_ctrlr->copied_ana_desc; 4004 4005 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 4006 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 4007 4008 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 4009 memcpy(copied_desc, orig_desc, copy_len); 4010 4011 rc = cb_fn(copied_desc, cb_arg); 4012 if (rc != 0) { 4013 break; 4014 } 4015 4016 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 4017 copied_desc->num_of_nsid * sizeof(uint32_t); 4018 orig_desc += desc_size; 4019 copy_len -= desc_size; 4020 } 4021 4022 return rc; 4023 } 4024 4025 static int 4026 nvme_ns_ana_transition_timedout(void *ctx) 4027 { 4028 struct nvme_ns *nvme_ns = ctx; 4029 4030 spdk_poller_unregister(&nvme_ns->anatt_timer); 4031 nvme_ns->ana_transition_timedout = true; 4032 4033 return SPDK_POLLER_BUSY; 4034 } 4035 4036 static void 4037 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4038 const struct spdk_nvme_ana_group_descriptor *desc) 4039 { 4040 const struct spdk_nvme_ctrlr_data *cdata; 4041 4042 nvme_ns->ana_group_id = desc->ana_group_id; 4043 nvme_ns->ana_state = desc->ana_state; 4044 nvme_ns->ana_state_updating = false; 4045 4046 switch (nvme_ns->ana_state) { 4047 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4048 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4049 nvme_ns->ana_transition_timedout = false; 4050 spdk_poller_unregister(&nvme_ns->anatt_timer); 4051 break; 4052 4053 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4054 case SPDK_NVME_ANA_CHANGE_STATE: 4055 if (nvme_ns->anatt_timer != NULL) { 4056 break; 4057 } 4058 4059 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4060 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4061 nvme_ns, 4062 cdata->anatt * SPDK_SEC_TO_USEC); 4063 break; 4064 default: 4065 break; 4066 } 4067 } 4068 4069 static int 4070 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4071 { 4072 struct nvme_ns *nvme_ns = cb_arg; 4073 uint32_t i; 4074 4075 assert(nvme_ns->ns != NULL); 4076 4077 for (i = 0; i < desc->num_of_nsid; i++) { 4078 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4079 continue; 4080 } 4081 4082 _nvme_ns_set_ana_state(nvme_ns, desc); 4083 return 1; 4084 } 4085 4086 return 0; 4087 } 4088 4089 static int 4090 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4091 { 4092 int rc = 0; 4093 struct spdk_uuid new_uuid, namespace_uuid; 4094 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4095 /* This namespace UUID was generated using uuid_generate() method. 
*/ 4096 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4097 int size; 4098 4099 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4100 4101 spdk_uuid_set_null(&new_uuid); 4102 spdk_uuid_set_null(&namespace_uuid); 4103 4104 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4105 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4106 return -EINVAL; 4107 } 4108 4109 spdk_uuid_parse(&namespace_uuid, namespace_str); 4110 4111 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4112 if (rc == 0) { 4113 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4114 } 4115 4116 return rc; 4117 } 4118 4119 static int 4120 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4121 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4122 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx) 4123 { 4124 const struct spdk_uuid *uuid; 4125 const uint8_t *nguid; 4126 const struct spdk_nvme_ctrlr_data *cdata; 4127 const struct spdk_nvme_ns_data *nsdata; 4128 const struct spdk_nvme_ctrlr_opts *opts; 4129 enum spdk_nvme_csi csi; 4130 uint32_t atomic_bs, phys_bs, bs; 4131 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4132 int rc; 4133 4134 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4135 csi = spdk_nvme_ns_get_csi(ns); 4136 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4137 4138 switch (csi) { 4139 case SPDK_NVME_CSI_NVM: 4140 disk->product_name = "NVMe disk"; 4141 break; 4142 case SPDK_NVME_CSI_ZNS: 4143 disk->product_name = "NVMe ZNS disk"; 4144 disk->zoned = true; 4145 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4146 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4147 spdk_nvme_ns_get_extended_sector_size(ns); 4148 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4149 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4150 break; 4151 default: 4152 if (bdev_opts->allow_unrecognized_csi) { 4153 disk->product_name = "NVMe Passthrough disk"; 4154 break; 4155 } 4156 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4157 return -ENOTSUP; 4158 } 4159 4160 nguid = spdk_nvme_ns_get_nguid(ns); 4161 if (!nguid) { 4162 uuid = spdk_nvme_ns_get_uuid(ns); 4163 if (uuid) { 4164 disk->uuid = *uuid; 4165 } else if (g_opts.generate_uuids) { 4166 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4167 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4168 if (rc < 0) { 4169 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4170 return rc; 4171 } 4172 } 4173 } else { 4174 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4175 } 4176 4177 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4178 if (!disk->name) { 4179 return -ENOMEM; 4180 } 4181 4182 disk->write_cache = 0; 4183 if (cdata->vwc.present) { 4184 /* Enable if the Volatile Write Cache exists */ 4185 disk->write_cache = 1; 4186 } 4187 if (cdata->oncs.write_zeroes) { 4188 disk->max_write_zeroes = UINT16_MAX + 1; 4189 } 4190 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4191 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4192 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4193 disk->ctratt.raw = cdata->ctratt.raw; 4194 /* NVMe driver will split one request into multiple requests 4195 * based on MDTS and stripe boundary, the bdev layer will use 4196 * max_segment_size and max_num_segments to split one big IO 4197 * into multiple requests, then small request can't run out 4198 * of NVMe internal 
requests data structure. 4199 */ 4200 if (opts && opts->io_queue_requests) { 4201 disk->max_num_segments = opts->io_queue_requests / 2; 4202 } 4203 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4204 /* The nvme driver will try to split I/O that have too many 4205 * SGEs, but it doesn't work if that last SGE doesn't end on 4206 * an aggregate total that is block aligned. The bdev layer has 4207 * a more robust splitting framework, so use that instead for 4208 * this case. (See issue #3269.) 4209 */ 4210 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4211 4212 if (disk->max_num_segments == 0) { 4213 disk->max_num_segments = max_sges; 4214 } else { 4215 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4216 } 4217 } 4218 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4219 4220 nsdata = spdk_nvme_ns_get_data(ns); 4221 bs = spdk_nvme_ns_get_sector_size(ns); 4222 atomic_bs = bs; 4223 phys_bs = bs; 4224 if (nsdata->nabo == 0) { 4225 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4226 atomic_bs = bs * (1 + nsdata->nawupf); 4227 } else { 4228 atomic_bs = bs * (1 + cdata->awupf); 4229 } 4230 } 4231 if (nsdata->nsfeat.optperf) { 4232 phys_bs = bs * (1 + nsdata->npwg); 4233 } 4234 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4235 4236 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4237 if (disk->md_len != 0) { 4238 disk->md_interleave = nsdata->flbas.extended; 4239 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4240 if (disk->dif_type != SPDK_DIF_DISABLE) { 4241 disk->dif_is_head_of_md = nsdata->dps.md_start; 4242 disk->dif_check_flags = bdev_opts->prchk_flags; 4243 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4244 } 4245 } 4246 4247 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4248 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4249 disk->acwu = 0; 4250 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4251 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4252 } else { 4253 disk->acwu = cdata->acwu + 1; /* 0-based */ 4254 } 4255 4256 if (cdata->oncs.copy) { 4257 /* For now bdev interface allows only single segment copy */ 4258 disk->max_copy = nsdata->mssrl; 4259 } 4260 4261 disk->ctxt = ctx; 4262 disk->fn_table = &nvmelib_fn_table; 4263 disk->module = &nvme_if; 4264 4265 disk->numa.id_valid = 1; 4266 disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 4267 4268 return 0; 4269 } 4270 4271 static struct nvme_bdev * 4272 nvme_bdev_alloc(void) 4273 { 4274 struct nvme_bdev *bdev; 4275 int rc; 4276 4277 bdev = calloc(1, sizeof(*bdev)); 4278 if (!bdev) { 4279 SPDK_ERRLOG("bdev calloc() failed\n"); 4280 return NULL; 4281 } 4282 4283 if (g_opts.nvme_error_stat) { 4284 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4285 if (!bdev->err_stat) { 4286 SPDK_ERRLOG("err_stat calloc() failed\n"); 4287 free(bdev); 4288 return NULL; 4289 } 4290 } 4291 4292 rc = pthread_mutex_init(&bdev->mutex, NULL); 4293 if (rc != 0) { 4294 free(bdev->err_stat); 4295 free(bdev); 4296 return NULL; 4297 } 4298 4299 bdev->ref = 1; 4300 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4301 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4302 bdev->rr_min_io = UINT32_MAX; 4303 TAILQ_INIT(&bdev->nvme_ns_list); 4304 4305 return bdev; 4306 } 4307 4308 static int 4309 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4310 { 4311 struct nvme_bdev *bdev; 4312 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4313 int rc; 4314 4315 bdev = 
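/* Rough sketch of the flow below: allocate the nvme_bdev, fill in the spdk_bdev via
 * nvme_disk_create() (the bdev name is "<base_name>n<nsid>"; for example, a controller created
 * as "Nvme0" would expose namespace 1 as "Nvme0n1" -- the base name here is illustrative),
 * register it as an io_device, and finally expose it with spdk_bdev_register().
 */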
nvme_bdev_alloc(); 4316 if (bdev == NULL) { 4317 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4318 return -ENOMEM; 4319 } 4320 4321 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4322 4323 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4324 nvme_ns->ns, &nvme_ctrlr->opts, bdev); 4325 if (rc != 0) { 4326 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4327 nvme_bdev_free(bdev); 4328 return rc; 4329 } 4330 4331 spdk_io_device_register(bdev, 4332 bdev_nvme_create_bdev_channel_cb, 4333 bdev_nvme_destroy_bdev_channel_cb, 4334 sizeof(struct nvme_bdev_channel), 4335 bdev->disk.name); 4336 4337 nvme_ns->bdev = bdev; 4338 bdev->nsid = nvme_ns->id; 4339 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4340 4341 bdev->nbdev_ctrlr = nbdev_ctrlr; 4342 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4343 4344 rc = spdk_bdev_register(&bdev->disk); 4345 if (rc != 0) { 4346 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4347 spdk_io_device_unregister(bdev, NULL); 4348 nvme_ns->bdev = NULL; 4349 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4350 nvme_bdev_free(bdev); 4351 return rc; 4352 } 4353 4354 return 0; 4355 } 4356 4357 static bool 4358 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4359 { 4360 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4361 const struct spdk_uuid *uuid1, *uuid2; 4362 4363 nsdata1 = spdk_nvme_ns_get_data(ns1); 4364 nsdata2 = spdk_nvme_ns_get_data(ns2); 4365 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4366 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4367 4368 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4369 nsdata1->eui64 == nsdata2->eui64 && 4370 ((uuid1 == NULL && uuid2 == NULL) || 4371 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4372 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4373 } 4374 4375 static bool 4376 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4377 struct spdk_nvme_ctrlr_opts *opts) 4378 { 4379 struct nvme_probe_skip_entry *entry; 4380 4381 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4382 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4383 return false; 4384 } 4385 } 4386 4387 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4388 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4389 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4390 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4391 opts->disable_read_ana_log_page = true; 4392 4393 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4394 4395 return true; 4396 } 4397 4398 static void 4399 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4400 { 4401 struct nvme_ctrlr *nvme_ctrlr = ctx; 4402 4403 if (spdk_nvme_cpl_is_error(cpl)) { 4404 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4405 cpl->status.sct); 4406 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4407 } else if (cpl->cdw0 & 0x1) { 4408 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4409 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4410 } 4411 } 4412 4413 static void 4414 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4415 struct spdk_nvme_qpair *qpair, uint16_t cid) 4416 { 4417 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4418 union spdk_nvme_csts_register csts; 4419 int rc; 4420 4421 assert(nvme_ctrlr->ctrlr == ctrlr); 4422 4423 SPDK_WARNLOG("Warning: Detected a timeout. 
ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4424 4425 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4426 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4427 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4428 * completion recursively. 4429 */ 4430 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4431 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4432 if (csts.bits.cfs) { 4433 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4434 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4435 return; 4436 } 4437 } 4438 4439 switch (g_opts.action_on_timeout) { 4440 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4441 if (qpair) { 4442 /* Don't send abort to ctrlr when ctrlr is not available. */ 4443 pthread_mutex_lock(&nvme_ctrlr->mutex); 4444 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4445 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4446 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4447 return; 4448 } 4449 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4450 4451 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4452 nvme_abort_cpl, nvme_ctrlr); 4453 if (rc == 0) { 4454 return; 4455 } 4456 4457 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4458 } 4459 4460 /* FALLTHROUGH */ 4461 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4462 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4463 break; 4464 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4465 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4466 break; 4467 default: 4468 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4469 break; 4470 } 4471 } 4472 4473 static struct nvme_ns * 4474 nvme_ns_alloc(void) 4475 { 4476 struct nvme_ns *nvme_ns; 4477 4478 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4479 if (nvme_ns == NULL) { 4480 return NULL; 4481 } 4482 4483 if (g_opts.io_path_stat) { 4484 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4485 if (nvme_ns->stat == NULL) { 4486 free(nvme_ns); 4487 return NULL; 4488 } 4489 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4490 } 4491 4492 return nvme_ns; 4493 } 4494 4495 static void 4496 nvme_ns_free(struct nvme_ns *nvme_ns) 4497 { 4498 free(nvme_ns->stat); 4499 free(nvme_ns); 4500 } 4501 4502 static void 4503 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4504 { 4505 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4506 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4507 4508 if (rc == 0) { 4509 nvme_ns->probe_ctx = NULL; 4510 pthread_mutex_lock(&nvme_ctrlr->mutex); 4511 nvme_ctrlr->ref++; 4512 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4513 } else { 4514 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4515 nvme_ns_free(nvme_ns); 4516 } 4517 4518 if (ctx) { 4519 ctx->populates_in_progress--; 4520 if (ctx->populates_in_progress == 0) { 4521 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4522 } 4523 } 4524 } 4525 4526 static void 4527 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4528 { 4529 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4530 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4531 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4532 int rc; 4533 4534 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4535 if (rc != 0) { 4536 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4537 } 4538 4539 spdk_for_each_channel_continue(i, rc); 4540 } 4541 4542 static void 4543 
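/* Channel iterator callback, the counterpart of bdev_nvme_add_io_path() above: if this
 * nvme_bdev_channel has an nvme_io_path for the namespace being removed, delete it, then
 * continue the spdk_for_each_channel() iteration.
 */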
bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4544 { 4545 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4546 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4547 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4548 struct nvme_io_path *io_path; 4549 4550 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4551 if (io_path != NULL) { 4552 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4553 } 4554 4555 spdk_for_each_channel_continue(i, 0); 4556 } 4557 4558 static void 4559 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4560 { 4561 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4562 4563 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4564 } 4565 4566 static void 4567 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4568 { 4569 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4570 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4571 4572 if (status == 0) { 4573 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4574 } else { 4575 /* Delete the added io_paths and fail populating the namespace. */ 4576 spdk_for_each_channel(bdev, 4577 bdev_nvme_delete_io_path, 4578 nvme_ns, 4579 bdev_nvme_add_io_path_failed); 4580 } 4581 } 4582 4583 static int 4584 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4585 { 4586 struct nvme_ns *tmp_ns; 4587 const struct spdk_nvme_ns_data *nsdata; 4588 4589 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4590 if (!nsdata->nmic.can_share) { 4591 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4592 return -EINVAL; 4593 } 4594 4595 pthread_mutex_lock(&bdev->mutex); 4596 4597 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4598 assert(tmp_ns != NULL); 4599 4600 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4601 pthread_mutex_unlock(&bdev->mutex); 4602 SPDK_ERRLOG("Namespaces are not identical.\n"); 4603 return -EINVAL; 4604 } 4605 4606 bdev->ref++; 4607 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4608 nvme_ns->bdev = bdev; 4609 4610 pthread_mutex_unlock(&bdev->mutex); 4611 4612 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
*/ 4613 spdk_for_each_channel(bdev, 4614 bdev_nvme_add_io_path, 4615 nvme_ns, 4616 bdev_nvme_add_io_path_done); 4617 4618 return 0; 4619 } 4620 4621 static void 4622 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4623 { 4624 struct spdk_nvme_ns *ns; 4625 struct nvme_bdev *bdev; 4626 int rc = 0; 4627 4628 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4629 if (!ns) { 4630 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4631 rc = -EINVAL; 4632 goto done; 4633 } 4634 4635 nvme_ns->ns = ns; 4636 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4637 4638 if (nvme_ctrlr->ana_log_page != NULL) { 4639 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4640 } 4641 4642 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4643 if (bdev == NULL) { 4644 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4645 } else { 4646 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4647 if (rc == 0) { 4648 return; 4649 } 4650 } 4651 done: 4652 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4653 } 4654 4655 static void 4656 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4657 { 4658 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4659 4660 assert(nvme_ctrlr != NULL); 4661 4662 pthread_mutex_lock(&nvme_ctrlr->mutex); 4663 4664 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4665 4666 if (nvme_ns->bdev != NULL) { 4667 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4668 return; 4669 } 4670 4671 nvme_ns_free(nvme_ns); 4672 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4673 4674 nvme_ctrlr_release(nvme_ctrlr); 4675 } 4676 4677 static void 4678 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4679 { 4680 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4681 4682 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4683 } 4684 4685 static void 4686 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4687 { 4688 struct nvme_bdev *bdev; 4689 4690 spdk_poller_unregister(&nvme_ns->anatt_timer); 4691 4692 bdev = nvme_ns->bdev; 4693 if (bdev != NULL) { 4694 pthread_mutex_lock(&bdev->mutex); 4695 4696 assert(bdev->ref > 0); 4697 bdev->ref--; 4698 if (bdev->ref == 0) { 4699 pthread_mutex_unlock(&bdev->mutex); 4700 4701 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4702 } else { 4703 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4704 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4705 * and clear nvme_ns->bdev here. 4706 */ 4707 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4708 nvme_ns->bdev = NULL; 4709 4710 pthread_mutex_unlock(&bdev->mutex); 4711 4712 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4713 * we call depopulate_namespace_done() to avoid use-after-free. 4714 */ 4715 spdk_for_each_channel(bdev, 4716 bdev_nvme_delete_io_path, 4717 nvme_ns, 4718 bdev_nvme_delete_io_path_done); 4719 return; 4720 } 4721 } 4722 4723 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4724 } 4725 4726 static void 4727 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4728 struct nvme_async_probe_ctx *ctx) 4729 { 4730 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4731 struct nvme_ns *nvme_ns, *next; 4732 struct spdk_nvme_ns *ns; 4733 struct nvme_bdev *bdev; 4734 uint32_t nsid; 4735 int rc; 4736 uint64_t num_sectors; 4737 4738 if (ctx) { 4739 /* Initialize this count to 1 to handle the populate functions 4740 * calling nvme_ctrlr_populate_namespace_done() immediately. 
4741 */ 4742 ctx->populates_in_progress = 1; 4743 } 4744 4745 /* First loop over our existing namespaces and see if they have been 4746 * removed. */ 4747 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4748 while (nvme_ns != NULL) { 4749 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4750 4751 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4752 /* NS is still there or added again. Its attributes may have changed. */ 4753 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4754 if (nvme_ns->ns != ns) { 4755 assert(nvme_ns->ns == NULL); 4756 nvme_ns->ns = ns; 4757 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4758 } 4759 4760 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4761 bdev = nvme_ns->bdev; 4762 assert(bdev != NULL); 4763 if (bdev->disk.blockcnt != num_sectors) { 4764 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4765 nvme_ns->id, 4766 bdev->disk.name, 4767 bdev->disk.blockcnt, 4768 num_sectors); 4769 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4770 if (rc != 0) { 4771 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4772 bdev->disk.name, rc); 4773 } 4774 } 4775 } else { 4776 /* Namespace was removed */ 4777 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4778 } 4779 4780 nvme_ns = next; 4781 } 4782 4783 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4784 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4785 while (nsid != 0) { 4786 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4787 4788 if (nvme_ns == NULL) { 4789 /* Found a new one */ 4790 nvme_ns = nvme_ns_alloc(); 4791 if (nvme_ns == NULL) { 4792 SPDK_ERRLOG("Failed to allocate namespace\n"); 4793 /* This just skips attaching this namespace; it may work on a future attempt. Advance to the next active NSID before continuing so an allocation failure cannot spin this loop forever on the same NSID. */ 4794 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); continue; 4795 } 4796 4797 nvme_ns->id = nsid; 4798 nvme_ns->ctrlr = nvme_ctrlr; 4799 4800 nvme_ns->bdev = NULL; 4801 4802 if (ctx) { 4803 ctx->populates_in_progress++; 4804 } 4805 nvme_ns->probe_ctx = ctx; 4806 4807 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4808 4809 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4810 } 4811 4812 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4813 } 4814 4815 if (ctx) { 4816 /* Decrement this count now that the loop is over to account 4817 * for the one we started with. If the count is then 0, we 4818 * know any populate_namespace functions completed immediately, 4819 * so we'll kick the callback here.
4820 */ 4821 ctx->populates_in_progress--; 4822 if (ctx->populates_in_progress == 0) { 4823 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4824 } 4825 } 4826 4827 } 4828 4829 static void 4830 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4831 { 4832 struct nvme_ns *nvme_ns, *tmp; 4833 4834 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4835 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4836 } 4837 } 4838 4839 static uint32_t 4840 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4841 { 4842 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4843 const struct spdk_nvme_ctrlr_data *cdata; 4844 uint32_t nsid, ns_count = 0; 4845 4846 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4847 4848 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4849 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4850 ns_count++; 4851 } 4852 4853 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4854 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4855 sizeof(uint32_t); 4856 } 4857 4858 static int 4859 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4860 void *cb_arg) 4861 { 4862 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4863 struct nvme_ns *nvme_ns; 4864 uint32_t i, nsid; 4865 4866 for (i = 0; i < desc->num_of_nsid; i++) { 4867 nsid = desc->nsid[i]; 4868 if (nsid == 0) { 4869 continue; 4870 } 4871 4872 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4873 4874 if (nvme_ns == NULL) { 4875 /* Target told us that an inactive namespace had an ANA change */ 4876 continue; 4877 } 4878 4879 _nvme_ns_set_ana_state(nvme_ns, desc); 4880 } 4881 4882 return 0; 4883 } 4884 4885 static void 4886 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4887 { 4888 struct nvme_ns *nvme_ns; 4889 4890 spdk_free(nvme_ctrlr->ana_log_page); 4891 nvme_ctrlr->ana_log_page = NULL; 4892 4893 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4894 nvme_ns != NULL; 4895 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4896 nvme_ns->ana_state_updating = false; 4897 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4898 } 4899 } 4900 4901 static void 4902 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4903 { 4904 struct nvme_ctrlr *nvme_ctrlr = ctx; 4905 4906 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4907 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4908 nvme_ctrlr); 4909 } else { 4910 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4911 } 4912 4913 pthread_mutex_lock(&nvme_ctrlr->mutex); 4914 4915 assert(nvme_ctrlr->ana_log_page_updating == true); 4916 nvme_ctrlr->ana_log_page_updating = false; 4917 4918 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4919 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4920 4921 nvme_ctrlr_unregister(nvme_ctrlr); 4922 } else { 4923 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4924 4925 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4926 } 4927 } 4928 4929 static int 4930 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4931 { 4932 uint32_t ana_log_page_size; 4933 int rc; 4934 4935 if (nvme_ctrlr->ana_log_page == NULL) { 4936 return -EINVAL; 4937 } 4938 4939 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4940 4941 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4942 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4943 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4944 return -EINVAL; 4945 } 4946 
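/* ana_log_page_updating is toggled under the controller mutex so that back-to-back ANA-change
 * events keep at most one Get Log Page command outstanding per controller; a request that
 * arrives while an update is already in flight simply returns -EBUSY, as seen below.
 */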
4947 pthread_mutex_lock(&nvme_ctrlr->mutex); 4948 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4949 nvme_ctrlr->ana_log_page_updating) { 4950 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4951 return -EBUSY; 4952 } 4953 4954 nvme_ctrlr->ana_log_page_updating = true; 4955 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4956 4957 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4958 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4959 SPDK_NVME_GLOBAL_NS_TAG, 4960 nvme_ctrlr->ana_log_page, 4961 ana_log_page_size, 0, 4962 nvme_ctrlr_read_ana_log_page_done, 4963 nvme_ctrlr); 4964 if (rc != 0) { 4965 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4966 } 4967 4968 return rc; 4969 } 4970 4971 static void 4972 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4973 { 4974 } 4975 4976 struct bdev_nvme_set_preferred_path_ctx { 4977 struct spdk_bdev_desc *desc; 4978 struct nvme_ns *nvme_ns; 4979 bdev_nvme_set_preferred_path_cb cb_fn; 4980 void *cb_arg; 4981 }; 4982 4983 static void 4984 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4985 { 4986 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4987 4988 assert(ctx != NULL); 4989 assert(ctx->desc != NULL); 4990 assert(ctx->cb_fn != NULL); 4991 4992 spdk_bdev_close(ctx->desc); 4993 4994 ctx->cb_fn(ctx->cb_arg, status); 4995 4996 free(ctx); 4997 } 4998 4999 static void 5000 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 5001 { 5002 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5003 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5004 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5005 struct nvme_io_path *io_path, *prev; 5006 5007 prev = NULL; 5008 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5009 if (io_path->nvme_ns == ctx->nvme_ns) { 5010 break; 5011 } 5012 prev = io_path; 5013 } 5014 5015 if (io_path != NULL) { 5016 if (prev != NULL) { 5017 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 5018 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 5019 } 5020 5021 /* We can set io_path to nbdev_ch->current_io_path directly here. 5022 * However, it needs to be conditional. To simplify the code, 5023 * just clear nbdev_ch->current_io_path and let find_io_path() 5024 * fill it. 5025 * 5026 * Automatic failback may be disabled. Hence even if the io_path is 5027 * already at the head, clear nbdev_ch->current_io_path. 5028 */ 5029 bdev_nvme_clear_current_io_path(nbdev_ch); 5030 } 5031 5032 spdk_for_each_channel_continue(i, 0); 5033 } 5034 5035 static struct nvme_ns * 5036 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5037 { 5038 struct nvme_ns *nvme_ns, *prev; 5039 const struct spdk_nvme_ctrlr_data *cdata; 5040 5041 prev = NULL; 5042 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5043 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5044 5045 if (cdata->cntlid == cntlid) { 5046 break; 5047 } 5048 prev = nvme_ns; 5049 } 5050 5051 if (nvme_ns != NULL && prev != NULL) { 5052 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5053 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5054 } 5055 5056 return nvme_ns; 5057 } 5058 5059 /* This function supports only multipath mode. There is only a single I/O path 5060 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5061 * head of the I/O path list for each NVMe bdev channel. 
5062 * 5063 * NVMe bdev channel may be acquired after completing this function. move the 5064 * matched namespace to the head of the namespace list for the NVMe bdev too. 5065 */ 5066 void 5067 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5068 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5069 { 5070 struct bdev_nvme_set_preferred_path_ctx *ctx; 5071 struct spdk_bdev *bdev; 5072 struct nvme_bdev *nbdev; 5073 int rc = 0; 5074 5075 assert(cb_fn != NULL); 5076 5077 ctx = calloc(1, sizeof(*ctx)); 5078 if (ctx == NULL) { 5079 SPDK_ERRLOG("Failed to alloc context.\n"); 5080 rc = -ENOMEM; 5081 goto err_alloc; 5082 } 5083 5084 ctx->cb_fn = cb_fn; 5085 ctx->cb_arg = cb_arg; 5086 5087 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5088 if (rc != 0) { 5089 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5090 goto err_open; 5091 } 5092 5093 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5094 5095 if (bdev->module != &nvme_if) { 5096 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5097 rc = -ENODEV; 5098 goto err_bdev; 5099 } 5100 5101 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5102 5103 pthread_mutex_lock(&nbdev->mutex); 5104 5105 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5106 if (ctx->nvme_ns == NULL) { 5107 pthread_mutex_unlock(&nbdev->mutex); 5108 5109 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5110 rc = -ENODEV; 5111 goto err_bdev; 5112 } 5113 5114 pthread_mutex_unlock(&nbdev->mutex); 5115 5116 spdk_for_each_channel(nbdev, 5117 _bdev_nvme_set_preferred_path, 5118 ctx, 5119 bdev_nvme_set_preferred_path_done); 5120 return; 5121 5122 err_bdev: 5123 spdk_bdev_close(ctx->desc); 5124 err_open: 5125 free(ctx); 5126 err_alloc: 5127 cb_fn(cb_arg, rc); 5128 } 5129 5130 struct bdev_nvme_set_multipath_policy_ctx { 5131 struct spdk_bdev_desc *desc; 5132 spdk_bdev_nvme_set_multipath_policy_cb cb_fn; 5133 void *cb_arg; 5134 }; 5135 5136 static void 5137 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5138 { 5139 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5140 5141 assert(ctx != NULL); 5142 assert(ctx->desc != NULL); 5143 assert(ctx->cb_fn != NULL); 5144 5145 spdk_bdev_close(ctx->desc); 5146 5147 ctx->cb_fn(ctx->cb_arg, status); 5148 5149 free(ctx); 5150 } 5151 5152 static void 5153 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5154 { 5155 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5156 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5157 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5158 5159 nbdev_ch->mp_policy = nbdev->mp_policy; 5160 nbdev_ch->mp_selector = nbdev->mp_selector; 5161 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5162 bdev_nvme_clear_current_io_path(nbdev_ch); 5163 5164 spdk_for_each_channel_continue(i, 0); 5165 } 5166 5167 void 5168 spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy, 5169 enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5170 spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5171 { 5172 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5173 struct spdk_bdev *bdev; 5174 struct nvme_bdev *nbdev; 5175 int rc; 5176 5177 assert(cb_fn != NULL); 5178 5179 switch (policy) { 5180 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5181 break; 5182 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5183 switch (selector) { 5184 case 
BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5185 if (rr_min_io == UINT32_MAX) { 5186 rr_min_io = 1; 5187 } else if (rr_min_io == 0) { 5188 rc = -EINVAL; 5189 goto exit; 5190 } 5191 break; 5192 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5193 break; 5194 default: 5195 rc = -EINVAL; 5196 goto exit; 5197 } 5198 break; 5199 default: 5200 rc = -EINVAL; 5201 goto exit; 5202 } 5203 5204 ctx = calloc(1, sizeof(*ctx)); 5205 if (ctx == NULL) { 5206 SPDK_ERRLOG("Failed to alloc context.\n"); 5207 rc = -ENOMEM; 5208 goto exit; 5209 } 5210 5211 ctx->cb_fn = cb_fn; 5212 ctx->cb_arg = cb_arg; 5213 5214 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5215 if (rc != 0) { 5216 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5217 rc = -ENODEV; 5218 goto err_open; 5219 } 5220 5221 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5222 if (bdev->module != &nvme_if) { 5223 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5224 rc = -ENODEV; 5225 goto err_module; 5226 } 5227 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5228 5229 pthread_mutex_lock(&nbdev->mutex); 5230 nbdev->mp_policy = policy; 5231 nbdev->mp_selector = selector; 5232 nbdev->rr_min_io = rr_min_io; 5233 pthread_mutex_unlock(&nbdev->mutex); 5234 5235 spdk_for_each_channel(nbdev, 5236 _bdev_nvme_set_multipath_policy, 5237 ctx, 5238 bdev_nvme_set_multipath_policy_done); 5239 return; 5240 5241 err_module: 5242 spdk_bdev_close(ctx->desc); 5243 err_open: 5244 free(ctx); 5245 exit: 5246 cb_fn(cb_arg, rc); 5247 } 5248 5249 static void 5250 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5251 { 5252 struct nvme_ctrlr *nvme_ctrlr = arg; 5253 union spdk_nvme_async_event_completion event; 5254 5255 if (spdk_nvme_cpl_is_error(cpl)) { 5256 SPDK_WARNLOG("AER request execute failed\n"); 5257 return; 5258 } 5259 5260 event.raw = cpl->cdw0; 5261 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5262 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5263 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5264 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5265 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5266 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5267 } 5268 } 5269 5270 static void 5271 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5272 { 5273 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5274 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5275 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5276 free(ctx); 5277 } 5278 5279 static void 5280 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5281 { 5282 if (ctx->cb_fn) { 5283 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5284 } 5285 5286 ctx->namespaces_populated = true; 5287 if (ctx->probe_done) { 5288 /* The probe was already completed, so we need to free the context 5289 * here. This can happen for cases like OCSSD, where we need to 5290 * send additional commands to the SSD after attach. 
5291 */ 5292 free_nvme_async_probe_ctx(ctx); 5293 } 5294 } 5295 5296 static int 5297 bdev_nvme_remove_poller(void *ctx) 5298 { 5299 struct spdk_nvme_transport_id trid_pcie; 5300 5301 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5302 spdk_poller_unregister(&g_hotplug_poller); 5303 return SPDK_POLLER_IDLE; 5304 } 5305 5306 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5307 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5308 5309 if (spdk_nvme_scan_attached(&trid_pcie)) { 5310 SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n"); 5311 } 5312 5313 return SPDK_POLLER_BUSY; 5314 } 5315 5316 static void 5317 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5318 struct nvme_async_probe_ctx *ctx) 5319 { 5320 spdk_io_device_register(nvme_ctrlr, 5321 bdev_nvme_create_ctrlr_channel_cb, 5322 bdev_nvme_destroy_ctrlr_channel_cb, 5323 sizeof(struct nvme_ctrlr_channel), 5324 nvme_ctrlr->nbdev_ctrlr->name); 5325 5326 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5327 5328 if (g_hotplug_poller == NULL) { 5329 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5330 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5331 } 5332 } 5333 5334 static void 5335 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5336 { 5337 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5338 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5339 5340 nvme_ctrlr->probe_ctx = NULL; 5341 5342 if (spdk_nvme_cpl_is_error(cpl)) { 5343 nvme_ctrlr_delete(nvme_ctrlr); 5344 5345 if (ctx != NULL) { 5346 ctx->reported_bdevs = 0; 5347 populate_namespaces_cb(ctx, -1); 5348 } 5349 return; 5350 } 5351 5352 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5353 } 5354 5355 static int 5356 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5357 struct nvme_async_probe_ctx *ctx) 5358 { 5359 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5360 const struct spdk_nvme_ctrlr_data *cdata; 5361 uint32_t ana_log_page_size; 5362 5363 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5364 5365 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5366 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5367 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5368 sizeof(uint32_t); 5369 5370 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5371 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 5372 if (nvme_ctrlr->ana_log_page == NULL) { 5373 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5374 return -ENXIO; 5375 } 5376 5377 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5378 * Hence copy each descriptor to a temporary area when parsing it. 5379 * 5380 * Allocate a buffer whose size is as large as ANA log page buffer because 5381 * we do not know the size of a descriptor until actually reading it. 5382 */ 5383 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5384 if (nvme_ctrlr->copied_ana_desc == NULL) { 5385 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5386 return -ENOMEM; 5387 } 5388 5389 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5390 5391 nvme_ctrlr->probe_ctx = ctx; 5392 5393 /* Then, set the read size only to include the current active namespaces. 
*/ 5394 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5395 5396 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5397 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5398 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5399 return -EINVAL; 5400 } 5401 5402 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5403 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5404 SPDK_NVME_GLOBAL_NS_TAG, 5405 nvme_ctrlr->ana_log_page, 5406 ana_log_page_size, 0, 5407 nvme_ctrlr_init_ana_log_page_done, 5408 nvme_ctrlr); 5409 } 5410 5411 /* hostnqn and subnqn were already verified before attaching a controller. 5412 * Hence check only the multipath capability and cntlid here. 5413 */ 5414 static bool 5415 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5416 { 5417 struct nvme_ctrlr *tmp; 5418 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5419 5420 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5421 5422 if (!cdata->cmic.multi_ctrlr) { 5423 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5424 return false; 5425 } 5426 5427 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5428 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5429 5430 if (!tmp_cdata->cmic.multi_ctrlr) { 5431 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", tmp_cdata->cntlid); 5432 return false; 5433 } 5434 if (cdata->cntlid == tmp_cdata->cntlid) { 5435 SPDK_ERRLOG("cntlid %u is duplicated.\n", tmp_cdata->cntlid); 5436 return false; 5437 } 5438 } 5439 5440 return true; 5441 } 5442 5443 static int 5444 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5445 { 5446 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5447 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5448 int rc = 0; 5449 5450 pthread_mutex_lock(&g_bdev_nvme_mutex); 5451 5452 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5453 if (nbdev_ctrlr != NULL) { 5454 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5455 rc = -EINVAL; 5456 goto exit; 5457 } 5458 } else { 5459 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5460 if (nbdev_ctrlr == NULL) { 5461 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5462 rc = -ENOMEM; 5463 goto exit; 5464 } 5465 nbdev_ctrlr->name = strdup(name); 5466 if (nbdev_ctrlr->name == NULL) { 5467 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); rc = -ENOMEM; 5468 free(nbdev_ctrlr); 5469 goto exit; 5470 } 5471 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5472 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5473 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5474 } 5475 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5476 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5477 exit: 5478 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5479 return rc; 5480 } 5481 5482 static int 5483 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5484 const char *name, 5485 const struct spdk_nvme_transport_id *trid, 5486 struct nvme_async_probe_ctx *ctx) 5487 { 5488 struct nvme_ctrlr *nvme_ctrlr; 5489 struct nvme_path_id *path_id; 5490 const struct spdk_nvme_ctrlr_data *cdata; 5491 int rc; 5492 5493 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5494 if (nvme_ctrlr == NULL) { 5495 SPDK_ERRLOG("Failed to allocate device struct\n"); 5496 return -ENOMEM; 5497 } 5498 5499 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5500 if (rc != 0) { 5501 free(nvme_ctrlr); 5502 return rc; 5503 } 5504 5505 TAILQ_INIT(&nvme_ctrlr->trids); 5506 RB_INIT(&nvme_ctrlr->namespaces); 5507 5508 /* Get another reference to the key, so the first one can be released
from probe_ctx */ 5509 if (ctx != NULL) { 5510 if (ctx->drv_opts.tls_psk != NULL) { 5511 nvme_ctrlr->psk = spdk_keyring_get_key( 5512 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5513 if (nvme_ctrlr->psk == NULL) { 5514 /* Could only happen if the key was removed in the meantime */ 5515 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5516 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5517 rc = -ENOKEY; 5518 goto err; 5519 } 5520 } 5521 5522 if (ctx->drv_opts.dhchap_key != NULL) { 5523 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5524 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5525 if (nvme_ctrlr->dhchap_key == NULL) { 5526 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5527 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5528 rc = -ENOKEY; 5529 goto err; 5530 } 5531 } 5532 5533 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5534 nvme_ctrlr->dhchap_ctrlr_key = 5535 spdk_keyring_get_key( 5536 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5537 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5538 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5539 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5540 rc = -ENOKEY; 5541 goto err; 5542 } 5543 } 5544 } 5545 5546 path_id = calloc(1, sizeof(*path_id)); 5547 if (path_id == NULL) { 5548 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5549 rc = -ENOMEM; 5550 goto err; 5551 } 5552 5553 path_id->trid = *trid; 5554 if (ctx != NULL) { 5555 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5556 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5557 } 5558 nvme_ctrlr->active_path_id = path_id; 5559 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5560 5561 nvme_ctrlr->thread = spdk_get_thread(); 5562 nvme_ctrlr->ctrlr = ctrlr; 5563 nvme_ctrlr->ref = 1; 5564 5565 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5566 SPDK_ERRLOG("OCSSDs are not supported"); 5567 rc = -ENOTSUP; 5568 goto err; 5569 } 5570 5571 if (ctx != NULL) { 5572 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5573 } else { 5574 spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5575 } 5576 5577 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5578 g_opts.nvme_adminq_poll_period_us); 5579 5580 if (g_opts.timeout_us > 0) { 5581 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5582 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5583 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5584 g_opts.timeout_us : g_opts.timeout_admin_us; 5585 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5586 adm_timeout_us, timeout_cb, nvme_ctrlr); 5587 } 5588 5589 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5590 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5591 5592 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5593 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5594 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5595 } 5596 5597 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5598 if (rc != 0) { 5599 goto err; 5600 } 5601 5602 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5603 5604 if (cdata->cmic.ana_reporting) { 5605 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5606 if (rc == 0) { 5607 return 0; 5608 } 5609 } else { 5610 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5611 return 0; 5612 } 5613 5614 err: 5615 nvme_ctrlr_delete(nvme_ctrlr); 5616 return rc; 5617 } 5618 5619 void 5620 spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts) 5621 { 5622 opts->prchk_flags = 0; 5623 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5624 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5625 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5626 } 5627 5628 static void 5629 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5630 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5631 { 5632 char *name; 5633 5634 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5635 if (!name) { 5636 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5637 return; 5638 } 5639 5640 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5641 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5642 } else { 5643 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5644 } 5645 5646 free(name); 5647 } 5648 5649 static void 5650 _nvme_ctrlr_destruct(void *ctx) 5651 { 5652 struct nvme_ctrlr *nvme_ctrlr = ctx; 5653 5654 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5655 nvme_ctrlr_release(nvme_ctrlr); 5656 } 5657 5658 static int 5659 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5660 { 5661 struct nvme_probe_skip_entry *entry; 5662 5663 /* The controller's destruction was already started */ 5664 if (nvme_ctrlr->destruct) { 5665 return -EALREADY; 5666 } 5667 5668 if (!hotplug && 5669 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5670 entry = calloc(1, sizeof(*entry)); 5671 if (!entry) { 5672 return -ENOMEM; 5673 } 5674 entry->trid = nvme_ctrlr->active_path_id->trid; 5675 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5676 } 5677 5678 nvme_ctrlr->destruct = true; 5679 return 0; 5680 } 5681 5682 static int 5683 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5684 { 5685 int rc; 5686 5687 pthread_mutex_lock(&nvme_ctrlr->mutex); 5688 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5689 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5690 5691 if (rc == 0) { 5692 _nvme_ctrlr_destruct(nvme_ctrlr); 5693 } else if (rc == -EALREADY) { 5694 rc = 0; 5695 } 5696 5697 return rc; 5698 } 5699 5700 static void 5701 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5702 { 5703 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5704 5705 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5706 } 5707 5708 static int 5709 bdev_nvme_hotplug_probe(void *arg) 5710 { 5711 if (g_hotplug_probe_ctx == NULL) { 5712 
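/* No async probe is in flight (it either completed or was never started), so stop this poller;
 * bdev_nvme_hotplug() registers a new one the next time it kicks off spdk_nvme_probe_async().
 */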
spdk_poller_unregister(&g_hotplug_probe_poller); 5713 return SPDK_POLLER_IDLE; 5714 } 5715 5716 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5717 g_hotplug_probe_ctx = NULL; 5718 spdk_poller_unregister(&g_hotplug_probe_poller); 5719 } 5720 5721 return SPDK_POLLER_BUSY; 5722 } 5723 5724 static int 5725 bdev_nvme_hotplug(void *arg) 5726 { 5727 struct spdk_nvme_transport_id trid_pcie; 5728 5729 if (g_hotplug_probe_ctx) { 5730 return SPDK_POLLER_BUSY; 5731 } 5732 5733 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5734 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5735 5736 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5737 hotplug_probe_cb, attach_cb, NULL); 5738 5739 if (g_hotplug_probe_ctx) { 5740 assert(g_hotplug_probe_poller == NULL); 5741 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5742 } 5743 5744 return SPDK_POLLER_BUSY; 5745 } 5746 5747 void 5748 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5749 { 5750 *opts = g_opts; 5751 } 5752 5753 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5754 uint32_t reconnect_delay_sec, 5755 uint32_t fast_io_fail_timeout_sec); 5756 5757 static int 5758 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5759 { 5760 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5761 /* Can't set timeout_admin_us without also setting timeout_us */ 5762 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5763 return -EINVAL; 5764 } 5765 5766 if (opts->bdev_retry_count < -1) { 5767 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5768 return -EINVAL; 5769 } 5770 5771 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5772 opts->reconnect_delay_sec, 5773 opts->fast_io_fail_timeout_sec)) { 5774 return -EINVAL; 5775 } 5776 5777 return 0; 5778 } 5779 5780 int 5781 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5782 { 5783 int ret; 5784 5785 ret = bdev_nvme_validate_opts(opts); 5786 if (ret) { 5787 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5788 return ret; 5789 } 5790 5791 if (g_bdev_nvme_init_thread != NULL) { 5792 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5793 return -EPERM; 5794 } 5795 } 5796 5797 if (opts->rdma_srq_size != 0 || 5798 opts->rdma_max_cq_size != 0 || 5799 opts->rdma_cm_event_timeout_ms != 0) { 5800 struct spdk_nvme_transport_opts drv_opts; 5801 5802 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5803 if (opts->rdma_srq_size != 0) { 5804 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5805 } 5806 if (opts->rdma_max_cq_size != 0) { 5807 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5808 } 5809 if (opts->rdma_cm_event_timeout_ms != 0) { 5810 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5811 } 5812 5813 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5814 if (ret) { 5815 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5816 return ret; 5817 } 5818 } 5819 5820 g_opts = *opts; 5821 5822 return 0; 5823 } 5824 5825 struct set_nvme_hotplug_ctx { 5826 uint64_t period_us; 5827 bool enabled; 5828 spdk_msg_fn fn; 5829 void *fn_ctx; 5830 }; 5831 5832 static void 5833 set_nvme_hotplug_period_cb(void *_ctx) 5834 { 5835 struct set_nvme_hotplug_ctx *ctx = _ctx; 5836 5837 spdk_poller_unregister(&g_hotplug_poller); 5838 if (ctx->enabled) { 5839 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5840 } else { 5841 
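/* Hotplug insertion is disabled: keep only the remove poller so that surprise-removed PCIe
 * controllers are still detached. As an illustrative example (exact RPC flag names may vary),
 * disabling hotplug through the bdev_nvme_set_hotplug RPC reaches this branch via
 * bdev_nvme_set_hotplug() below.
 */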
g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5842 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5843 } 5844 5845 g_nvme_hotplug_poll_period_us = ctx->period_us; 5846 g_nvme_hotplug_enabled = ctx->enabled; 5847 if (ctx->fn) { 5848 ctx->fn(ctx->fn_ctx); 5849 } 5850 5851 free(ctx); 5852 } 5853 5854 int 5855 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5856 { 5857 struct set_nvme_hotplug_ctx *ctx; 5858 5859 if (enabled == true && !spdk_process_is_primary()) { 5860 return -EPERM; 5861 } 5862 5863 ctx = calloc(1, sizeof(*ctx)); 5864 if (ctx == NULL) { 5865 return -ENOMEM; 5866 } 5867 5868 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5869 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5870 ctx->enabled = enabled; 5871 ctx->fn = cb; 5872 ctx->fn_ctx = cb_ctx; 5873 5874 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5875 return 0; 5876 } 5877 5878 static void 5879 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5880 struct nvme_async_probe_ctx *ctx) 5881 { 5882 struct nvme_ns *nvme_ns; 5883 struct nvme_bdev *nvme_bdev; 5884 size_t j; 5885 5886 assert(nvme_ctrlr != NULL); 5887 5888 if (ctx->names == NULL) { 5889 ctx->reported_bdevs = 0; 5890 populate_namespaces_cb(ctx, 0); 5891 return; 5892 } 5893 5894 /* 5895 * Report the new bdevs that were created in this call. 5896 * There can be more than one bdev per NVMe controller. 5897 */ 5898 j = 0; 5899 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5900 while (nvme_ns != NULL) { 5901 nvme_bdev = nvme_ns->bdev; 5902 if (j < ctx->max_bdevs) { 5903 ctx->names[j] = nvme_bdev->disk.name; 5904 j++; 5905 } else { 5906 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %zu. Unable to return all names of created bdevs\n", 5907 ctx->max_bdevs); 5908 ctx->reported_bdevs = 0; 5909 populate_namespaces_cb(ctx, -ERANGE); 5910 return; 5911 } 5912 5913 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5914 } 5915 5916 ctx->reported_bdevs = j; 5917 populate_namespaces_cb(ctx, 0); 5918 } 5919 5920 static int 5921 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5922 struct spdk_nvme_ctrlr *new_ctrlr, 5923 struct spdk_nvme_transport_id *trid) 5924 { 5925 struct nvme_path_id *tmp_trid; 5926 5927 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5928 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5929 return -ENOTSUP; 5930 } 5931 5932 /* Currently we only support failover to the same transport type. */ 5933 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5934 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5935 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5936 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5937 return -EINVAL; 5938 } 5939 5940 5941 /* Currently we only support failover to the same NQN. */ 5942 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5943 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5944 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5945 return -EINVAL; 5946 } 5947 5948 /* Skip all the other checks if we've already registered this path.
*/ 5949 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5950 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5951 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5952 trid->subnqn); 5953 return -EALREADY; 5954 } 5955 } 5956 5957 return 0; 5958 } 5959 5960 static int 5961 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5962 struct spdk_nvme_ctrlr *new_ctrlr) 5963 { 5964 struct nvme_ns *nvme_ns; 5965 struct spdk_nvme_ns *new_ns; 5966 5967 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5968 while (nvme_ns != NULL) { 5969 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5970 assert(new_ns != NULL); 5971 5972 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5973 return -EINVAL; 5974 } 5975 5976 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5977 } 5978 5979 return 0; 5980 } 5981 5982 static int 5983 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5984 struct spdk_nvme_transport_id *trid) 5985 { 5986 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5987 5988 new_trid = calloc(1, sizeof(*new_trid)); 5989 if (new_trid == NULL) { 5990 return -ENOMEM; 5991 } 5992 new_trid->trid = *trid; 5993 5994 active_id = nvme_ctrlr->active_path_id; 5995 assert(active_id != NULL); 5996 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5997 5998 /* Skip the active trid so that it is not replaced until it has failed. */ 5999 tmp_trid = TAILQ_NEXT(active_id, link); 6000 if (tmp_trid == NULL) { 6001 goto add_tail; 6002 } 6003 6004 /* A trid is considered failed if its last failed time is non-zero. 6005 * Insert the new alternate trid before any failed trid. 6006 */ 6007 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 6008 if (tmp_trid->last_failed_tsc != 0) { 6009 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 6010 return 0; 6011 } 6012 } 6013 6014 add_tail: 6015 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 6016 return 0; 6017 } 6018 6019 /* This is the case where a secondary path is added to an existing 6020 * nvme_ctrlr for failover. After checking if it can access the same 6021 * namespaces as the primary path, it is disconnected until failover occurs.
6022 */ 6023 static int 6024 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6025 struct spdk_nvme_ctrlr *new_ctrlr, 6026 struct spdk_nvme_transport_id *trid) 6027 { 6028 int rc; 6029 6030 assert(nvme_ctrlr != NULL); 6031 6032 pthread_mutex_lock(&nvme_ctrlr->mutex); 6033 6034 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 6035 if (rc != 0) { 6036 goto exit; 6037 } 6038 6039 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 6040 if (rc != 0) { 6041 goto exit; 6042 } 6043 6044 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 6045 6046 exit: 6047 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6048 6049 spdk_nvme_detach(new_ctrlr); 6050 6051 return rc; 6052 } 6053 6054 static void 6055 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6056 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6057 { 6058 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6059 struct nvme_async_probe_ctx *ctx; 6060 int rc; 6061 6062 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6063 ctx->ctrlr_attached = true; 6064 6065 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6066 if (rc != 0) { 6067 ctx->reported_bdevs = 0; 6068 populate_namespaces_cb(ctx, rc); 6069 } 6070 } 6071 6072 static void 6073 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6074 struct spdk_nvme_ctrlr *ctrlr, 6075 const struct spdk_nvme_ctrlr_opts *opts) 6076 { 6077 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6078 struct nvme_ctrlr *nvme_ctrlr; 6079 struct nvme_async_probe_ctx *ctx; 6080 int rc; 6081 6082 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6083 ctx->ctrlr_attached = true; 6084 6085 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6086 if (nvme_ctrlr) { 6087 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6088 } else { 6089 rc = -ENODEV; 6090 } 6091 6092 ctx->reported_bdevs = 0; 6093 populate_namespaces_cb(ctx, rc); 6094 } 6095 6096 static int 6097 bdev_nvme_async_poll(void *arg) 6098 { 6099 struct nvme_async_probe_ctx *ctx = arg; 6100 int rc; 6101 6102 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6103 if (spdk_unlikely(rc != -EAGAIN)) { 6104 ctx->probe_done = true; 6105 spdk_poller_unregister(&ctx->poller); 6106 if (!ctx->ctrlr_attached) { 6107 /* The probe is done, but no controller was attached. 6108 * That means we had a failure, so report -EIO back to 6109 * the caller (usually the RPC). populate_namespaces_cb() 6110 * will take care of freeing the nvme_async_probe_ctx. 6111 */ 6112 ctx->reported_bdevs = 0; 6113 populate_namespaces_cb(ctx, -EIO); 6114 } else if (ctx->namespaces_populated) { 6115 /* The namespaces for the attached controller were all 6116 * populated and the response was already sent to the 6117 * caller (usually the RPC). So free the context here. 
6118 */ 6119 free_nvme_async_probe_ctx(ctx); 6120 } 6121 } 6122 6123 return SPDK_POLLER_BUSY; 6124 } 6125 6126 static bool 6127 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6128 uint32_t reconnect_delay_sec, 6129 uint32_t fast_io_fail_timeout_sec) 6130 { 6131 if (ctrlr_loss_timeout_sec < -1) { 6132 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6133 return false; 6134 } else if (ctrlr_loss_timeout_sec == -1) { 6135 if (reconnect_delay_sec == 0) { 6136 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6137 return false; 6138 } else if (fast_io_fail_timeout_sec != 0 && 6139 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6140 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6141 return false; 6142 } 6143 } else if (ctrlr_loss_timeout_sec != 0) { 6144 if (reconnect_delay_sec == 0) { 6145 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6146 return false; 6147 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6148 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6149 return false; 6150 } else if (fast_io_fail_timeout_sec != 0) { 6151 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6152 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6153 return false; 6154 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6155 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6156 return false; 6157 } 6158 } 6159 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6160 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6161 return false; 6162 } 6163 6164 return true; 6165 } 6166 6167 static int 6168 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 6169 { 6170 FILE *psk_file; 6171 struct stat statbuf; 6172 int rc; 6173 #define TCP_PSK_INVALID_PERMISSIONS 0177 6174 6175 if (stat(fname, &statbuf) != 0) { 6176 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 6177 return -EACCES; 6178 } 6179 6180 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 6181 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 6182 return -EPERM; 6183 } 6184 if ((size_t)statbuf.st_size >= bufsz) { 6185 SPDK_ERRLOG("Invalid PSK: too long\n"); 6186 return -EINVAL; 6187 } 6188 psk_file = fopen(fname, "r"); 6189 if (psk_file == NULL) { 6190 SPDK_ERRLOG("Could not open PSK file\n"); 6191 return -EINVAL; 6192 } 6193 6194 memset(buf, 0, bufsz); 6195 rc = fread(buf, 1, statbuf.st_size, psk_file); 6196 if (rc != statbuf.st_size) { 6197 SPDK_ERRLOG("Failed to read PSK\n"); 6198 fclose(psk_file); 6199 return -EINVAL; 6200 } 6201 6202 fclose(psk_file); 6203 return 0; 6204 } 6205 6206 int 6207 spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6208 const char *base_name, 6209 const char **names, 6210 uint32_t count, 6211 spdk_bdev_nvme_create_cb cb_fn, 6212 void *cb_ctx, 6213 struct spdk_nvme_ctrlr_opts *drv_opts, 6214 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 6215 bool multipath) 6216 { 6217 struct nvme_probe_skip_entry *entry, *tmp; 6218 struct nvme_async_probe_ctx *ctx; 6219 spdk_nvme_attach_cb attach_cb; 6220 int rc, len; 6221 6222 /* TODO expand this check to include both the host and target TRIDs. 6223 * Only if both are the same should we fail.
6224 */ 6225 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6226 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6227 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6228 return -EEXIST; 6229 } 6230 6231 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6232 6233 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6234 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6235 return -EINVAL; 6236 } 6237 6238 if (bdev_opts != NULL && 6239 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6240 bdev_opts->reconnect_delay_sec, 6241 bdev_opts->fast_io_fail_timeout_sec)) { 6242 return -EINVAL; 6243 } 6244 6245 ctx = calloc(1, sizeof(*ctx)); 6246 if (!ctx) { 6247 return -ENOMEM; 6248 } 6249 ctx->base_name = base_name; 6250 ctx->names = names; 6251 ctx->max_bdevs = count; 6252 ctx->cb_fn = cb_fn; 6253 ctx->cb_ctx = cb_ctx; 6254 ctx->trid = *trid; 6255 6256 if (bdev_opts) { 6257 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6258 } else { 6259 spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6260 } 6261 6262 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6263 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6264 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6265 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6266 free(entry); 6267 break; 6268 } 6269 } 6270 } 6271 6272 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6273 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6274 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6275 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6276 ctx->drv_opts.disable_read_ana_log_page = true; 6277 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6278 6279 if (ctx->bdev_opts.psk[0] != '\0') { 6280 /* Try to use the keyring first */ 6281 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6282 if (ctx->drv_opts.tls_psk == NULL) { 6283 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6284 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6285 if (rc != 0) { 6286 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6287 free_nvme_async_probe_ctx(ctx); 6288 return rc; 6289 } 6290 } 6291 } 6292 6293 if (ctx->bdev_opts.dhchap_key != NULL) { 6294 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6295 if (ctx->drv_opts.dhchap_key == NULL) { 6296 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6297 ctx->bdev_opts.dhchap_key); 6298 free_nvme_async_probe_ctx(ctx); 6299 return -ENOKEY; 6300 } 6301 6302 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6303 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6304 } 6305 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6306 ctx->drv_opts.dhchap_ctrlr_key = 6307 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6308 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6309 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6310 ctx->bdev_opts.dhchap_ctrlr_key); 6311 free_nvme_async_probe_ctx(ctx); 6312 return -ENOKEY; 6313 } 6314 } 6315 6316 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6317 attach_cb = connect_attach_cb; 6318 } else { 6319 attach_cb = connect_set_failover_cb; 6320 } 6321 6322 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6323 if (ctx->probe_ctx == NULL) { 6324 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6325 
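/* The asynchronous connect could not even be started, so no attach callback will fire.
 * Release the probe context here and report the failure to the caller synchronously.
 */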
free_nvme_async_probe_ctx(ctx); 6326 return -ENODEV; 6327 } 6328 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6329 6330 return 0; 6331 } 6332 6333 struct bdev_nvme_delete_ctx { 6334 char *name; 6335 struct nvme_path_id path_id; 6336 bdev_nvme_delete_done_fn delete_done; 6337 void *delete_done_ctx; 6338 uint64_t timeout_ticks; 6339 struct spdk_poller *poller; 6340 }; 6341 6342 static void 6343 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6344 { 6345 if (ctx != NULL) { 6346 free(ctx->name); 6347 free(ctx); 6348 } 6349 } 6350 6351 static bool 6352 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6353 { 6354 if (path_id->trid.trtype != 0) { 6355 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6356 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6357 return false; 6358 } 6359 } else { 6360 if (path_id->trid.trtype != p->trid.trtype) { 6361 return false; 6362 } 6363 } 6364 } 6365 6366 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6367 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6368 return false; 6369 } 6370 } 6371 6372 if (path_id->trid.adrfam != 0) { 6373 if (path_id->trid.adrfam != p->trid.adrfam) { 6374 return false; 6375 } 6376 } 6377 6378 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6379 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6380 return false; 6381 } 6382 } 6383 6384 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6385 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6386 return false; 6387 } 6388 } 6389 6390 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6391 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6392 return false; 6393 } 6394 } 6395 6396 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6397 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6398 return false; 6399 } 6400 } 6401 6402 return true; 6403 } 6404 6405 static bool 6406 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6407 { 6408 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6409 struct nvme_ctrlr *ctrlr; 6410 struct nvme_path_id *p; 6411 6412 pthread_mutex_lock(&g_bdev_nvme_mutex); 6413 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6414 if (!nbdev_ctrlr) { 6415 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6416 return false; 6417 } 6418 6419 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6420 pthread_mutex_lock(&ctrlr->mutex); 6421 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6422 if (nvme_path_id_compare(p, path_id)) { 6423 pthread_mutex_unlock(&ctrlr->mutex); 6424 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6425 return true; 6426 } 6427 } 6428 pthread_mutex_unlock(&ctrlr->mutex); 6429 } 6430 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6431 6432 return false; 6433 } 6434 6435 static int 6436 bdev_nvme_delete_complete_poll(void *arg) 6437 { 6438 struct bdev_nvme_delete_ctx *ctx = arg; 6439 int rc = 0; 6440 6441 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6442 if (ctx->timeout_ticks > spdk_get_ticks()) { 6443 return SPDK_POLLER_BUSY; 6444 } 6445 6446 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6447 rc = -ETIMEDOUT; 6448 } 6449 6450 spdk_poller_unregister(&ctx->poller); 6451 6452 ctx->delete_done(ctx->delete_done_ctx, rc); 6453 free_bdev_nvme_delete_ctx(ctx); 6454 6455 return SPDK_POLLER_BUSY; 6456 } 6457 6458 static int 6459 
_bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6460 { 6461 struct nvme_path_id *p, *t; 6462 spdk_msg_fn msg_fn; 6463 int rc = -ENXIO; 6464 6465 pthread_mutex_lock(&nvme_ctrlr->mutex); 6466 6467 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6468 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6469 break; 6470 } 6471 6472 if (!nvme_path_id_compare(p, path_id)) { 6473 continue; 6474 } 6475 6476 /* We are not using the specified path. */ 6477 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6478 free(p); 6479 rc = 0; 6480 } 6481 6482 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6483 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6484 return rc; 6485 } 6486 6487 /* If we made it here, then this path is a match! Now we need to remove it. */ 6488 6489 /* This is the active path in use right now. The active path is always the first in the list. */ 6490 assert(p == nvme_ctrlr->active_path_id); 6491 6492 if (!TAILQ_NEXT(p, link)) { 6493 /* The current path is the only path. */ 6494 msg_fn = _nvme_ctrlr_destruct; 6495 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6496 } else { 6497 /* There is an alternative path. */ 6498 msg_fn = _bdev_nvme_reset_ctrlr; 6499 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6500 } 6501 6502 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6503 6504 if (rc == 0) { 6505 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6506 } else if (rc == -EALREADY) { 6507 rc = 0; 6508 } 6509 6510 return rc; 6511 } 6512 6513 int 6514 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6515 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6516 { 6517 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6518 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6519 struct bdev_nvme_delete_ctx *ctx = NULL; 6520 int rc = -ENXIO, _rc; 6521 6522 if (name == NULL || path_id == NULL) { 6523 rc = -EINVAL; 6524 goto exit; 6525 } 6526 6527 pthread_mutex_lock(&g_bdev_nvme_mutex); 6528 6529 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6530 if (nbdev_ctrlr == NULL) { 6531 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6532 6533 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6534 rc = -ENODEV; 6535 goto exit; 6536 } 6537 6538 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6539 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6540 if (_rc < 0 && _rc != -ENXIO) { 6541 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6542 rc = _rc; 6543 goto exit; 6544 } else if (_rc == 0) { 6545 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6546 * was deleted successfully. To remember the successful deletion, 6547 * overwrite rc only if _rc is zero. 
6548 */ 6549 rc = 0; 6550 } 6551 } 6552 6553 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6554 6555 if (rc != 0 || delete_done == NULL) { 6556 goto exit; 6557 } 6558 6559 ctx = calloc(1, sizeof(*ctx)); 6560 if (ctx == NULL) { 6561 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6562 rc = -ENOMEM; 6563 goto exit; 6564 } 6565 6566 ctx->name = strdup(name); 6567 if (ctx->name == NULL) { 6568 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6569 rc = -ENOMEM; 6570 goto exit; 6571 } 6572 6573 ctx->delete_done = delete_done; 6574 ctx->delete_done_ctx = delete_done_ctx; 6575 ctx->path_id = *path_id; 6576 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6577 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6578 if (ctx->poller == NULL) { 6579 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6580 rc = -ENOMEM; 6581 goto exit; 6582 } 6583 6584 exit: 6585 if (rc != 0) { 6586 free_bdev_nvme_delete_ctx(ctx); 6587 } 6588 6589 return rc; 6590 } 6591 6592 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6593 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6594 6595 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6596 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6597 6598 struct discovery_entry_ctx { 6599 char name[128]; 6600 struct spdk_nvme_transport_id trid; 6601 struct spdk_nvme_ctrlr_opts drv_opts; 6602 struct spdk_nvmf_discovery_log_page_entry entry; 6603 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6604 struct discovery_ctx *ctx; 6605 }; 6606 6607 struct discovery_ctx { 6608 char *name; 6609 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6610 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6611 void *cb_ctx; 6612 struct spdk_nvme_probe_ctx *probe_ctx; 6613 struct spdk_nvme_detach_ctx *detach_ctx; 6614 struct spdk_nvme_ctrlr *ctrlr; 6615 struct spdk_nvme_transport_id trid; 6616 struct discovery_entry_ctx *entry_ctx_in_use; 6617 struct spdk_poller *poller; 6618 struct spdk_nvme_ctrlr_opts drv_opts; 6619 struct spdk_bdev_nvme_ctrlr_opts bdev_opts; 6620 struct spdk_nvmf_discovery_log_page *log_page; 6621 TAILQ_ENTRY(discovery_ctx) tailq; 6622 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6623 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6624 int rc; 6625 bool wait_for_attach; 6626 uint64_t timeout_ticks; 6627 /* Denotes that the discovery service is being started. We're waiting 6628 * for the initial connection to the discovery controller to be 6629 * established and attach discovered NVM ctrlrs. 6630 */ 6631 bool initializing; 6632 /* Denotes if a discovery is currently in progress for this context. 6633 * That includes connecting to newly discovered subsystems. Used to 6634 * ensure we do not start a new discovery until an existing one is 6635 * complete. 6636 */ 6637 bool in_progress; 6638 6639 /* Denotes if another discovery is needed after the one in progress 6640 * completes. Set when we receive an AER completion while a discovery 6641 * is already in progress. 6642 */ 6643 bool pending; 6644 6645 /* Signal to the discovery context poller that it should stop the 6646 * discovery service, including detaching from the current discovery 6647 * controller. 6648 */ 6649 bool stop; 6650 6651 struct spdk_thread *calling_thread; 6652 uint32_t index; 6653 uint32_t attach_in_progress; 6654 char *hostnqn; 6655 6656 /* Denotes if the discovery service was started by the mdns discovery. 
6657 */ 6658 bool from_mdns_discovery_service; 6659 }; 6660 6661 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6662 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6663 6664 static void get_discovery_log_page(struct discovery_ctx *ctx); 6665 6666 static void 6667 free_discovery_ctx(struct discovery_ctx *ctx) 6668 { 6669 free(ctx->log_page); 6670 free(ctx->hostnqn); 6671 free(ctx->name); 6672 free(ctx); 6673 } 6674 6675 static void 6676 discovery_complete(struct discovery_ctx *ctx) 6677 { 6678 ctx->initializing = false; 6679 ctx->in_progress = false; 6680 if (ctx->pending) { 6681 ctx->pending = false; 6682 get_discovery_log_page(ctx); 6683 } 6684 } 6685 6686 static void 6687 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6688 struct spdk_nvmf_discovery_log_page_entry *entry) 6689 { 6690 char *space; 6691 6692 trid->trtype = entry->trtype; 6693 trid->adrfam = entry->adrfam; 6694 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6695 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6696 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6697 * before call to this function trid->subnqn is zeroed out, we need 6698 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6699 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6700 */ 6701 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6702 6703 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6704 * But the log page entries typically pad them with spaces, not zeroes. 6705 * So add a NULL terminator to each of these fields at the appropriate 6706 * location. 6707 */ 6708 space = strchr(trid->traddr, ' '); 6709 if (space) { 6710 *space = 0; 6711 } 6712 space = strchr(trid->trsvcid, ' '); 6713 if (space) { 6714 *space = 0; 6715 } 6716 space = strchr(trid->subnqn, ' '); 6717 if (space) { 6718 *space = 0; 6719 } 6720 } 6721 6722 static void 6723 _stop_discovery(void *_ctx) 6724 { 6725 struct discovery_ctx *ctx = _ctx; 6726 6727 if (ctx->attach_in_progress > 0) { 6728 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6729 return; 6730 } 6731 6732 ctx->stop = true; 6733 6734 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6735 struct discovery_entry_ctx *entry_ctx; 6736 struct nvme_path_id path = {}; 6737 6738 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6739 path.trid = entry_ctx->trid; 6740 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6741 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6742 free(entry_ctx); 6743 } 6744 6745 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6746 struct discovery_entry_ctx *entry_ctx; 6747 6748 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6749 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6750 free(entry_ctx); 6751 } 6752 6753 free(ctx->entry_ctx_in_use); 6754 ctx->entry_ctx_in_use = NULL; 6755 } 6756 6757 static void 6758 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6759 { 6760 ctx->stop_cb_fn = cb_fn; 6761 ctx->cb_ctx = cb_ctx; 6762 6763 if (ctx->attach_in_progress > 0) { 6764 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6765 ctx->attach_in_progress); 6766 } 6767 6768 _stop_discovery(ctx); 6769 } 6770 6771 static void 6772 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6773 { 6774 struct discovery_ctx *d_ctx; 6775 struct nvme_path_id *path_id; 6776 struct spdk_nvme_transport_id 
trid = {}; 6777 struct discovery_entry_ctx *entry_ctx, *tmp; 6778 6779 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6780 6781 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6782 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6783 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6784 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6785 continue; 6786 } 6787 6788 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6789 free(entry_ctx); 6790 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6791 trid.subnqn, trid.traddr, trid.trsvcid); 6792 6793 /* Fail discovery ctrlr to force reattach attempt */ 6794 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6795 } 6796 } 6797 } 6798 6799 static void 6800 discovery_remove_controllers(struct discovery_ctx *ctx) 6801 { 6802 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6803 struct discovery_entry_ctx *entry_ctx, *tmp; 6804 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6805 struct spdk_nvme_transport_id old_trid = {}; 6806 uint64_t numrec, i; 6807 bool found; 6808 6809 numrec = from_le64(&log_page->numrec); 6810 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6811 found = false; 6812 old_entry = &entry_ctx->entry; 6813 build_trid_from_log_page_entry(&old_trid, old_entry); 6814 for (i = 0; i < numrec; i++) { 6815 new_entry = &log_page->entries[i]; 6816 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6817 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6818 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6819 found = true; 6820 break; 6821 } 6822 } 6823 if (!found) { 6824 struct nvme_path_id path = {}; 6825 6826 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6827 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6828 6829 path.trid = entry_ctx->trid; 6830 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6831 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6832 free(entry_ctx); 6833 } 6834 } 6835 free(log_page); 6836 ctx->log_page = NULL; 6837 discovery_complete(ctx); 6838 } 6839 6840 static void 6841 complete_discovery_start(struct discovery_ctx *ctx, int status) 6842 { 6843 ctx->timeout_ticks = 0; 6844 ctx->rc = status; 6845 if (ctx->start_cb_fn) { 6846 ctx->start_cb_fn(ctx->cb_ctx, status); 6847 ctx->start_cb_fn = NULL; 6848 ctx->cb_ctx = NULL; 6849 } 6850 } 6851 6852 static void 6853 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6854 { 6855 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6856 struct discovery_ctx *ctx = entry_ctx->ctx; 6857 6858 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6859 ctx->attach_in_progress--; 6860 if (ctx->attach_in_progress == 0) { 6861 complete_discovery_start(ctx, ctx->rc); 6862 if (ctx->initializing && ctx->rc != 0) { 6863 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6864 stop_discovery(ctx, NULL, ctx->cb_ctx); 6865 } else { 6866 discovery_remove_controllers(ctx); 6867 } 6868 } 6869 } 6870 6871 static struct discovery_entry_ctx * 6872 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6873 { 6874 struct discovery_entry_ctx *new_ctx; 6875 6876 new_ctx = calloc(1, sizeof(*new_ctx)); 6877 if (new_ctx == NULL) { 6878 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6879 return NULL; 6880 } 6881 6882 new_ctx->ctx = ctx; 6883 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6884 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
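/* Start from the default controller options and inherit only the hostnqn of the discovery
 * service, so that connections to discovered subsystems use the same host identity.
 */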
6885 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6886 return new_ctx; 6887 } 6888 6889 static void 6890 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6891 struct spdk_nvmf_discovery_log_page *log_page) 6892 { 6893 struct discovery_ctx *ctx = cb_arg; 6894 struct discovery_entry_ctx *entry_ctx, *tmp; 6895 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6896 uint64_t numrec, i; 6897 bool found; 6898 6899 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6900 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6901 return; 6902 } 6903 6904 ctx->log_page = log_page; 6905 assert(ctx->attach_in_progress == 0); 6906 numrec = from_le64(&log_page->numrec); 6907 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6908 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6909 free(entry_ctx); 6910 } 6911 for (i = 0; i < numrec; i++) { 6912 found = false; 6913 new_entry = &log_page->entries[i]; 6914 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6915 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6916 struct discovery_entry_ctx *new_ctx; 6917 struct spdk_nvme_transport_id trid = {}; 6918 6919 build_trid_from_log_page_entry(&trid, new_entry); 6920 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6921 if (new_ctx == NULL) { 6922 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6923 break; 6924 } 6925 6926 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6927 continue; 6928 } 6929 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6930 old_entry = &entry_ctx->entry; 6931 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6932 found = true; 6933 break; 6934 } 6935 } 6936 if (!found) { 6937 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6938 struct discovery_ctx *d_ctx; 6939 6940 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6941 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6942 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6943 sizeof(new_entry->subnqn))) { 6944 break; 6945 } 6946 } 6947 if (subnqn_ctx) { 6948 break; 6949 } 6950 } 6951 6952 new_ctx = calloc(1, sizeof(*new_ctx)); 6953 if (new_ctx == NULL) { 6954 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6955 break; 6956 } 6957 6958 new_ctx->ctx = ctx; 6959 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6960 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6961 if (subnqn_ctx) { 6962 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6963 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6964 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6965 new_ctx->name); 6966 } else { 6967 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6968 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6969 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6970 new_ctx->name); 6971 } 6972 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6973 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6974 rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6975 discovery_attach_controller_done, new_ctx, 6976 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6977 if (rc == 0) { 6978 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6979 ctx->attach_in_progress++; 6980 } else { 6981 DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed 
(%s)\n", spdk_strerror(-rc)); 6982 } 6983 } 6984 } 6985 6986 if (ctx->attach_in_progress == 0) { 6987 discovery_remove_controllers(ctx); 6988 } 6989 } 6990 6991 static void 6992 get_discovery_log_page(struct discovery_ctx *ctx) 6993 { 6994 int rc; 6995 6996 assert(ctx->in_progress == false); 6997 ctx->in_progress = true; 6998 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6999 if (rc != 0) { 7000 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7001 } 7002 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 7003 } 7004 7005 static void 7006 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 7007 { 7008 struct discovery_ctx *ctx = arg; 7009 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 7010 7011 if (spdk_nvme_cpl_is_error(cpl)) { 7012 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 7013 return; 7014 } 7015 7016 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 7017 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 7018 return; 7019 } 7020 7021 DISCOVERY_INFOLOG(ctx, "got aer\n"); 7022 if (ctx->in_progress) { 7023 ctx->pending = true; 7024 return; 7025 } 7026 7027 get_discovery_log_page(ctx); 7028 } 7029 7030 static void 7031 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 7032 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 7033 { 7034 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 7035 struct discovery_ctx *ctx; 7036 7037 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 7038 7039 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 7040 ctx->probe_ctx = NULL; 7041 ctx->ctrlr = ctrlr; 7042 7043 if (ctx->rc != 0) { 7044 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 7045 ctx->rc); 7046 return; 7047 } 7048 7049 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 7050 } 7051 7052 static int 7053 discovery_poller(void *arg) 7054 { 7055 struct discovery_ctx *ctx = arg; 7056 struct spdk_nvme_transport_id *trid; 7057 int rc; 7058 7059 if (ctx->detach_ctx) { 7060 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7061 if (rc != -EAGAIN) { 7062 ctx->detach_ctx = NULL; 7063 ctx->ctrlr = NULL; 7064 } 7065 } else if (ctx->stop) { 7066 if (ctx->ctrlr != NULL) { 7067 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7068 if (rc == 0) { 7069 return SPDK_POLLER_BUSY; 7070 } 7071 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7072 } 7073 spdk_poller_unregister(&ctx->poller); 7074 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7075 assert(ctx->start_cb_fn == NULL); 7076 if (ctx->stop_cb_fn != NULL) { 7077 ctx->stop_cb_fn(ctx->cb_ctx); 7078 } 7079 free_discovery_ctx(ctx); 7080 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7081 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7082 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7083 assert(ctx->initializing); 7084 spdk_poller_unregister(&ctx->poller); 7085 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7086 complete_discovery_start(ctx, -ETIMEDOUT); 7087 stop_discovery(ctx, NULL, NULL); 7088 free_discovery_ctx(ctx); 7089 return SPDK_POLLER_BUSY; 7090 } 7091 7092 assert(ctx->entry_ctx_in_use == NULL); 7093 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7094 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7095 trid = &ctx->entry_ctx_in_use->trid; 7096 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7097 
if (ctx->probe_ctx) { 7098 spdk_poller_unregister(&ctx->poller); 7099 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7100 } else { 7101 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7102 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7103 ctx->entry_ctx_in_use = NULL; 7104 } 7105 } else if (ctx->probe_ctx) { 7106 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7107 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7108 complete_discovery_start(ctx, -ETIMEDOUT); 7109 return SPDK_POLLER_BUSY; 7110 } 7111 7112 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7113 if (rc != -EAGAIN) { 7114 if (ctx->rc != 0) { 7115 assert(ctx->initializing); 7116 stop_discovery(ctx, NULL, ctx->cb_ctx); 7117 } else { 7118 assert(rc == 0); 7119 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7120 ctx->rc = rc; 7121 get_discovery_log_page(ctx); 7122 } 7123 } 7124 } else { 7125 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7126 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7127 complete_discovery_start(ctx, -ETIMEDOUT); 7128 /* We need to wait until all NVM ctrlrs are attached before we stop the 7129 * discovery service to make sure we don't detach a ctrlr that is still 7130 * being attached. 7131 */ 7132 if (ctx->attach_in_progress == 0) { 7133 stop_discovery(ctx, NULL, ctx->cb_ctx); 7134 return SPDK_POLLER_BUSY; 7135 } 7136 } 7137 7138 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7139 if (rc < 0) { 7140 spdk_poller_unregister(&ctx->poller); 7141 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7142 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7143 ctx->entry_ctx_in_use = NULL; 7144 7145 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7146 if (rc != 0) { 7147 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7148 ctx->ctrlr = NULL; 7149 } 7150 } 7151 } 7152 7153 return SPDK_POLLER_BUSY; 7154 } 7155 7156 static void 7157 start_discovery_poller(void *arg) 7158 { 7159 struct discovery_ctx *ctx = arg; 7160 7161 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7162 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7163 } 7164 7165 int 7166 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7167 const char *base_name, 7168 struct spdk_nvme_ctrlr_opts *drv_opts, 7169 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 7170 uint64_t attach_timeout, 7171 bool from_mdns, 7172 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7173 { 7174 struct discovery_ctx *ctx; 7175 struct discovery_entry_ctx *discovery_entry_ctx; 7176 7177 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7178 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7179 if (strcmp(ctx->name, base_name) == 0) { 7180 return -EEXIST; 7181 } 7182 7183 if (ctx->entry_ctx_in_use != NULL) { 7184 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7185 return -EEXIST; 7186 } 7187 } 7188 7189 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7190 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7191 return -EEXIST; 7192 } 7193 } 7194 } 7195 7196 ctx = calloc(1, sizeof(*ctx)); 7197 if (ctx == NULL) { 7198 return -ENOMEM; 7199 } 7200 7201 ctx->name = strdup(base_name); 7202 if (ctx->name == NULL) { 7203 free_discovery_ctx(ctx); 7204 return -ENOMEM; 7205 } 7206 memcpy(&ctx->drv_opts, drv_opts, 
sizeof(*drv_opts)); 7207 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7208 ctx->from_mdns_discovery_service = from_mdns; 7209 ctx->bdev_opts.from_discovery_service = true; 7210 ctx->calling_thread = spdk_get_thread(); 7211 ctx->start_cb_fn = cb_fn; 7212 ctx->cb_ctx = cb_ctx; 7213 ctx->initializing = true; 7214 if (ctx->start_cb_fn) { 7215 /* We can use this when dumping json to denote if this RPC parameter 7216 * was specified or not. 7217 */ 7218 ctx->wait_for_attach = true; 7219 } 7220 if (attach_timeout != 0) { 7221 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7222 spdk_get_ticks_hz() / 1000ull; 7223 } 7224 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7225 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7226 memcpy(&ctx->trid, trid, sizeof(*trid)); 7227 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7228 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7229 if (ctx->hostnqn == NULL) { 7230 free_discovery_ctx(ctx); 7231 return -ENOMEM; 7232 } 7233 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7234 if (discovery_entry_ctx == NULL) { 7235 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7236 free_discovery_ctx(ctx); 7237 return -ENOMEM; 7238 } 7239 7240 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7241 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7242 return 0; 7243 } 7244 7245 int 7246 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7247 { 7248 struct discovery_ctx *ctx; 7249 7250 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7251 if (strcmp(name, ctx->name) == 0) { 7252 if (ctx->stop) { 7253 return -EALREADY; 7254 } 7255 /* If we're still starting the discovery service and ->rc is non-zero, we're 7256 * going to stop it as soon as we can 7257 */ 7258 if (ctx->initializing && ctx->rc != 0) { 7259 return -EALREADY; 7260 } 7261 stop_discovery(ctx, cb_fn, cb_ctx); 7262 return 0; 7263 } 7264 } 7265 7266 return -ENOENT; 7267 } 7268 7269 static int 7270 bdev_nvme_library_init(void) 7271 { 7272 g_bdev_nvme_init_thread = spdk_get_thread(); 7273 7274 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7275 bdev_nvme_destroy_poll_group_cb, 7276 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7277 7278 return 0; 7279 } 7280 7281 static void 7282 bdev_nvme_fini_destruct_ctrlrs(void) 7283 { 7284 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7285 struct nvme_ctrlr *nvme_ctrlr; 7286 7287 pthread_mutex_lock(&g_bdev_nvme_mutex); 7288 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7289 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7290 pthread_mutex_lock(&nvme_ctrlr->mutex); 7291 if (nvme_ctrlr->destruct) { 7292 /* This controller's destruction was already started 7293 * before the application started shutting down 7294 */ 7295 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7296 continue; 7297 } 7298 nvme_ctrlr->destruct = true; 7299 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7300 7301 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7302 nvme_ctrlr); 7303 } 7304 } 7305 7306 g_bdev_nvme_module_finish = true; 7307 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7308 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7309 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7310 spdk_bdev_module_fini_done(); 7311 return; 7312 } 7313 7314 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7315 } 7316 7317 static void 7318 check_discovery_fini(void *arg) 7319 { 7320 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7321 
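/* The last discovery context has been torn down, so it is now safe to destruct the
 * remaining NVMe controllers and finish the module shutdown.
 */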
bdev_nvme_fini_destruct_ctrlrs(); 7322 } 7323 } 7324 7325 static void 7326 bdev_nvme_library_fini(void) 7327 { 7328 struct nvme_probe_skip_entry *entry, *entry_tmp; 7329 struct discovery_ctx *ctx; 7330 7331 spdk_poller_unregister(&g_hotplug_poller); 7332 free(g_hotplug_probe_ctx); 7333 g_hotplug_probe_ctx = NULL; 7334 7335 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7336 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7337 free(entry); 7338 } 7339 7340 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7341 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7342 bdev_nvme_fini_destruct_ctrlrs(); 7343 } else { 7344 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7345 stop_discovery(ctx, check_discovery_fini, NULL); 7346 } 7347 } 7348 } 7349 7350 static void 7351 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7352 { 7353 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7354 struct spdk_bdev *bdev = bdev_io->bdev; 7355 struct spdk_dif_ctx dif_ctx; 7356 struct spdk_dif_error err_blk = {}; 7357 int rc; 7358 struct spdk_dif_ctx_init_ext_opts dif_opts; 7359 7360 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7361 dif_opts.dif_pi_format = bdev->dif_pi_format; 7362 rc = spdk_dif_ctx_init(&dif_ctx, 7363 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7364 bdev->dif_is_head_of_md, bdev->dif_type, 7365 bdev_io->u.bdev.dif_check_flags, 7366 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7367 if (rc != 0) { 7368 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7369 return; 7370 } 7371 7372 if (bdev->md_interleave) { 7373 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7374 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7375 } else { 7376 struct iovec md_iov = { 7377 .iov_base = bdev_io->u.bdev.md_buf, 7378 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7379 }; 7380 7381 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7382 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7383 } 7384 7385 if (rc != 0) { 7386 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7387 err_blk.err_type, err_blk.err_offset); 7388 } else { 7389 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7390 } 7391 } 7392 7393 static void 7394 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7395 { 7396 struct nvme_bdev_io *bio = ref; 7397 7398 if (spdk_nvme_cpl_is_success(cpl)) { 7399 /* Run PI verification for read data buffer. */ 7400 bdev_nvme_verify_pi_error(bio); 7401 } 7402 7403 /* Return original completion status */ 7404 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7405 } 7406 7407 static void 7408 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7409 { 7410 struct nvme_bdev_io *bio = ref; 7411 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7412 int ret; 7413 7414 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7415 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7416 cpl->status.sct, cpl->status.sc); 7417 7418 /* Save completion status to use after verifying PI error. */ 7419 bio->cpl = *cpl; 7420 7421 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7422 /* Read without PI checking to verify PI error. 
*/ 7423 ret = bdev_nvme_no_pi_readv(bio, 7424 bdev_io->u.bdev.iovs, 7425 bdev_io->u.bdev.iovcnt, 7426 bdev_io->u.bdev.md_buf, 7427 bdev_io->u.bdev.num_blocks, 7428 bdev_io->u.bdev.offset_blocks); 7429 if (ret == 0) { 7430 return; 7431 } 7432 } 7433 } 7434 7435 bdev_nvme_io_complete_nvme_status(bio, cpl); 7436 } 7437 7438 static void 7439 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7440 { 7441 struct nvme_bdev_io *bio = ref; 7442 7443 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7444 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7445 cpl->status.sct, cpl->status.sc); 7446 /* Run PI verification for write data buffer if PI error is detected. */ 7447 bdev_nvme_verify_pi_error(bio); 7448 } 7449 7450 bdev_nvme_io_complete_nvme_status(bio, cpl); 7451 } 7452 7453 static void 7454 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7455 { 7456 struct nvme_bdev_io *bio = ref; 7457 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7458 7459 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7460 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7461 */ 7462 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7463 7464 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7465 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7466 cpl->status.sct, cpl->status.sc); 7467 /* Run PI verification for zone append data buffer if PI error is detected. */ 7468 bdev_nvme_verify_pi_error(bio); 7469 } 7470 7471 bdev_nvme_io_complete_nvme_status(bio, cpl); 7472 } 7473 7474 static void 7475 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7476 { 7477 struct nvme_bdev_io *bio = ref; 7478 7479 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7480 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7481 cpl->status.sct, cpl->status.sc); 7482 /* Run PI verification for compare data buffer if PI error is detected. */ 7483 bdev_nvme_verify_pi_error(bio); 7484 } 7485 7486 bdev_nvme_io_complete_nvme_status(bio, cpl); 7487 } 7488 7489 static void 7490 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7491 { 7492 struct nvme_bdev_io *bio = ref; 7493 7494 /* Compare operation completion */ 7495 if (!bio->first_fused_completed) { 7496 /* Save compare result for write callback */ 7497 bio->cpl = *cpl; 7498 bio->first_fused_completed = true; 7499 return; 7500 } 7501 7502 /* Write operation completion */ 7503 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7504 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7505 * complete the IO with the compare operation's status. 
7506 */ 7507 if (!spdk_nvme_cpl_is_error(cpl)) { 7508 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7509 } 7510 7511 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7512 } else { 7513 bdev_nvme_io_complete_nvme_status(bio, cpl); 7514 } 7515 } 7516 7517 static void 7518 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7519 { 7520 struct nvme_bdev_io *bio = ref; 7521 7522 bdev_nvme_io_complete_nvme_status(bio, cpl); 7523 } 7524 7525 static int 7526 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7527 { 7528 switch (desc->zt) { 7529 case SPDK_NVME_ZONE_TYPE_SEQWR: 7530 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7531 break; 7532 default: 7533 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7534 return -EIO; 7535 } 7536 7537 switch (desc->zs) { 7538 case SPDK_NVME_ZONE_STATE_EMPTY: 7539 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7540 break; 7541 case SPDK_NVME_ZONE_STATE_IOPEN: 7542 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7543 break; 7544 case SPDK_NVME_ZONE_STATE_EOPEN: 7545 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7546 break; 7547 case SPDK_NVME_ZONE_STATE_CLOSED: 7548 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7549 break; 7550 case SPDK_NVME_ZONE_STATE_RONLY: 7551 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7552 break; 7553 case SPDK_NVME_ZONE_STATE_FULL: 7554 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7555 break; 7556 case SPDK_NVME_ZONE_STATE_OFFLINE: 7557 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7558 break; 7559 default: 7560 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7561 return -EIO; 7562 } 7563 7564 info->zone_id = desc->zslba; 7565 info->write_pointer = desc->wp; 7566 info->capacity = desc->zcap; 7567 7568 return 0; 7569 } 7570 7571 static void 7572 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7573 { 7574 struct nvme_bdev_io *bio = ref; 7575 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7576 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7577 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7578 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7579 uint64_t max_zones_per_buf, i; 7580 uint32_t zone_report_bufsize; 7581 struct spdk_nvme_ns *ns; 7582 struct spdk_nvme_qpair *qpair; 7583 int ret; 7584 7585 if (spdk_nvme_cpl_is_error(cpl)) { 7586 goto out_complete_io_nvme_cpl; 7587 } 7588 7589 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7590 ret = -ENXIO; 7591 goto out_complete_io_ret; 7592 } 7593 7594 ns = bio->io_path->nvme_ns->ns; 7595 qpair = bio->io_path->qpair->qpair; 7596 7597 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7598 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7599 sizeof(bio->zone_report_buf->descs[0]); 7600 7601 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7602 ret = -EINVAL; 7603 goto out_complete_io_ret; 7604 } 7605 7606 if (!bio->zone_report_buf->nr_zones) { 7607 ret = -EINVAL; 7608 goto out_complete_io_ret; 7609 } 7610 7611 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7612 ret = fill_zone_from_report(&info[bio->handled_zones], 7613 &bio->zone_report_buf->descs[i]); 7614 if (ret) { 7615 goto out_complete_io_ret; 7616 } 7617 bio->handled_zones++; 7618 } 7619 7620 if (bio->handled_zones < zones_to_copy) { 7621 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7622 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7623 
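/* More zones were requested than fit into a single report buffer. Reuse the buffer and
 * issue another Zone Report starting at the first zone that has not been copied out yet.
 */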
7624 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7625 ret = spdk_nvme_zns_report_zones(ns, qpair, 7626 bio->zone_report_buf, zone_report_bufsize, 7627 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7628 bdev_nvme_get_zone_info_done, bio); 7629 if (!ret) { 7630 return; 7631 } else { 7632 goto out_complete_io_ret; 7633 } 7634 } 7635 7636 out_complete_io_nvme_cpl: 7637 free(bio->zone_report_buf); 7638 bio->zone_report_buf = NULL; 7639 bdev_nvme_io_complete_nvme_status(bio, cpl); 7640 return; 7641 7642 out_complete_io_ret: 7643 free(bio->zone_report_buf); 7644 bio->zone_report_buf = NULL; 7645 bdev_nvme_io_complete(bio, ret); 7646 } 7647 7648 static void 7649 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7650 { 7651 struct nvme_bdev_io *bio = ref; 7652 7653 bdev_nvme_io_complete_nvme_status(bio, cpl); 7654 } 7655 7656 static void 7657 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7658 { 7659 struct nvme_bdev_io *bio = ctx; 7660 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7661 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7662 7663 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7664 7665 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7666 } 7667 7668 static void 7669 bdev_nvme_abort_complete(void *ctx) 7670 { 7671 struct nvme_bdev_io *bio = ctx; 7672 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7673 7674 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7675 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7676 } else { 7677 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7678 } 7679 } 7680 7681 static void 7682 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7683 { 7684 struct nvme_bdev_io *bio = ref; 7685 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7686 7687 bio->cpl = *cpl; 7688 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7689 } 7690 7691 static void 7692 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7693 { 7694 struct nvme_bdev_io *bio = ref; 7695 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7696 7697 bio->cpl = *cpl; 7698 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7699 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7700 } 7701 7702 static void 7703 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7704 { 7705 struct nvme_bdev_io *bio = ref; 7706 struct iovec *iov; 7707 7708 bio->iov_offset = sgl_offset; 7709 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7710 iov = &bio->iovs[bio->iovpos]; 7711 if (bio->iov_offset < iov->iov_len) { 7712 break; 7713 } 7714 7715 bio->iov_offset -= iov->iov_len; 7716 } 7717 } 7718 7719 static int 7720 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7721 { 7722 struct nvme_bdev_io *bio = ref; 7723 struct iovec *iov; 7724 7725 assert(bio->iovpos < bio->iovcnt); 7726 7727 iov = &bio->iovs[bio->iovpos]; 7728 7729 *address = iov->iov_base; 7730 *length = iov->iov_len; 7731 7732 if (bio->iov_offset) { 7733 assert(bio->iov_offset <= iov->iov_len); 7734 *address += bio->iov_offset; 7735 *length -= bio->iov_offset; 7736 } 7737 7738 bio->iov_offset += *length; 7739 if (bio->iov_offset == iov->iov_len) { 7740 bio->iovpos++; 7741 bio->iov_offset = 0; 7742 } 7743 7744 return 0; 7745 } 7746 7747 static void 7748 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7749 { 7750 struct nvme_bdev_io *bio = ref; 7751 struct iovec *iov; 7752 7753 bio->fused_iov_offset = sgl_offset; 
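/* Walk the fused-command iovec array until the iovec containing sgl_offset is found;
 * whatever remains becomes the offset within that iovec.
 */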
7754 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7755 iov = &bio->fused_iovs[bio->fused_iovpos]; 7756 if (bio->fused_iov_offset < iov->iov_len) { 7757 break; 7758 } 7759 7760 bio->fused_iov_offset -= iov->iov_len; 7761 } 7762 } 7763 7764 static int 7765 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7766 { 7767 struct nvme_bdev_io *bio = ref; 7768 struct iovec *iov; 7769 7770 assert(bio->fused_iovpos < bio->fused_iovcnt); 7771 7772 iov = &bio->fused_iovs[bio->fused_iovpos]; 7773 7774 *address = iov->iov_base; 7775 *length = iov->iov_len; 7776 7777 if (bio->fused_iov_offset) { 7778 assert(bio->fused_iov_offset <= iov->iov_len); 7779 *address += bio->fused_iov_offset; 7780 *length -= bio->fused_iov_offset; 7781 } 7782 7783 bio->fused_iov_offset += *length; 7784 if (bio->fused_iov_offset == iov->iov_len) { 7785 bio->fused_iovpos++; 7786 bio->fused_iov_offset = 0; 7787 } 7788 7789 return 0; 7790 } 7791 7792 static int 7793 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7794 void *md, uint64_t lba_count, uint64_t lba) 7795 { 7796 int rc; 7797 7798 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7799 lba_count, lba); 7800 7801 bio->iovs = iov; 7802 bio->iovcnt = iovcnt; 7803 bio->iovpos = 0; 7804 bio->iov_offset = 0; 7805 7806 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7807 bio->io_path->qpair->qpair, 7808 lba, lba_count, 7809 bdev_nvme_no_pi_readv_done, bio, 0, 7810 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7811 md, 0, 0); 7812 7813 if (rc != 0 && rc != -ENOMEM) { 7814 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7815 } 7816 return rc; 7817 } 7818 7819 static int 7820 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7821 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7822 struct spdk_memory_domain *domain, void *domain_ctx, 7823 struct spdk_accel_sequence *seq) 7824 { 7825 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7826 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7827 int rc; 7828 7829 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7830 lba_count, lba); 7831 7832 bio->iovs = iov; 7833 bio->iovcnt = iovcnt; 7834 bio->iovpos = 0; 7835 bio->iov_offset = 0; 7836 7837 if (domain != NULL || seq != NULL) { 7838 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7839 bio->ext_opts.memory_domain = domain; 7840 bio->ext_opts.memory_domain_ctx = domain_ctx; 7841 bio->ext_opts.io_flags = flags; 7842 bio->ext_opts.metadata = md; 7843 bio->ext_opts.accel_sequence = seq; 7844 7845 if (iovcnt == 1) { 7846 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7847 bio, &bio->ext_opts); 7848 } else { 7849 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7850 bdev_nvme_readv_done, bio, 7851 bdev_nvme_queued_reset_sgl, 7852 bdev_nvme_queued_next_sge, 7853 &bio->ext_opts); 7854 } 7855 } else if (iovcnt == 1) { 7856 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7857 md, lba, lba_count, bdev_nvme_readv_done, 7858 bio, flags, 0, 0); 7859 } else { 7860 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7861 bdev_nvme_readv_done, bio, flags, 7862 bdev_nvme_queued_reset_sgl, 7863 bdev_nvme_queued_next_sge, md, 0, 0); 7864 } 7865 7866 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7867 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7868 } 7869 
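/* -ENOMEM is not logged here because it only means the queue is temporarily exhausted;
 * the request is expected to be queued and retried by the caller in that case.
 */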
return rc; 7870 } 7871 7872 static int 7873 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7874 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7875 struct spdk_memory_domain *domain, void *domain_ctx, 7876 struct spdk_accel_sequence *seq, 7877 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 7878 { 7879 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7880 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7881 int rc; 7882 7883 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7884 lba_count, lba); 7885 7886 bio->iovs = iov; 7887 bio->iovcnt = iovcnt; 7888 bio->iovpos = 0; 7889 bio->iov_offset = 0; 7890 7891 if (domain != NULL || seq != NULL) { 7892 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7893 bio->ext_opts.memory_domain = domain; 7894 bio->ext_opts.memory_domain_ctx = domain_ctx; 7895 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 7896 bio->ext_opts.cdw13 = cdw13.raw; 7897 bio->ext_opts.metadata = md; 7898 bio->ext_opts.accel_sequence = seq; 7899 7900 if (iovcnt == 1) { 7901 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7902 bio, &bio->ext_opts); 7903 } else { 7904 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7905 bdev_nvme_writev_done, bio, 7906 bdev_nvme_queued_reset_sgl, 7907 bdev_nvme_queued_next_sge, 7908 &bio->ext_opts); 7909 } 7910 } else if (iovcnt == 1) { 7911 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7912 md, lba, lba_count, bdev_nvme_writev_done, 7913 bio, flags, 0, 0); 7914 } else { 7915 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7916 bdev_nvme_writev_done, bio, flags, 7917 bdev_nvme_queued_reset_sgl, 7918 bdev_nvme_queued_next_sge, md, 0, 0); 7919 } 7920 7921 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7922 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7923 } 7924 return rc; 7925 } 7926 7927 static int 7928 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7929 void *md, uint64_t lba_count, uint64_t zslba, 7930 uint32_t flags) 7931 { 7932 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7933 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7934 int rc; 7935 7936 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7937 lba_count, zslba); 7938 7939 bio->iovs = iov; 7940 bio->iovcnt = iovcnt; 7941 bio->iovpos = 0; 7942 bio->iov_offset = 0; 7943 7944 if (iovcnt == 1) { 7945 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7946 lba_count, 7947 bdev_nvme_zone_appendv_done, bio, 7948 flags, 7949 0, 0); 7950 } else { 7951 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7952 bdev_nvme_zone_appendv_done, bio, flags, 7953 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7954 md, 0, 0); 7955 } 7956 7957 if (rc != 0 && rc != -ENOMEM) { 7958 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7959 } 7960 return rc; 7961 } 7962 7963 static int 7964 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7965 void *md, uint64_t lba_count, uint64_t lba, 7966 uint32_t flags) 7967 { 7968 int rc; 7969 7970 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7971 lba_count, lba); 7972 7973 bio->iovs = iov; 7974 bio->iovcnt = iovcnt; 7975 bio->iovpos = 0; 7976 bio->iov_offset = 0; 7977 7978 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7979 bio->io_path->qpair->qpair, 7980 lba, lba_count, 7981 bdev_nvme_comparev_done, bio, flags, 7982 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7983 md, 0, 0); 7984 7985 if (rc != 0 && rc != -ENOMEM) { 7986 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7987 } 7988 return rc; 7989 } 7990 7991 static int 7992 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7993 struct iovec *write_iov, int write_iovcnt, 7994 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7995 { 7996 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7997 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7998 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7999 int rc; 8000 8001 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8002 lba_count, lba); 8003 8004 bio->iovs = cmp_iov; 8005 bio->iovcnt = cmp_iovcnt; 8006 bio->iovpos = 0; 8007 bio->iov_offset = 0; 8008 bio->fused_iovs = write_iov; 8009 bio->fused_iovcnt = write_iovcnt; 8010 bio->fused_iovpos = 0; 8011 bio->fused_iov_offset = 0; 8012 8013 if (bdev_io->num_retries == 0) { 8014 bio->first_fused_submitted = false; 8015 bio->first_fused_completed = false; 8016 } 8017 8018 if (!bio->first_fused_submitted) { 8019 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8020 memset(&bio->cpl, 0, sizeof(bio->cpl)); 8021 8022 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 8023 bdev_nvme_comparev_and_writev_done, bio, flags, 8024 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 8025 if (rc == 0) { 8026 bio->first_fused_submitted = true; 8027 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8028 } else { 8029 if (rc != -ENOMEM) { 8030 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 8031 } 8032 return rc; 8033 } 8034 } 8035 8036 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 8037 8038 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8039 bdev_nvme_comparev_and_writev_done, bio, flags, 8040 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 8041 if (rc != 0 && rc != -ENOMEM) { 8042 SPDK_ERRLOG("write failed: rc = %d\n", rc); 8043 rc = 0; 8044 } 8045 8046 return rc; 8047 } 8048 8049 static int 8050 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8051 { 8052 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 8053 struct spdk_nvme_dsm_range *range; 8054 uint64_t offset, remaining; 8055 uint64_t num_ranges_u64; 8056 uint16_t num_ranges; 8057 int rc; 8058 8059 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8060 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8061 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8062 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8063 return -EINVAL; 8064 } 8065 num_ranges = (uint16_t)num_ranges_u64; 8066 8067 offset = offset_blocks; 8068 remaining = num_blocks; 8069 range = &dsm_ranges[0]; 8070 8071 /* Fill max-size ranges until the remaining blocks fit into one range */ 8072 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8073 range->attributes.raw = 0; 8074 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8075 range->starting_lba = offset; 8076 8077 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8078 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8079 range++; 8080 } 8081 8082 /* Final range describes the remaining 
blocks */ 8083 range->attributes.raw = 0; 8084 range->length = remaining; 8085 range->starting_lba = offset; 8086 8087 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8088 bio->io_path->qpair->qpair, 8089 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8090 dsm_ranges, num_ranges, 8091 bdev_nvme_queued_done, bio); 8092 8093 return rc; 8094 } 8095 8096 static int 8097 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8098 { 8099 if (num_blocks > UINT16_MAX + 1) { 8100 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8101 return -EINVAL; 8102 } 8103 8104 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8105 bio->io_path->qpair->qpair, 8106 offset_blocks, num_blocks, 8107 bdev_nvme_queued_done, bio, 8108 0); 8109 } 8110 8111 static int 8112 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8113 struct spdk_bdev_zone_info *info) 8114 { 8115 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8116 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8117 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8118 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8119 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8120 8121 if (zone_id % zone_size != 0) { 8122 return -EINVAL; 8123 } 8124 8125 if (num_zones > total_zones || !num_zones) { 8126 return -EINVAL; 8127 } 8128 8129 assert(!bio->zone_report_buf); 8130 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8131 if (!bio->zone_report_buf) { 8132 return -ENOMEM; 8133 } 8134 8135 bio->handled_zones = 0; 8136 8137 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8138 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8139 bdev_nvme_get_zone_info_done, bio); 8140 } 8141 8142 static int 8143 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8144 enum spdk_bdev_zone_action action) 8145 { 8146 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8147 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8148 8149 switch (action) { 8150 case SPDK_BDEV_ZONE_CLOSE: 8151 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8152 bdev_nvme_zone_management_done, bio); 8153 case SPDK_BDEV_ZONE_FINISH: 8154 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8155 bdev_nvme_zone_management_done, bio); 8156 case SPDK_BDEV_ZONE_OPEN: 8157 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8158 bdev_nvme_zone_management_done, bio); 8159 case SPDK_BDEV_ZONE_RESET: 8160 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8161 bdev_nvme_zone_management_done, bio); 8162 case SPDK_BDEV_ZONE_OFFLINE: 8163 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8164 bdev_nvme_zone_management_done, bio); 8165 default: 8166 return -EINVAL; 8167 } 8168 } 8169 8170 static void 8171 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8172 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8173 { 8174 struct nvme_io_path *io_path; 8175 struct nvme_ctrlr *nvme_ctrlr; 8176 uint32_t max_xfer_size; 8177 int rc = -ENXIO; 8178 8179 /* Choose the first ctrlr which is not failed. */ 8180 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8181 nvme_ctrlr = io_path->qpair->ctrlr; 8182 8183 /* We should skip any unavailable nvme_ctrlr rather than checking 8184 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8185 */ 8186 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8187 continue; 8188 } 8189 8190 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8191 8192 if (nbytes > max_xfer_size) { 8193 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8194 rc = -EINVAL; 8195 goto err; 8196 } 8197 8198 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8199 bdev_nvme_admin_passthru_done, bio); 8200 if (rc == 0) { 8201 return; 8202 } 8203 } 8204 8205 err: 8206 bdev_nvme_admin_complete(bio, rc); 8207 } 8208 8209 static int 8210 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8211 void *buf, size_t nbytes) 8212 { 8213 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8214 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8215 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8216 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8217 8218 if (nbytes > max_xfer_size) { 8219 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8220 return -EINVAL; 8221 } 8222 8223 /* 8224 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8225 * so fill it out automatically. 8226 */ 8227 cmd->nsid = spdk_nvme_ns_get_id(ns); 8228 8229 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8230 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8231 } 8232 8233 static int 8234 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8235 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8236 { 8237 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8238 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8239 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8240 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8241 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8242 8243 if (nbytes > max_xfer_size) { 8244 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8245 return -EINVAL; 8246 } 8247 8248 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8249 SPDK_ERRLOG("invalid meta data buffer size\n"); 8250 return -EINVAL; 8251 } 8252 8253 /* 8254 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8255 * so fill it out automatically. 
8256 */ 8257 cmd->nsid = spdk_nvme_ns_get_id(ns); 8258 8259 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8260 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8261 } 8262 8263 static int 8264 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8265 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8266 size_t nbytes, void *md_buf, size_t md_len) 8267 { 8268 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8269 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8270 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8271 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8272 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8273 8274 bio->iovs = iov; 8275 bio->iovcnt = iovcnt; 8276 bio->iovpos = 0; 8277 bio->iov_offset = 0; 8278 8279 if (nbytes > max_xfer_size) { 8280 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8281 return -EINVAL; 8282 } 8283 8284 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8285 SPDK_ERRLOG("invalid meta data buffer size\n"); 8286 return -EINVAL; 8287 } 8288 8289 /* 8290 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8291 * require a nsid, so fill it out automatically. 8292 */ 8293 cmd->nsid = spdk_nvme_ns_get_id(ns); 8294 8295 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8296 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8297 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8298 } 8299 8300 static void 8301 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8302 struct nvme_bdev_io *bio_to_abort) 8303 { 8304 struct nvme_io_path *io_path; 8305 int rc = 0; 8306 8307 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8308 if (rc == 0) { 8309 bdev_nvme_admin_complete(bio, 0); 8310 return; 8311 } 8312 8313 io_path = bio_to_abort->io_path; 8314 if (io_path != NULL) { 8315 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8316 io_path->qpair->qpair, 8317 bio_to_abort, 8318 bdev_nvme_abort_done, bio); 8319 } else { 8320 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8321 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8322 NULL, 8323 bio_to_abort, 8324 bdev_nvme_abort_done, bio); 8325 8326 if (rc != -ENOENT) { 8327 break; 8328 } 8329 } 8330 } 8331 8332 if (rc != 0) { 8333 /* If no command was found or there was any error, complete the abort 8334 * request with failure. 
8335 */ 8336 bdev_nvme_admin_complete(bio, rc); 8337 } 8338 } 8339 8340 static int 8341 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8342 uint64_t num_blocks) 8343 { 8344 struct spdk_nvme_scc_source_range range = { 8345 .slba = src_offset_blocks, 8346 .nlb = num_blocks - 1 8347 }; 8348 8349 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8350 bio->io_path->qpair->qpair, 8351 &range, 1, dst_offset_blocks, 8352 bdev_nvme_queued_done, bio); 8353 } 8354 8355 static void 8356 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8357 { 8358 const char *action; 8359 uint32_t i; 8360 8361 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8362 action = "reset"; 8363 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8364 action = "abort"; 8365 } else { 8366 action = "none"; 8367 } 8368 8369 spdk_json_write_object_begin(w); 8370 8371 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8372 8373 spdk_json_write_named_object_begin(w, "params"); 8374 spdk_json_write_named_string(w, "action_on_timeout", action); 8375 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8376 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8377 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8378 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8379 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8380 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8381 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8382 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8383 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8384 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8385 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8386 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8387 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8388 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8389 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8390 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8391 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8392 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8393 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8394 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8395 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8396 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8397 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8398 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8399 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8400 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8401 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8402 for (i = 0; i < 32; ++i) { 8403 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8404 
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8405 } 8406 } 8407 spdk_json_write_array_end(w); 8408 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8409 for (i = 0; i < 32; ++i) { 8410 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8411 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8412 } 8413 } 8414 8415 spdk_json_write_array_end(w); 8416 spdk_json_write_object_end(w); 8417 8418 spdk_json_write_object_end(w); 8419 } 8420 8421 static void 8422 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8423 { 8424 struct spdk_nvme_transport_id trid; 8425 8426 spdk_json_write_object_begin(w); 8427 8428 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8429 8430 spdk_json_write_named_object_begin(w, "params"); 8431 spdk_json_write_named_string(w, "name", ctx->name); 8432 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8433 8434 trid = ctx->trid; 8435 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8436 nvme_bdev_dump_trid_json(&trid, w); 8437 8438 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8439 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8440 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8441 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8442 ctx->bdev_opts.fast_io_fail_timeout_sec); 8443 spdk_json_write_object_end(w); 8444 8445 spdk_json_write_object_end(w); 8446 } 8447 8448 #ifdef SPDK_CONFIG_NVME_CUSE 8449 static void 8450 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8451 struct nvme_ctrlr *nvme_ctrlr) 8452 { 8453 size_t cuse_name_size = 128; 8454 char cuse_name[cuse_name_size]; 8455 8456 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8457 cuse_name, &cuse_name_size) != 0) { 8458 return; 8459 } 8460 8461 spdk_json_write_object_begin(w); 8462 8463 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8464 8465 spdk_json_write_named_object_begin(w, "params"); 8466 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8467 spdk_json_write_object_end(w); 8468 8469 spdk_json_write_object_end(w); 8470 } 8471 #endif 8472 8473 static void 8474 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8475 struct nvme_ctrlr *nvme_ctrlr) 8476 { 8477 struct spdk_nvme_transport_id *trid; 8478 const struct spdk_nvme_ctrlr_opts *opts; 8479 8480 if (nvme_ctrlr->opts.from_discovery_service) { 8481 /* Do not emit an RPC for this - it will be implicitly 8482 * covered by a separate bdev_nvme_start_discovery or 8483 * bdev_nvme_start_mdns_discovery RPC. 
8484 */ 8485 return; 8486 } 8487 8488 trid = &nvme_ctrlr->active_path_id->trid; 8489 8490 spdk_json_write_object_begin(w); 8491 8492 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8493 8494 spdk_json_write_named_object_begin(w, "params"); 8495 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8496 nvme_bdev_dump_trid_json(trid, w); 8497 spdk_json_write_named_bool(w, "prchk_reftag", 8498 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8499 spdk_json_write_named_bool(w, "prchk_guard", 8500 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8501 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8502 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8503 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8504 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8505 if (nvme_ctrlr->psk != NULL) { 8506 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8507 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8508 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8509 } 8510 8511 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8512 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8513 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8514 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8515 if (opts->src_addr[0] != '\0') { 8516 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8517 } 8518 if (opts->src_svcid[0] != '\0') { 8519 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8520 } 8521 8522 spdk_json_write_object_end(w); 8523 8524 spdk_json_write_object_end(w); 8525 } 8526 8527 static void 8528 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8529 { 8530 spdk_json_write_object_begin(w); 8531 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8532 8533 spdk_json_write_named_object_begin(w, "params"); 8534 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8535 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8536 spdk_json_write_object_end(w); 8537 8538 spdk_json_write_object_end(w); 8539 } 8540 8541 static int 8542 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8543 { 8544 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8545 struct nvme_ctrlr *nvme_ctrlr; 8546 struct discovery_ctx *ctx; 8547 8548 bdev_nvme_opts_config_json(w); 8549 8550 pthread_mutex_lock(&g_bdev_nvme_mutex); 8551 8552 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8553 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8554 nvme_ctrlr_config_json(w, nvme_ctrlr); 8555 8556 #ifdef SPDK_CONFIG_NVME_CUSE 8557 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8558 #endif 8559 } 8560 } 8561 8562 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8563 if (!ctx->from_mdns_discovery_service) { 8564 bdev_nvme_discovery_config_json(w, ctx); 8565 } 8566 } 8567 8568 bdev_nvme_mdns_discovery_config_json(w); 8569 8570 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8571 * before enabling hotplug poller. 
8572 */ 8573 bdev_nvme_hotplug_config_json(w); 8574 8575 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8576 return 0; 8577 } 8578 8579 struct spdk_nvme_ctrlr * 8580 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8581 { 8582 struct nvme_bdev *nbdev; 8583 struct nvme_ns *nvme_ns; 8584 8585 if (!bdev || bdev->module != &nvme_if) { 8586 return NULL; 8587 } 8588 8589 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8590 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8591 assert(nvme_ns != NULL); 8592 8593 return nvme_ns->ctrlr->ctrlr; 8594 } 8595 8596 static bool 8597 nvme_io_path_is_current(struct nvme_io_path *io_path) 8598 { 8599 const struct nvme_bdev_channel *nbdev_ch; 8600 bool current; 8601 8602 if (!nvme_io_path_is_available(io_path)) { 8603 return false; 8604 } 8605 8606 nbdev_ch = io_path->nbdev_ch; 8607 if (nbdev_ch == NULL) { 8608 current = false; 8609 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8610 struct nvme_io_path *optimized_io_path = NULL; 8611 8612 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8613 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8614 break; 8615 } 8616 } 8617 8618 /* A non-optimized path is only current if there are no optimized paths. */ 8619 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 8620 (optimized_io_path == NULL); 8621 } else { 8622 if (nbdev_ch->current_io_path) { 8623 current = (io_path == nbdev_ch->current_io_path); 8624 } else { 8625 struct nvme_io_path *first_path; 8626 8627 /* We arrived here as there are no optimized paths for active-passive 8628 * mode. Check if this io_path is the first one available on the list. 8629 */ 8630 current = false; 8631 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 8632 if (nvme_io_path_is_available(first_path)) { 8633 current = (io_path == first_path); 8634 break; 8635 } 8636 } 8637 } 8638 } 8639 8640 return current; 8641 } 8642 8643 void 8644 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8645 { 8646 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8647 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8648 const struct spdk_nvme_ctrlr_data *cdata; 8649 const struct spdk_nvme_transport_id *trid; 8650 const char *adrfam_str; 8651 8652 spdk_json_write_object_begin(w); 8653 8654 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8655 8656 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8657 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8658 8659 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8660 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 8661 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8662 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8663 8664 spdk_json_write_named_object_begin(w, "transport"); 8665 spdk_json_write_named_string(w, "trtype", trid->trstring); 8666 spdk_json_write_named_string(w, "traddr", trid->traddr); 8667 if (trid->trsvcid[0] != '\0') { 8668 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8669 } 8670 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8671 if (adrfam_str) { 8672 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8673 } 8674 spdk_json_write_object_end(w); 8675 8676 spdk_json_write_object_end(w); 8677 } 8678 8679 void 8680 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8681 { 8682 struct discovery_ctx *ctx; 8683 struct 
discovery_entry_ctx *entry_ctx; 8684 8685 spdk_json_write_array_begin(w); 8686 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8687 spdk_json_write_object_begin(w); 8688 spdk_json_write_named_string(w, "name", ctx->name); 8689 8690 spdk_json_write_named_object_begin(w, "trid"); 8691 nvme_bdev_dump_trid_json(&ctx->trid, w); 8692 spdk_json_write_object_end(w); 8693 8694 spdk_json_write_named_array_begin(w, "referrals"); 8695 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8696 spdk_json_write_object_begin(w); 8697 spdk_json_write_named_object_begin(w, "trid"); 8698 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8699 spdk_json_write_object_end(w); 8700 spdk_json_write_object_end(w); 8701 } 8702 spdk_json_write_array_end(w); 8703 8704 spdk_json_write_object_end(w); 8705 } 8706 spdk_json_write_array_end(w); 8707 } 8708 8709 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8710 8711 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8712 { 8713 struct spdk_trace_tpoint_opts opts[] = { 8714 { 8715 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8716 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8717 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8718 }, 8719 { 8720 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8721 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8722 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8723 } 8724 }; 8725 8726 8727 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8728 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8729 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8730 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8731 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8732 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8733 } 8734