1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 36 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 37 38 #define NSID_STR_LEN 10 39 40 #define SPDK_CONTROLLER_NAME_MAX 512 41 42 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 43 44 struct nvme_bdev_io { 45 /** array of iovecs to transfer. */ 46 struct iovec *iovs; 47 48 /** Number of iovecs in iovs array. */ 49 int iovcnt; 50 51 /** Current iovec position. */ 52 int iovpos; 53 54 /** Offset in current iovec. */ 55 uint32_t iov_offset; 56 57 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 58 * being reset in a reset I/O. 59 */ 60 struct nvme_io_path *io_path; 61 62 /** array of iovecs to transfer. */ 63 struct iovec *fused_iovs; 64 65 /** Number of iovecs in iovs array. */ 66 int fused_iovcnt; 67 68 /** Current iovec position. */ 69 int fused_iovpos; 70 71 /** Offset in current iovec. */ 72 uint32_t fused_iov_offset; 73 74 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 75 struct spdk_nvme_cpl cpl; 76 77 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 78 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 79 80 /** Keeps track if first of fused commands was submitted */ 81 bool first_fused_submitted; 82 83 /** Keeps track if first of fused commands was completed */ 84 bool first_fused_completed; 85 86 /** Temporary pointer to zone report buffer */ 87 struct spdk_nvme_zns_zone_report *zone_report_buf; 88 89 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 90 uint64_t handled_zones; 91 92 /** Expiration value in ticks to retry the current I/O. */ 93 uint64_t retry_ticks; 94 95 /* How many times the current I/O was retried. */ 96 int32_t retry_count; 97 98 /* Current tsc at submit time. 
*/ 99 uint64_t submit_tsc; 100 101 /* Used to put nvme_bdev_io into the list */ 102 TAILQ_ENTRY(nvme_bdev_io) retry_link; 103 }; 104 105 struct nvme_probe_skip_entry { 106 struct spdk_nvme_transport_id trid; 107 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 108 }; 109 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 110 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 111 g_skipped_nvme_ctrlrs); 112 113 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 114 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 116 117 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 120 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 121 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 122 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 123 124 static struct spdk_bdev_nvme_opts g_opts = { 125 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 126 .timeout_us = 0, 127 .timeout_admin_us = 0, 128 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 129 .transport_retry_count = 4, 130 .arbitration_burst = 0, 131 .low_priority_weight = 0, 132 .medium_priority_weight = 0, 133 .high_priority_weight = 0, 134 .nvme_adminq_poll_period_us = 10000ULL, 135 .nvme_ioq_poll_period_us = 0, 136 .io_queue_requests = 0, 137 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 138 .bdev_retry_count = 3, 139 .transport_ack_timeout = 0, 140 .ctrlr_loss_timeout_sec = 0, 141 .reconnect_delay_sec = 0, 142 .fast_io_fail_timeout_sec = 0, 143 .disable_auto_failback = false, 144 .generate_uuids = false, 145 .transport_tos = 0, 146 .nvme_error_stat = false, 147 .io_path_stat = false, 148 .allow_accel_sequence = false, 149 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 150 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 151 }; 152 153 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 154 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 155 156 static int g_hot_insert_nvme_controller_index = 0; 157 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 158 static bool g_nvme_hotplug_enabled = false; 159 struct spdk_thread *g_bdev_nvme_init_thread; 160 static struct spdk_poller *g_hotplug_poller; 161 static struct spdk_poller *g_hotplug_probe_poller; 162 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 163 164 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 165 struct nvme_async_probe_ctx *ctx); 166 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 167 struct nvme_async_probe_ctx *ctx); 168 static int bdev_nvme_library_init(void); 169 static void bdev_nvme_library_fini(void); 170 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 171 struct spdk_bdev_io *bdev_io); 172 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 173 struct spdk_bdev_io *bdev_io); 174 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 175 void *md, uint64_t lba_count, uint64_t lba, 176 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 177 struct spdk_accel_sequence *seq); 178 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 179 void *md, uint64_t lba_count, uint64_t lba); 180 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 181 void *md, uint64_t 
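/* Illustrative sketch, not part of this driver: the dhchap_digests and dhchap_dhgroups
 * defaults above are plain bitmasks built with SPDK_BIT(). A hypothetical caller that
 * wants to restrict DH-HMAC-CHAP negotiation to SHA-512 and the 8192-bit DH group would
 * build a narrower mask the same way before handing the options to the module's option
 * setter (the setter's exact name varies by SPDK release and is not assumed here):
 *
 *     struct spdk_bdev_nvme_opts opts = g_opts;
 *     opts.dhchap_digests  = SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512);
 *     opts.dhchap_dhgroups = SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192);
 *
 * Bits cleared here are intended to exclude the corresponding algorithms from
 * DH-HMAC-CHAP authentication for controllers created afterwards.
 */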
lba_count, uint64_t lba, 182 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 183 struct spdk_accel_sequence *seq); 184 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 185 void *md, uint64_t lba_count, 186 uint64_t zslba, uint32_t flags); 187 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 188 void *md, uint64_t lba_count, uint64_t lba, 189 uint32_t flags); 190 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 191 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 192 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 193 uint32_t flags); 194 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 195 uint32_t num_zones, struct spdk_bdev_zone_info *info); 196 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 197 enum spdk_bdev_zone_action action); 198 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 199 struct nvme_bdev_io *bio, 200 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 201 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 202 void *buf, size_t nbytes); 203 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 204 void *buf, size_t nbytes, void *md_buf, size_t md_len); 205 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 206 struct iovec *iov, int iovcnt, size_t nbytes, 207 void *md_buf, size_t md_len); 208 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 209 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 210 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 211 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 212 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 213 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 214 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 215 216 static struct nvme_ns *nvme_ns_alloc(void); 217 static void nvme_ns_free(struct nvme_ns *ns); 218 219 static int 220 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 221 { 222 return ns1->id < ns2->id ? 
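/* Sketch for clarity, not driver code: the comparator being completed here must return a
 * negative value, zero, or a positive value so the red-black tree macros can order
 * namespaces by ID. The expression "a < b ? -1 : a > b" is equivalent to the common
 * branch-free idiom shown in this hypothetical variant:
 *
 *     static int
 *     nvme_ns_cmp_alt(struct nvme_ns *ns1, struct nvme_ns *ns2)
 *     {
 *             return (ns1->id > ns2->id) - (ns1->id < ns2->id);   // -1, 0, or 1
 *     }
 *
 * Either form satisfies the contract expected by RB_GENERATE_STATIC() below.
 */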
-1 : ns1->id > ns2->id; 223 } 224 225 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 226 227 struct spdk_nvme_qpair * 228 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 229 { 230 struct nvme_ctrlr_channel *ctrlr_ch; 231 232 assert(ctrlr_io_ch != NULL); 233 234 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 235 236 return ctrlr_ch->qpair->qpair; 237 } 238 239 static int 240 bdev_nvme_get_ctx_size(void) 241 { 242 return sizeof(struct nvme_bdev_io); 243 } 244 245 static struct spdk_bdev_module nvme_if = { 246 .name = "nvme", 247 .async_fini = true, 248 .module_init = bdev_nvme_library_init, 249 .module_fini = bdev_nvme_library_fini, 250 .config_json = bdev_nvme_config_json, 251 .get_ctx_size = bdev_nvme_get_ctx_size, 252 253 }; 254 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 255 256 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 257 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 258 bool g_bdev_nvme_module_finish; 259 260 struct nvme_bdev_ctrlr * 261 nvme_bdev_ctrlr_get_by_name(const char *name) 262 { 263 struct nvme_bdev_ctrlr *nbdev_ctrlr; 264 265 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 266 if (strcmp(name, nbdev_ctrlr->name) == 0) { 267 break; 268 } 269 } 270 271 return nbdev_ctrlr; 272 } 273 274 static struct nvme_ctrlr * 275 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 276 const struct spdk_nvme_transport_id *trid) 277 { 278 struct nvme_ctrlr *nvme_ctrlr; 279 280 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 281 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 282 break; 283 } 284 } 285 286 return nvme_ctrlr; 287 } 288 289 struct nvme_ctrlr * 290 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 291 uint16_t cntlid) 292 { 293 struct nvme_ctrlr *nvme_ctrlr; 294 const struct spdk_nvme_ctrlr_data *cdata; 295 296 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 297 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 298 if (cdata->cntlid == cntlid) { 299 break; 300 } 301 } 302 303 return nvme_ctrlr; 304 } 305 306 static struct nvme_bdev * 307 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 308 { 309 struct nvme_bdev *bdev; 310 311 pthread_mutex_lock(&g_bdev_nvme_mutex); 312 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 313 if (bdev->nsid == nsid) { 314 break; 315 } 316 } 317 pthread_mutex_unlock(&g_bdev_nvme_mutex); 318 319 return bdev; 320 } 321 322 struct nvme_ns * 323 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 324 { 325 struct nvme_ns ns; 326 327 assert(nsid > 0); 328 329 ns.id = nsid; 330 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 331 } 332 333 struct nvme_ns * 334 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 335 { 336 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 337 } 338 339 struct nvme_ns * 340 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 341 { 342 if (ns == NULL) { 343 return NULL; 344 } 345 346 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 347 } 348 349 static struct nvme_ctrlr * 350 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 351 { 352 struct nvme_bdev_ctrlr *nbdev_ctrlr; 353 struct nvme_ctrlr *nvme_ctrlr = NULL; 354 355 pthread_mutex_lock(&g_bdev_nvme_mutex); 356 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 357 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 358 if (nvme_ctrlr != NULL) { 359 break; 360 } 361 } 
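/* Illustrative sketch only, not driver code: the helpers above give two common access
 * patterns for a controller's namespace tree. A point lookup uses a stack-allocated key
 * with only "id" filled in (see nvme_ctrlr_get_ns()), and a full walk combines the
 * first/next helpers, as a hypothetical consumer might do:
 *
 *     struct nvme_ns *ns;
 *
 *     for (ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
 *          ns != NULL;
 *          ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, ns)) {
 *             printf("nsid %u\n", ns->id);
 *     }
 *
 * Callers in this file typically serialize such access through g_bdev_nvme_mutex or the
 * per-controller mutex, as the surrounding functions do.
 */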
362 pthread_mutex_unlock(&g_bdev_nvme_mutex); 363 364 return nvme_ctrlr; 365 } 366 367 struct nvme_ctrlr * 368 nvme_ctrlr_get_by_name(const char *name) 369 { 370 struct nvme_bdev_ctrlr *nbdev_ctrlr; 371 struct nvme_ctrlr *nvme_ctrlr = NULL; 372 373 if (name == NULL) { 374 return NULL; 375 } 376 377 pthread_mutex_lock(&g_bdev_nvme_mutex); 378 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 379 if (nbdev_ctrlr != NULL) { 380 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 381 } 382 pthread_mutex_unlock(&g_bdev_nvme_mutex); 383 384 return nvme_ctrlr; 385 } 386 387 void 388 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 389 { 390 struct nvme_bdev_ctrlr *nbdev_ctrlr; 391 392 pthread_mutex_lock(&g_bdev_nvme_mutex); 393 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 394 fn(nbdev_ctrlr, ctx); 395 } 396 pthread_mutex_unlock(&g_bdev_nvme_mutex); 397 } 398 399 void 400 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 401 { 402 const char *trtype_str; 403 const char *adrfam_str; 404 405 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 406 if (trtype_str) { 407 spdk_json_write_named_string(w, "trtype", trtype_str); 408 } 409 410 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 411 if (adrfam_str) { 412 spdk_json_write_named_string(w, "adrfam", adrfam_str); 413 } 414 415 if (trid->traddr[0] != '\0') { 416 spdk_json_write_named_string(w, "traddr", trid->traddr); 417 } 418 419 if (trid->trsvcid[0] != '\0') { 420 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 421 } 422 423 if (trid->subnqn[0] != '\0') { 424 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 425 } 426 } 427 428 static void 429 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 430 struct nvme_ctrlr *nvme_ctrlr) 431 { 432 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 433 pthread_mutex_lock(&g_bdev_nvme_mutex); 434 435 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 436 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 437 pthread_mutex_unlock(&g_bdev_nvme_mutex); 438 439 return; 440 } 441 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 442 443 pthread_mutex_unlock(&g_bdev_nvme_mutex); 444 445 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 446 447 free(nbdev_ctrlr->name); 448 free(nbdev_ctrlr); 449 } 450 451 static void 452 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 453 { 454 struct nvme_path_id *path_id, *tmp_path; 455 struct nvme_ns *ns, *tmp_ns; 456 457 free(nvme_ctrlr->copied_ana_desc); 458 spdk_free(nvme_ctrlr->ana_log_page); 459 460 if (nvme_ctrlr->opal_dev) { 461 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 462 nvme_ctrlr->opal_dev = NULL; 463 } 464 465 if (nvme_ctrlr->nbdev_ctrlr) { 466 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 467 } 468 469 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 470 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 471 nvme_ns_free(ns); 472 } 473 474 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 475 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 476 free(path_id); 477 } 478 479 pthread_mutex_destroy(&nvme_ctrlr->mutex); 480 spdk_keyring_put_key(nvme_ctrlr->psk); 481 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 482 free(nvme_ctrlr); 483 484 pthread_mutex_lock(&g_bdev_nvme_mutex); 485 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 486 pthread_mutex_unlock(&g_bdev_nvme_mutex); 487 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 488 
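/* Sketch, not driver code: nvme_detach_poller()/nvme_ctrlr_delete() below use the split
 * asynchronous detach API. Outside of a poller, the same pattern can be written as a
 * simple loop (variable names here are hypothetical):
 *
 *     struct spdk_nvme_detach_ctx *dctx = NULL;
 *
 *     if (spdk_nvme_detach_async(ctrlr, &dctx) == 0) {
 *             while (spdk_nvme_detach_poll_async(dctx) == -EAGAIN) {
 *                     ;   // keep polling until the detach completes
 *             }
 *     }
 *
 * The driver instead registers nvme_detach_poller() at a 1 ms period, so the reactor
 * stays responsive while the controller detaches.
 */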
spdk_bdev_module_fini_done(); 489 return; 490 } 491 pthread_mutex_unlock(&g_bdev_nvme_mutex); 492 } 493 494 static int 495 nvme_detach_poller(void *arg) 496 { 497 struct nvme_ctrlr *nvme_ctrlr = arg; 498 int rc; 499 500 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 501 if (rc != -EAGAIN) { 502 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 503 _nvme_ctrlr_delete(nvme_ctrlr); 504 } 505 506 return SPDK_POLLER_BUSY; 507 } 508 509 static void 510 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 511 { 512 int rc; 513 514 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 515 516 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 517 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 518 519 /* If we got here, the reset/detach poller cannot be active */ 520 assert(nvme_ctrlr->reset_detach_poller == NULL); 521 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 522 nvme_ctrlr, 1000); 523 if (nvme_ctrlr->reset_detach_poller == NULL) { 524 SPDK_ERRLOG("Failed to register detach poller\n"); 525 goto error; 526 } 527 528 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 529 if (rc != 0) { 530 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 531 goto error; 532 } 533 534 return; 535 error: 536 /* We don't have a good way to handle errors here, so just do what we can and delete the 537 * controller without detaching the underlying NVMe device. 538 */ 539 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 540 _nvme_ctrlr_delete(nvme_ctrlr); 541 } 542 543 static void 544 nvme_ctrlr_unregister_cb(void *io_device) 545 { 546 struct nvme_ctrlr *nvme_ctrlr = io_device; 547 548 nvme_ctrlr_delete(nvme_ctrlr); 549 } 550 551 static void 552 nvme_ctrlr_unregister(void *ctx) 553 { 554 struct nvme_ctrlr *nvme_ctrlr = ctx; 555 556 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 557 } 558 559 static bool 560 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 561 { 562 if (!nvme_ctrlr->destruct) { 563 return false; 564 } 565 566 if (nvme_ctrlr->ref > 0) { 567 return false; 568 } 569 570 if (nvme_ctrlr->resetting) { 571 return false; 572 } 573 574 if (nvme_ctrlr->ana_log_page_updating) { 575 return false; 576 } 577 578 if (nvme_ctrlr->io_path_cache_clearing) { 579 return false; 580 } 581 582 return true; 583 } 584 585 static void 586 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 587 { 588 pthread_mutex_lock(&nvme_ctrlr->mutex); 589 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 590 591 assert(nvme_ctrlr->ref > 0); 592 nvme_ctrlr->ref--; 593 594 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 595 pthread_mutex_unlock(&nvme_ctrlr->mutex); 596 return; 597 } 598 599 pthread_mutex_unlock(&nvme_ctrlr->mutex); 600 601 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 602 } 603 604 static void 605 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 606 { 607 nbdev_ch->current_io_path = NULL; 608 nbdev_ch->rr_counter = 0; 609 } 610 611 static struct nvme_io_path * 612 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 613 { 614 struct nvme_io_path *io_path; 615 616 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 617 if (io_path->nvme_ns == nvme_ns) { 618 break; 619 } 620 } 621 622 return io_path; 623 } 624 625 static struct nvme_io_path * 626 nvme_io_path_alloc(void) 627 { 628 struct nvme_io_path *io_path; 629 630 io_path = 
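/* Clarifying sketch, not driver code: nvme_ctrlr_release() above only schedules
 * unregistration when every gate in nvme_ctrlr_can_be_unregistered() is clear. Collapsed
 * into one expression, that gate is:
 *
 *     bool ok = nvme_ctrlr->destruct &&
 *               nvme_ctrlr->ref == 0 &&
 *               !nvme_ctrlr->resetting &&
 *               !nvme_ctrlr->ana_log_page_updating &&
 *               !nvme_ctrlr->io_path_cache_clearing;
 *
 * The per-controller mutex is held while evaluating it, and the actual unregistration is
 * bounced to nvme_ctrlr->thread with spdk_thread_exec_msg() so teardown happens on the
 * thread that created the controller.
 */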
calloc(1, sizeof(*io_path)); 631 if (io_path == NULL) { 632 SPDK_ERRLOG("Failed to alloc io_path.\n"); 633 return NULL; 634 } 635 636 if (g_opts.io_path_stat) { 637 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 638 if (io_path->stat == NULL) { 639 free(io_path); 640 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 641 return NULL; 642 } 643 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 644 } 645 646 return io_path; 647 } 648 649 static void 650 nvme_io_path_free(struct nvme_io_path *io_path) 651 { 652 free(io_path->stat); 653 free(io_path); 654 } 655 656 static int 657 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 658 { 659 struct nvme_io_path *io_path; 660 struct spdk_io_channel *ch; 661 struct nvme_ctrlr_channel *ctrlr_ch; 662 struct nvme_qpair *nvme_qpair; 663 664 io_path = nvme_io_path_alloc(); 665 if (io_path == NULL) { 666 return -ENOMEM; 667 } 668 669 io_path->nvme_ns = nvme_ns; 670 671 ch = spdk_get_io_channel(nvme_ns->ctrlr); 672 if (ch == NULL) { 673 nvme_io_path_free(io_path); 674 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 675 return -ENOMEM; 676 } 677 678 ctrlr_ch = spdk_io_channel_get_ctx(ch); 679 680 nvme_qpair = ctrlr_ch->qpair; 681 assert(nvme_qpair != NULL); 682 683 io_path->qpair = nvme_qpair; 684 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 685 686 io_path->nbdev_ch = nbdev_ch; 687 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 688 689 bdev_nvme_clear_current_io_path(nbdev_ch); 690 691 return 0; 692 } 693 694 static void 695 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 696 struct nvme_io_path *io_path) 697 { 698 struct nvme_bdev_io *bio; 699 700 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 701 if (bio->io_path == io_path) { 702 bio->io_path = NULL; 703 } 704 } 705 } 706 707 static void 708 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 709 { 710 struct spdk_io_channel *ch; 711 struct nvme_qpair *nvme_qpair; 712 struct nvme_ctrlr_channel *ctrlr_ch; 713 struct nvme_bdev *nbdev; 714 715 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 716 717 /* Add the statistics to nvme_ns before this path is destroyed. */ 718 pthread_mutex_lock(&nbdev->mutex); 719 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 720 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 721 } 722 pthread_mutex_unlock(&nbdev->mutex); 723 724 bdev_nvme_clear_current_io_path(nbdev_ch); 725 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 726 727 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 728 io_path->nbdev_ch = NULL; 729 730 nvme_qpair = io_path->qpair; 731 assert(nvme_qpair != NULL); 732 733 ctrlr_ch = nvme_qpair->ctrlr_ch; 734 assert(ctrlr_ch != NULL); 735 736 ch = spdk_io_channel_from_ctx(ctrlr_ch); 737 spdk_put_io_channel(ch); 738 739 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 740 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 741 * io_path here but free the io_path when the associated qpair is freed. It is ensured 742 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
743 */ 744 } 745 746 static void 747 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 748 { 749 struct nvme_io_path *io_path, *tmp_io_path; 750 751 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 752 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 753 } 754 } 755 756 static int 757 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 758 { 759 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 760 struct nvme_bdev *nbdev = io_device; 761 struct nvme_ns *nvme_ns; 762 int rc; 763 764 STAILQ_INIT(&nbdev_ch->io_path_list); 765 TAILQ_INIT(&nbdev_ch->retry_io_list); 766 767 pthread_mutex_lock(&nbdev->mutex); 768 769 nbdev_ch->mp_policy = nbdev->mp_policy; 770 nbdev_ch->mp_selector = nbdev->mp_selector; 771 nbdev_ch->rr_min_io = nbdev->rr_min_io; 772 773 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 774 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 775 if (rc != 0) { 776 pthread_mutex_unlock(&nbdev->mutex); 777 778 _bdev_nvme_delete_io_paths(nbdev_ch); 779 return rc; 780 } 781 } 782 pthread_mutex_unlock(&nbdev->mutex); 783 784 return 0; 785 } 786 787 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 788 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 789 */ 790 static inline void 791 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 792 const struct spdk_nvme_cpl *cpl) 793 { 794 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 795 (uintptr_t)bdev_io); 796 if (cpl) { 797 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 798 } else { 799 spdk_bdev_io_complete(bdev_io, status); 800 } 801 } 802 803 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 804 805 static void 806 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 807 { 808 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 809 810 bdev_nvme_abort_retry_ios(nbdev_ch); 811 _bdev_nvme_delete_io_paths(nbdev_ch); 812 } 813 814 static inline bool 815 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 816 { 817 switch (io_type) { 818 case SPDK_BDEV_IO_TYPE_RESET: 819 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 820 case SPDK_BDEV_IO_TYPE_ABORT: 821 return true; 822 default: 823 break; 824 } 825 826 return false; 827 } 828 829 static inline bool 830 nvme_ns_is_active(struct nvme_ns *nvme_ns) 831 { 832 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 833 return false; 834 } 835 836 if (spdk_unlikely(nvme_ns->ns == NULL)) { 837 return false; 838 } 839 840 return true; 841 } 842 843 static inline bool 844 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 845 { 846 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 847 return false; 848 } 849 850 switch (nvme_ns->ana_state) { 851 case SPDK_NVME_ANA_OPTIMIZED_STATE: 852 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 853 return true; 854 default: 855 break; 856 } 857 858 return false; 859 } 860 861 static inline bool 862 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 863 { 864 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 865 return false; 866 } 867 868 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 869 SPDK_NVME_QPAIR_FAILURE_NONE)) { 870 return false; 871 } 872 873 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 874 return false; 875 } 876 877 return true; 878 } 879 880 static inline bool 881 nvme_io_path_is_available(struct nvme_io_path *io_path) 882 { 883 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 
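/* Clarifying sketch, not driver code: an io_path is usable for I/O only when both checks
 * in this function pass, i.e. the qpair is connected and the namespace reports an
 * accessible ANA state (OPTIMIZED or NON_OPTIMIZED). A hypothetical caller counting
 * usable paths on a channel would combine the same predicates:
 *
 *     struct nvme_io_path *p;
 *     uint32_t usable = 0;
 *
 *     STAILQ_FOREACH(p, &nbdev_ch->io_path_list, stailq) {
 *             if (nvme_qpair_is_connected(p->qpair) &&
 *                 nvme_ns_is_accessible(p->nvme_ns)) {
 *                     usable++;
 *             }
 *     }
 *
 * INACCESSIBLE, PERSISTENT_LOSS and CHANGE states fall through the default case in
 * nvme_ns_is_accessible() and leave the path unavailable until the next ANA update.
 */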
884 return false; 885 } 886 887 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 888 return false; 889 } 890 891 return true; 892 } 893 894 static inline bool 895 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 896 { 897 if (nvme_ctrlr->destruct) { 898 return true; 899 } 900 901 if (nvme_ctrlr->fast_io_fail_timedout) { 902 return true; 903 } 904 905 if (nvme_ctrlr->resetting) { 906 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 907 return false; 908 } else { 909 return true; 910 } 911 } 912 913 if (nvme_ctrlr->reconnect_is_delayed) { 914 return false; 915 } 916 917 if (nvme_ctrlr->disabled) { 918 return true; 919 } 920 921 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 922 return true; 923 } else { 924 return false; 925 } 926 } 927 928 static bool 929 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 930 { 931 if (nvme_ctrlr->destruct) { 932 return false; 933 } 934 935 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 936 return false; 937 } 938 939 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 940 return false; 941 } 942 943 if (nvme_ctrlr->disabled) { 944 return false; 945 } 946 947 return true; 948 } 949 950 /* Simulate circular linked list. */ 951 static inline struct nvme_io_path * 952 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 953 { 954 struct nvme_io_path *next_path; 955 956 if (prev_path != NULL) { 957 next_path = STAILQ_NEXT(prev_path, stailq); 958 if (next_path != NULL) { 959 return next_path; 960 } 961 } 962 963 return STAILQ_FIRST(&nbdev_ch->io_path_list); 964 } 965 966 static struct nvme_io_path * 967 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 968 { 969 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 970 971 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 972 973 io_path = start; 974 do { 975 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 976 nvme_ns_is_active(io_path->nvme_ns))) { 977 switch (io_path->nvme_ns->ana_state) { 978 case SPDK_NVME_ANA_OPTIMIZED_STATE: 979 nbdev_ch->current_io_path = io_path; 980 return io_path; 981 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 982 if (non_optimized == NULL) { 983 non_optimized = io_path; 984 } 985 break; 986 default: 987 break; 988 } 989 } 990 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 991 } while (io_path != start); 992 993 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 994 /* We come here only if there is no optimized path. Cache even non_optimized 995 * path for load balance across multiple non_optimized paths. 996 */ 997 nbdev_ch->current_io_path = non_optimized; 998 } 999 1000 return non_optimized; 1001 } 1002 1003 static struct nvme_io_path * 1004 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1005 { 1006 struct nvme_io_path *io_path; 1007 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1008 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1009 uint32_t num_outstanding_reqs; 1010 1011 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1012 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1013 /* The device is currently resetting. 
*/ 1014 continue; 1015 } 1016 1017 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1018 continue; 1019 } 1020 1021 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1022 switch (io_path->nvme_ns->ana_state) { 1023 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1024 if (num_outstanding_reqs < opt_min_qd) { 1025 opt_min_qd = num_outstanding_reqs; 1026 optimized = io_path; 1027 } 1028 break; 1029 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1030 if (num_outstanding_reqs < non_opt_min_qd) { 1031 non_opt_min_qd = num_outstanding_reqs; 1032 non_optimized = io_path; 1033 } 1034 break; 1035 default: 1036 break; 1037 } 1038 } 1039 1040 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1041 if (optimized != NULL) { 1042 return optimized; 1043 } 1044 1045 return non_optimized; 1046 } 1047 1048 static inline struct nvme_io_path * 1049 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1050 { 1051 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1052 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1053 return nbdev_ch->current_io_path; 1054 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1055 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1056 return nbdev_ch->current_io_path; 1057 } 1058 nbdev_ch->rr_counter = 0; 1059 } 1060 } 1061 1062 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1063 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1064 return _bdev_nvme_find_io_path(nbdev_ch); 1065 } else { 1066 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1067 } 1068 } 1069 1070 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1071 * or false otherwise. 1072 * 1073 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1074 * is likely to be non-accessible now but may become accessible. 1075 * 1076 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1077 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1078 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1079 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1080 */ 1081 static bool 1082 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1083 { 1084 struct nvme_io_path *io_path; 1085 1086 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1087 if (io_path->nvme_ns->ana_transition_timedout) { 1088 continue; 1089 } 1090 1091 if (nvme_qpair_is_connected(io_path->qpair) || 1092 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1093 return true; 1094 } 1095 } 1096 1097 return false; 1098 } 1099 1100 static void 1101 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1102 { 1103 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1104 struct spdk_io_channel *ch; 1105 1106 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1107 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1108 } else { 1109 ch = spdk_io_channel_from_ctx(nbdev_ch); 1110 bdev_nvme_submit_request(ch, bdev_io); 1111 } 1112 } 1113 1114 static int 1115 bdev_nvme_retry_ios(void *arg) 1116 { 1117 struct nvme_bdev_channel *nbdev_ch = arg; 1118 struct nvme_bdev_io *bio, *tmp_bio; 1119 uint64_t now, delay_us; 1120 1121 now = spdk_get_ticks(); 1122 1123 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1124 if (bio->retry_ticks > now) { 1125 break; 1126 } 1127 1128 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1129 1130 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1131 } 1132 1133 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1134 1135 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1136 if (bio != NULL) { 1137 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1138 1139 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1140 delay_us); 1141 } 1142 1143 return SPDK_POLLER_BUSY; 1144 } 1145 1146 static void 1147 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1148 struct nvme_bdev_io *bio, uint64_t delay_ms) 1149 { 1150 struct nvme_bdev_io *tmp_bio; 1151 1152 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1153 1154 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1155 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1156 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1157 retry_link); 1158 return; 1159 } 1160 } 1161 1162 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1163 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1164 1165 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1166 1167 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1168 delay_ms * 1000ULL); 1169 } 1170 1171 static void 1172 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1173 { 1174 struct nvme_bdev_io *bio, *tmp_bio; 1175 1176 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1177 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1178 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1179 } 1180 1181 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1182 } 1183 1184 static int 1185 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1186 struct nvme_bdev_io *bio_to_abort) 1187 { 1188 struct nvme_bdev_io *bio; 1189 1190 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1191 if (bio == bio_to_abort) { 1192 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1193 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1194 return 0; 1195 } 1196 } 1197 1198 return -ENOENT; 1199 } 1200 1201 static void 1202 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1203 { 1204 struct nvme_bdev *nbdev; 1205 uint16_t sct, sc; 1206 1207 assert(spdk_nvme_cpl_is_error(cpl)); 1208 1209 nbdev = bdev_io->bdev->ctxt; 1210 1211 if (nbdev->err_stat == NULL) { 1212 return; 1213 } 1214 1215 sct = cpl->status.sct; 1216 sc = cpl->status.sc; 1217 1218 pthread_mutex_lock(&nbdev->mutex); 1219 1220 nbdev->err_stat->status_type[sct]++; 1221 switch (sct) { 1222 case SPDK_NVME_SCT_GENERIC: 1223 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1224 case SPDK_NVME_SCT_MEDIA_ERROR: 1225 case SPDK_NVME_SCT_PATH: 1226 nbdev->err_stat->status[sct][sc]++; 1227 break; 1228 default: 1229 break; 1230 } 1231 1232 pthread_mutex_unlock(&nbdev->mutex); 1233 } 1234 1235 static inline void 1236 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1237 { 1238 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1239 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1240 uint32_t blocklen = bdev_io->bdev->blocklen; 1241 struct spdk_bdev_io_stat *stat; 1242 uint64_t tsc_diff; 1243 1244 if (bio->io_path->stat == NULL) { 1245 return; 1246 } 1247 1248 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1249 stat = bio->io_path->stat; 1250 1251 switch (bdev_io->type) { 1252 case SPDK_BDEV_IO_TYPE_READ: 1253 stat->bytes_read += num_blocks * blocklen; 1254 stat->num_read_ops++; 1255 stat->read_latency_ticks += tsc_diff; 1256 if (stat->max_read_latency_ticks < tsc_diff) { 1257 stat->max_read_latency_ticks = tsc_diff; 1258 } 1259 if (stat->min_read_latency_ticks > tsc_diff) { 1260 stat->min_read_latency_ticks = tsc_diff; 1261 } 1262 break; 1263 case SPDK_BDEV_IO_TYPE_WRITE: 1264 stat->bytes_written += num_blocks * blocklen; 1265 stat->num_write_ops++; 1266 stat->write_latency_ticks += tsc_diff; 1267 if (stat->max_write_latency_ticks < tsc_diff) { 1268 stat->max_write_latency_ticks = tsc_diff; 1269 } 1270 if (stat->min_write_latency_ticks > tsc_diff) { 1271 stat->min_write_latency_ticks = tsc_diff; 1272 } 1273 break; 1274 case SPDK_BDEV_IO_TYPE_UNMAP: 1275 stat->bytes_unmapped += num_blocks * blocklen; 1276 stat->num_unmap_ops++; 1277 stat->unmap_latency_ticks += tsc_diff; 1278 if (stat->max_unmap_latency_ticks < tsc_diff) { 1279 stat->max_unmap_latency_ticks = tsc_diff; 1280 } 1281 if (stat->min_unmap_latency_ticks > 
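/* Side note with a small sketch, not driver code: every branch of this function records
 * latency as a raw TSC delta (tsc_diff), and min/max are tracked per I/O type. Converting
 * such a tick count to microseconds for display uses the same arithmetic as the retry
 * poller's deadline conversion:
 *
 *     uint64_t ticks = stat->max_read_latency_ticks;   // example field
 *     uint64_t usec  = ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();
 *
 * The per-path stats gathered here are folded back into the namespace-wide stats when the
 * io_path is deleted (see _bdev_nvme_delete_io_path() earlier in this file).
 */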
tsc_diff) { 1282 stat->min_unmap_latency_ticks = tsc_diff; 1283 } 1284 break; 1285 case SPDK_BDEV_IO_TYPE_ZCOPY: 1286 /* Track the data in the start phase only */ 1287 if (!bdev_io->u.bdev.zcopy.start) { 1288 break; 1289 } 1290 if (bdev_io->u.bdev.zcopy.populate) { 1291 stat->bytes_read += num_blocks * blocklen; 1292 stat->num_read_ops++; 1293 stat->read_latency_ticks += tsc_diff; 1294 if (stat->max_read_latency_ticks < tsc_diff) { 1295 stat->max_read_latency_ticks = tsc_diff; 1296 } 1297 if (stat->min_read_latency_ticks > tsc_diff) { 1298 stat->min_read_latency_ticks = tsc_diff; 1299 } 1300 } else { 1301 stat->bytes_written += num_blocks * blocklen; 1302 stat->num_write_ops++; 1303 stat->write_latency_ticks += tsc_diff; 1304 if (stat->max_write_latency_ticks < tsc_diff) { 1305 stat->max_write_latency_ticks = tsc_diff; 1306 } 1307 if (stat->min_write_latency_ticks > tsc_diff) { 1308 stat->min_write_latency_ticks = tsc_diff; 1309 } 1310 } 1311 break; 1312 case SPDK_BDEV_IO_TYPE_COPY: 1313 stat->bytes_copied += num_blocks * blocklen; 1314 stat->num_copy_ops++; 1315 stat->copy_latency_ticks += tsc_diff; 1316 if (stat->max_copy_latency_ticks < tsc_diff) { 1317 stat->max_copy_latency_ticks = tsc_diff; 1318 } 1319 if (stat->min_copy_latency_ticks > tsc_diff) { 1320 stat->min_copy_latency_ticks = tsc_diff; 1321 } 1322 break; 1323 default: 1324 break; 1325 } 1326 } 1327 1328 static bool 1329 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1330 const struct spdk_nvme_cpl *cpl, 1331 struct nvme_bdev_channel *nbdev_ch, 1332 uint64_t *_delay_ms) 1333 { 1334 struct nvme_io_path *io_path = bio->io_path; 1335 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1336 const struct spdk_nvme_ctrlr_data *cdata; 1337 1338 if (spdk_nvme_cpl_is_path_error(cpl) || 1339 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1340 !nvme_io_path_is_available(io_path) || 1341 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1342 bdev_nvme_clear_current_io_path(nbdev_ch); 1343 bio->io_path = NULL; 1344 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1345 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1346 io_path->nvme_ns->ana_state_updating = true; 1347 } 1348 } 1349 if (!any_io_path_may_become_available(nbdev_ch)) { 1350 return false; 1351 } 1352 *_delay_ms = 0; 1353 } else { 1354 bio->retry_count++; 1355 1356 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1357 1358 if (cpl->status.crd != 0) { 1359 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1360 } else { 1361 *_delay_ms = 0; 1362 } 1363 } 1364 1365 return true; 1366 } 1367 1368 static inline void 1369 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1370 const struct spdk_nvme_cpl *cpl) 1371 { 1372 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1373 struct nvme_bdev_channel *nbdev_ch; 1374 uint64_t delay_ms; 1375 1376 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1377 1378 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1379 bdev_nvme_update_io_path_stat(bio); 1380 goto complete; 1381 } 1382 1383 /* Update error counts before deciding if retry is needed. 1384 * Hence, error counts may be more than the number of I/O errors. 
1385 */ 1386 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1387 1388 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1389 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1390 goto complete; 1391 } 1392 1393 /* At this point we don't know whether the sequence was successfully executed or not, so we 1394 * cannot retry the IO */ 1395 if (bdev_io->u.bdev.accel_sequence != NULL) { 1396 goto complete; 1397 } 1398 1399 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1400 1401 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1402 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1403 return; 1404 } 1405 1406 complete: 1407 bio->retry_count = 0; 1408 bio->submit_tsc = 0; 1409 bdev_io->u.bdev.accel_sequence = NULL; 1410 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1411 } 1412 1413 static inline void 1414 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1415 { 1416 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1417 struct nvme_bdev_channel *nbdev_ch; 1418 enum spdk_bdev_io_status io_status; 1419 1420 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1421 1422 switch (rc) { 1423 case 0: 1424 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1425 break; 1426 case -ENOMEM: 1427 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1428 break; 1429 case -ENXIO: 1430 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1431 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1432 1433 bdev_nvme_clear_current_io_path(nbdev_ch); 1434 bio->io_path = NULL; 1435 1436 if (any_io_path_may_become_available(nbdev_ch)) { 1437 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1438 return; 1439 } 1440 } 1441 1442 /* fallthrough */ 1443 default: 1444 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1445 bdev_io->u.bdev.accel_sequence = NULL; 1446 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1447 break; 1448 } 1449 1450 bio->retry_count = 0; 1451 bio->submit_tsc = 0; 1452 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1453 } 1454 1455 static inline void 1456 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1457 { 1458 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1459 enum spdk_bdev_io_status io_status; 1460 1461 switch (rc) { 1462 case 0: 1463 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1464 break; 1465 case -ENOMEM: 1466 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1467 break; 1468 case -ENXIO: 1469 /* fallthrough */ 1470 default: 1471 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1472 break; 1473 } 1474 1475 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1476 } 1477 1478 static void 1479 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1480 { 1481 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1482 1483 pthread_mutex_lock(&nvme_ctrlr->mutex); 1484 1485 assert(nvme_ctrlr->io_path_cache_clearing == true); 1486 nvme_ctrlr->io_path_cache_clearing = false; 1487 1488 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1489 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1490 return; 1491 } 1492 1493 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1494 1495 nvme_ctrlr_unregister(nvme_ctrlr); 1496 } 1497 1498 static void 1499 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1500 { 1501 struct nvme_io_path *io_path; 1502 1503 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1504 if (io_path->nbdev_ch == NULL) { 1505 continue; 1506 } 1507 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1508 
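/* Sketch of the message-passing idiom used just below, not new driver code:
 * bdev_nvme_clear_io_path_caches() fans out over every channel of the controller with
 * spdk_for_each_channel(). The general shape of that idiom is:
 *
 *     static void per_channel(struct spdk_io_channel_iter *i)
 *     {
 *             struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
 *             // touch the channel's private state on its owning thread
 *             spdk_for_each_channel_continue(i, 0);      // 0 = success
 *     }
 *
 *     static void done(struct spdk_io_channel_iter *i, int status) { ... }
 *
 *     spdk_for_each_channel(io_device, per_channel, ctx, done);
 *
 * Each per-channel callback runs on the thread that owns that channel, which is why no
 * extra locking is needed around ctrlr_ch->qpair in bdev_nvme_clear_io_path_cache().
 */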
} 1509 } 1510 1511 static void 1512 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1513 { 1514 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1515 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1516 1517 assert(ctrlr_ch->qpair != NULL); 1518 1519 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1520 1521 spdk_for_each_channel_continue(i, 0); 1522 } 1523 1524 static void 1525 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1526 { 1527 pthread_mutex_lock(&nvme_ctrlr->mutex); 1528 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1529 nvme_ctrlr->io_path_cache_clearing) { 1530 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1531 return; 1532 } 1533 1534 nvme_ctrlr->io_path_cache_clearing = true; 1535 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1536 1537 spdk_for_each_channel(nvme_ctrlr, 1538 bdev_nvme_clear_io_path_cache, 1539 NULL, 1540 bdev_nvme_clear_io_path_caches_done); 1541 } 1542 1543 static struct nvme_qpair * 1544 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1545 { 1546 struct nvme_qpair *nvme_qpair; 1547 1548 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1549 if (nvme_qpair->qpair == qpair) { 1550 break; 1551 } 1552 } 1553 1554 return nvme_qpair; 1555 } 1556 1557 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1558 1559 static void 1560 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1561 { 1562 struct nvme_poll_group *group = poll_group_ctx; 1563 struct nvme_qpair *nvme_qpair; 1564 struct nvme_ctrlr_channel *ctrlr_ch; 1565 int status; 1566 1567 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1568 if (nvme_qpair == NULL) { 1569 return; 1570 } 1571 1572 if (nvme_qpair->qpair != NULL) { 1573 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1574 nvme_qpair->qpair = NULL; 1575 } 1576 1577 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1578 1579 ctrlr_ch = nvme_qpair->ctrlr_ch; 1580 1581 if (ctrlr_ch != NULL) { 1582 if (ctrlr_ch->reset_iter != NULL) { 1583 /* We are in a full reset sequence. */ 1584 if (ctrlr_ch->connect_poller != NULL) { 1585 /* qpair was failed to connect. Abort the reset sequence. */ 1586 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1587 qpair); 1588 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1589 status = -1; 1590 } else { 1591 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1592 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1593 qpair); 1594 status = 0; 1595 } 1596 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1597 ctrlr_ch->reset_iter = NULL; 1598 } else { 1599 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1600 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1601 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1602 } 1603 } else { 1604 /* In this case, ctrlr_channel is already deleted. */ 1605 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1606 nvme_qpair_delete(nvme_qpair); 1607 } 1608 } 1609 1610 static void 1611 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1612 { 1613 struct nvme_qpair *nvme_qpair; 1614 1615 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1616 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1617 continue; 1618 } 1619 1620 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1621 SPDK_NVME_QPAIR_FAILURE_NONE) { 1622 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1623 } 1624 } 1625 } 1626 1627 static int 1628 bdev_nvme_poll(void *arg) 1629 { 1630 struct nvme_poll_group *group = arg; 1631 int64_t num_completions; 1632 1633 if (group->collect_spin_stat && group->start_ticks == 0) { 1634 group->start_ticks = spdk_get_ticks(); 1635 } 1636 1637 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1638 bdev_nvme_disconnected_qpair_cb); 1639 if (group->collect_spin_stat) { 1640 if (num_completions > 0) { 1641 if (group->end_ticks != 0) { 1642 group->spin_ticks += (group->end_ticks - group->start_ticks); 1643 group->end_ticks = 0; 1644 } 1645 group->start_ticks = 0; 1646 } else { 1647 group->end_ticks = spdk_get_ticks(); 1648 } 1649 } 1650 1651 if (spdk_unlikely(num_completions < 0)) { 1652 bdev_nvme_check_io_qpairs(group); 1653 } 1654 1655 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1656 } 1657 1658 static int bdev_nvme_poll_adminq(void *arg); 1659 1660 static void 1661 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1662 { 1663 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1664 1665 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1666 nvme_ctrlr, new_period_us); 1667 } 1668 1669 static int 1670 bdev_nvme_poll_adminq(void *arg) 1671 { 1672 int32_t rc; 1673 struct nvme_ctrlr *nvme_ctrlr = arg; 1674 nvme_ctrlr_disconnected_cb disconnected_cb; 1675 1676 assert(nvme_ctrlr != NULL); 1677 1678 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1679 if (rc < 0) { 1680 disconnected_cb = nvme_ctrlr->disconnected_cb; 1681 nvme_ctrlr->disconnected_cb = NULL; 1682 1683 if (disconnected_cb != NULL) { 1684 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1685 g_opts.nvme_adminq_poll_period_us); 1686 disconnected_cb(nvme_ctrlr); 1687 } else { 1688 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1689 } 1690 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1691 SPDK_NVME_QPAIR_FAILURE_NONE) { 1692 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1693 } 1694 1695 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1696 } 1697 1698 static void 1699 nvme_bdev_free(void *io_device) 1700 { 1701 struct nvme_bdev *nvme_disk = io_device; 1702 1703 pthread_mutex_destroy(&nvme_disk->mutex); 1704 free(nvme_disk->disk.name); 1705 free(nvme_disk->err_stat); 1706 free(nvme_disk); 1707 } 1708 1709 static int 1710 bdev_nvme_destruct(void *ctx) 1711 { 1712 struct nvme_bdev *nvme_disk = ctx; 1713 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1714 1715 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1716 1717 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1718 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1719 1720 nvme_ns->bdev = NULL; 1721 1722 assert(nvme_ns->id > 0); 1723 1724 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1725 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1726 1727 nvme_ctrlr_release(nvme_ns->ctrlr); 1728 nvme_ns_free(nvme_ns); 1729 } else { 1730 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1731 } 1732 } 1733 1734 pthread_mutex_lock(&g_bdev_nvme_mutex); 1735 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1736 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1737 1738 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1739 1740 return 0; 1741 } 1742 1743 static int 1744 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1745 { 1746 struct nvme_ctrlr *nvme_ctrlr; 1747 struct spdk_nvme_io_qpair_opts opts; 1748 struct spdk_nvme_qpair *qpair; 1749 int rc; 1750 1751 nvme_ctrlr = nvme_qpair->ctrlr; 1752 1753 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1754 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1755 opts.create_only = true; 1756 opts.async_mode = true; 1757 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1758 g_opts.io_queue_requests = opts.io_queue_requests; 1759 1760 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1761 if (qpair == NULL) { 1762 return -1; 1763 } 1764 1765 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1766 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1767 1768 assert(nvme_qpair->group != NULL); 1769 1770 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1771 if (rc != 0) { 1772 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1773 goto err; 1774 } 1775 1776 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1777 if (rc != 0) { 1778 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1779 goto err; 1780 } 1781 1782 nvme_qpair->qpair = qpair; 1783 1784 if (!g_opts.disable_auto_failback) { 1785 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1786 } 1787 1788 return 0; 1789 1790 err: 1791 spdk_nvme_ctrlr_free_io_qpair(qpair); 1792 1793 return rc; 1794 } 1795 1796 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1797 1798 static void 1799 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1800 { 1801 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1802 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1803 int rc = 0; 1804 struct nvme_bdev_io *bio; 1805 1806 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1807 rc = -1; 1808 } 1809 1810 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1811 bio = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1812 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link); 1813 1814 bdev_nvme_reset_io_continue(bio, rc); 1815 } 1816 1817 spdk_for_each_channel_continue(i, 0); 1818 
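/* Sketch recapping bdev_nvme_create_qpair() above, not new driver code. Because
 * opts.create_only is set, allocation and connection are split so the qpair can join a
 * poll group in between (poll_group stands for the nvme_poll_group's underlying
 * spdk_nvme_poll_group, obtained elsewhere):
 *
 *     struct spdk_nvme_io_qpair_opts opts;
 *     struct spdk_nvme_qpair *qpair;
 *
 *     spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
 *     opts.create_only = true;
 *     opts.async_mode  = true;
 *     qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));   // 1. allocate
 *     spdk_nvme_poll_group_add(poll_group, qpair);                          // 2. add to group
 *     spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);                       // 3. connect
 *
 * Error handling is omitted here; the real function frees the qpair with
 * spdk_nvme_ctrlr_free_io_qpair() if step 2 or step 3 fails.
 */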
} 1819 1820 /* This function marks the current trid as failed by storing the current ticks 1821 * and then sets the next trid to the active trid within a controller if exists. 1822 * 1823 * The purpose of the boolean return value is to request the caller to disconnect 1824 * the current trid now to try connecting the next trid. 1825 */ 1826 static bool 1827 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1828 { 1829 struct nvme_path_id *path_id, *next_path; 1830 int rc __attribute__((unused)); 1831 1832 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1833 assert(path_id); 1834 assert(path_id == nvme_ctrlr->active_path_id); 1835 next_path = TAILQ_NEXT(path_id, link); 1836 1837 /* Update the last failed time. It means the trid is failed if its last 1838 * failed time is non-zero. 1839 */ 1840 path_id->last_failed_tsc = spdk_get_ticks(); 1841 1842 if (next_path == NULL) { 1843 /* There is no alternate trid within a controller. */ 1844 return false; 1845 } 1846 1847 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1848 /* Connect is not retried in a controller reset sequence. Connecting 1849 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1850 */ 1851 return false; 1852 } 1853 1854 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1855 1856 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1857 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1858 1859 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1860 nvme_ctrlr->active_path_id = next_path; 1861 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1862 assert(rc == 0); 1863 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1864 if (!remove) { 1865 /** Shuffle the old trid to the end of the list and use the new one. 1866 * Allows for round robin through multiple connections. 1867 */ 1868 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1869 } else { 1870 free(path_id); 1871 } 1872 1873 if (start || next_path->last_failed_tsc == 0) { 1874 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1875 * or used yet. Try the next trid now. 1876 */ 1877 return true; 1878 } 1879 1880 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1881 nvme_ctrlr->opts.reconnect_delay_sec) { 1882 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1883 return true; 1884 } 1885 1886 /* The next trid will be tried after reconnect_delay_sec seconds. 
*/ 1887 return false; 1888 } 1889 1890 static bool 1891 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1892 { 1893 int32_t elapsed; 1894 1895 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1896 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1897 return false; 1898 } 1899 1900 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1901 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1902 return true; 1903 } else { 1904 return false; 1905 } 1906 } 1907 1908 static bool 1909 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1910 { 1911 uint32_t elapsed; 1912 1913 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1914 return false; 1915 } 1916 1917 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1918 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1919 return true; 1920 } else { 1921 return false; 1922 } 1923 } 1924 1925 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1926 1927 static void 1928 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1929 { 1930 int rc; 1931 1932 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1933 if (rc != 0) { 1934 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1935 * fail the reset sequence immediately. 1936 */ 1937 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1938 return; 1939 } 1940 1941 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1942 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1943 */ 1944 assert(nvme_ctrlr->disconnected_cb == NULL); 1945 nvme_ctrlr->disconnected_cb = cb_fn; 1946 1947 /* During disconnection, reduce the period to poll adminq more often. */ 1948 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1949 } 1950 1951 enum bdev_nvme_op_after_reset { 1952 OP_NONE, 1953 OP_COMPLETE_PENDING_DESTRUCT, 1954 OP_DESTRUCT, 1955 OP_DELAYED_RECONNECT, 1956 OP_FAILOVER, 1957 }; 1958 1959 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1960 1961 static _bdev_nvme_op_after_reset 1962 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1963 { 1964 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1965 /* Complete pending destruct after reset completes. 
*/ 1966 return OP_COMPLETE_PENDING_DESTRUCT; 1967 } else if (nvme_ctrlr->pending_failover) { 1968 nvme_ctrlr->pending_failover = false; 1969 nvme_ctrlr->reset_start_tsc = 0; 1970 return OP_FAILOVER; 1971 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1972 nvme_ctrlr->reset_start_tsc = 0; 1973 return OP_NONE; 1974 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1975 return OP_DESTRUCT; 1976 } else { 1977 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1978 nvme_ctrlr->fast_io_fail_timedout = true; 1979 } 1980 return OP_DELAYED_RECONNECT; 1981 } 1982 } 1983 1984 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1985 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1986 1987 static int 1988 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1989 { 1990 struct nvme_ctrlr *nvme_ctrlr = ctx; 1991 1992 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1993 pthread_mutex_lock(&nvme_ctrlr->mutex); 1994 1995 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1996 1997 if (!nvme_ctrlr->reconnect_is_delayed) { 1998 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1999 return SPDK_POLLER_BUSY; 2000 } 2001 2002 nvme_ctrlr->reconnect_is_delayed = false; 2003 2004 if (nvme_ctrlr->destruct) { 2005 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2006 return SPDK_POLLER_BUSY; 2007 } 2008 2009 assert(nvme_ctrlr->resetting == false); 2010 nvme_ctrlr->resetting = true; 2011 2012 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2013 2014 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2015 2016 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2017 return SPDK_POLLER_BUSY; 2018 } 2019 2020 static void 2021 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2022 { 2023 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2024 2025 assert(nvme_ctrlr->reconnect_is_delayed == false); 2026 nvme_ctrlr->reconnect_is_delayed = true; 2027 2028 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2029 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2030 nvme_ctrlr, 2031 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2032 } 2033 2034 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2035 2036 static void 2037 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2038 { 2039 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2040 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2041 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2042 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2043 enum bdev_nvme_op_after_reset op_after_reset; 2044 2045 assert(nvme_ctrlr->thread == spdk_get_thread()); 2046 2047 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2048 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2049 2050 if (!success) { 2051 SPDK_ERRLOG("Resetting controller failed.\n"); 2052 } else { 2053 SPDK_NOTICELOG("Resetting controller successful.\n"); 2054 } 2055 2056 pthread_mutex_lock(&nvme_ctrlr->mutex); 2057 nvme_ctrlr->resetting = false; 2058 nvme_ctrlr->dont_retry = false; 2059 nvme_ctrlr->in_failover = false; 2060 2061 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2062 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2063 2064 /* Delay callbacks when the next operation is a failover. */ 2065 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2066 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2067 } 2068 2069 switch (op_after_reset) { 2070 case OP_COMPLETE_PENDING_DESTRUCT: 2071 nvme_ctrlr_unregister(nvme_ctrlr); 2072 break; 2073 case OP_DESTRUCT: 2074 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2075 remove_discovery_entry(nvme_ctrlr); 2076 break; 2077 case OP_DELAYED_RECONNECT: 2078 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2079 break; 2080 case OP_FAILOVER: 2081 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2082 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2083 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2084 break; 2085 default: 2086 break; 2087 } 2088 } 2089 2090 static void 2091 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2092 { 2093 pthread_mutex_lock(&nvme_ctrlr->mutex); 2094 if (!success) { 2095 /* Connecting the active trid failed. Set the next alternate trid to the 2096 * active trid if it exists. 2097 */ 2098 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2099 /* The next alternate trid exists and is ready to try. Try it now. */ 2100 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2101 2102 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2103 return; 2104 } 2105 2106 /* We came here if there is no alternate trid or if the next trid exists but 2107 * is not ready to try. We will try the active trid after reconnect_delay_sec 2108 * seconds if it is non-zero or at the next reset call otherwise. 2109 */ 2110 } else { 2111 /* Connecting the active trid succeeded. Clear the last failed time because it 2112 * means the trid is failed if its last failed time is non-zero. 2113 */ 2114 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2115 } 2116 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2117 2118 /* Make sure we clear any pending resets before returning. */ 2119 spdk_for_each_channel(nvme_ctrlr, 2120 bdev_nvme_complete_pending_resets, 2121 success ? NULL : (void *)0x1, 2122 _bdev_nvme_reset_ctrlr_complete); 2123 } 2124 2125 static void 2126 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2127 { 2128 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2129 2130 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2131 } 2132 2133 static void 2134 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2135 { 2136 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2137 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2138 struct nvme_qpair *nvme_qpair; 2139 2140 nvme_qpair = ctrlr_ch->qpair; 2141 assert(nvme_qpair != NULL); 2142 2143 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2144 2145 if (nvme_qpair->qpair != NULL) { 2146 if (nvme_qpair->ctrlr->dont_retry) { 2147 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2148 } 2149 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2150 2151 /* The current full reset sequence will move to the next 2152 * ctrlr_channel after the qpair is actually disconnected. 2153 */ 2154 assert(ctrlr_ch->reset_iter == NULL); 2155 ctrlr_ch->reset_iter = i; 2156 } else { 2157 spdk_for_each_channel_continue(i, 0); 2158 } 2159 } 2160 2161 static void 2162 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2163 { 2164 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2165 2166 if (status == 0) { 2167 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2168 } else { 2169 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
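 * Creating a qpair failed on at least one channel, so destroy whatever qpairs
 * were created and complete the reset sequence as failed.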
*/ 2170 spdk_for_each_channel(nvme_ctrlr, 2171 bdev_nvme_reset_destroy_qpair, 2172 NULL, 2173 bdev_nvme_reset_create_qpairs_failed); 2174 } 2175 } 2176 2177 static int 2178 bdev_nvme_reset_check_qpair_connected(void *ctx) 2179 { 2180 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2181 2182 if (ctrlr_ch->reset_iter == NULL) { 2183 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2184 assert(ctrlr_ch->connect_poller == NULL); 2185 assert(ctrlr_ch->qpair->qpair == NULL); 2186 return SPDK_POLLER_BUSY; 2187 } 2188 2189 assert(ctrlr_ch->qpair->qpair != NULL); 2190 2191 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2192 return SPDK_POLLER_BUSY; 2193 } 2194 2195 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2196 2197 /* The qpair has finished connecting. Move to the next ctrlr_channel. */ 2198 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2199 ctrlr_ch->reset_iter = NULL; 2200 2201 if (!g_opts.disable_auto_failback) { 2202 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2203 } 2204 2205 return SPDK_POLLER_BUSY; 2206 } 2207 2208 static void 2209 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2210 { 2211 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2212 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2213 int rc; 2214 2215 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2216 if (rc == 0) { 2217 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2218 ctrlr_ch, 0); 2219 2220 /* The current full reset sequence will move to the next 2221 * ctrlr_channel after the qpair is actually connected. 2222 */ 2223 assert(ctrlr_ch->reset_iter == NULL); 2224 ctrlr_ch->reset_iter = i; 2225 } else { 2226 spdk_for_each_channel_continue(i, rc); 2227 } 2228 } 2229 2230 static void 2231 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2232 { 2233 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2234 struct nvme_ns *nvme_ns; 2235 2236 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2237 nvme_ns != NULL; 2238 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2239 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2240 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2241 /* The NS can be added again. Just nullify nvme_ns->ns.
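 * The nvme_ns bookkeeping is kept so the namespace can be repopulated if it
 * reappears after the reset.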
*/ 2242 nvme_ns->ns = NULL; 2243 } 2244 } 2245 } 2246 2247 2248 static int 2249 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2250 { 2251 struct nvme_ctrlr *nvme_ctrlr = arg; 2252 int rc = -ETIMEDOUT; 2253 2254 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2255 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2256 if (rc == -EAGAIN) { 2257 return SPDK_POLLER_BUSY; 2258 } 2259 } 2260 2261 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2262 if (rc == 0) { 2263 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2264 2265 /* Recreate all of the I/O queue pairs */ 2266 spdk_for_each_channel(nvme_ctrlr, 2267 bdev_nvme_reset_create_qpair, 2268 NULL, 2269 bdev_nvme_reset_create_qpairs_done); 2270 } else { 2271 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2272 } 2273 return SPDK_POLLER_BUSY; 2274 } 2275 2276 static void 2277 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2278 { 2279 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2280 2281 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2282 assert(nvme_ctrlr->reset_detach_poller == NULL); 2283 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2284 nvme_ctrlr, 0); 2285 } 2286 2287 static void 2288 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2289 { 2290 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2291 2292 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2293 assert(status == 0); 2294 2295 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2296 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2297 } else { 2298 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2299 } 2300 } 2301 2302 static void 2303 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2304 { 2305 spdk_for_each_channel(nvme_ctrlr, 2306 bdev_nvme_reset_destroy_qpair, 2307 NULL, 2308 bdev_nvme_reset_destroy_qpair_done); 2309 } 2310 2311 static void 2312 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2313 { 2314 struct nvme_ctrlr *nvme_ctrlr = ctx; 2315 2316 assert(nvme_ctrlr->resetting == true); 2317 assert(nvme_ctrlr->thread == spdk_get_thread()); 2318 2319 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2320 2321 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2322 2323 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2324 } 2325 2326 static void 2327 _bdev_nvme_reset_ctrlr(void *ctx) 2328 { 2329 struct nvme_ctrlr *nvme_ctrlr = ctx; 2330 2331 assert(nvme_ctrlr->resetting == true); 2332 assert(nvme_ctrlr->thread == spdk_get_thread()); 2333 2334 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2335 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2336 } else { 2337 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2338 } 2339 } 2340 2341 static int 2342 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2343 { 2344 spdk_msg_fn msg_fn; 2345 2346 pthread_mutex_lock(&nvme_ctrlr->mutex); 2347 if (nvme_ctrlr->destruct) { 2348 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2349 return -ENXIO; 2350 } 2351 2352 if (nvme_ctrlr->resetting) { 2353 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2354 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2355 return -EBUSY; 2356 } 2357 2358 if (nvme_ctrlr->disabled) { 2359 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2360 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2361 return -EALREADY; 2362 } 2363 2364 nvme_ctrlr->resetting = true; 2365 nvme_ctrlr->dont_retry = true; 2366 2367 if (nvme_ctrlr->reconnect_is_delayed) { 2368 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2369 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2370 nvme_ctrlr->reconnect_is_delayed = false; 2371 } else { 2372 msg_fn = _bdev_nvme_reset_ctrlr; 2373 assert(nvme_ctrlr->reset_start_tsc == 0); 2374 } 2375 2376 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2377 2378 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2379 2380 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2381 return 0; 2382 } 2383 2384 static int 2385 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2386 { 2387 pthread_mutex_lock(&nvme_ctrlr->mutex); 2388 if (nvme_ctrlr->destruct) { 2389 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2390 return -ENXIO; 2391 } 2392 2393 if (nvme_ctrlr->resetting) { 2394 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2395 return -EBUSY; 2396 } 2397 2398 if (!nvme_ctrlr->disabled) { 2399 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2400 return -EALREADY; 2401 } 2402 2403 nvme_ctrlr->disabled = false; 2404 nvme_ctrlr->resetting = true; 2405 2406 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2407 2408 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2409 2410 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2411 return 0; 2412 } 2413 2414 static void 2415 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2416 { 2417 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2418 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2419 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2420 enum bdev_nvme_op_after_reset op_after_disable; 2421 2422 assert(nvme_ctrlr->thread == spdk_get_thread()); 2423 2424 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2425 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2426 2427 pthread_mutex_lock(&nvme_ctrlr->mutex); 2428 2429 nvme_ctrlr->resetting = false; 2430 nvme_ctrlr->dont_retry = false; 2431 2432 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2433 2434 nvme_ctrlr->disabled = true; 2435 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2436 2437 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2438 2439 if (ctrlr_op_cb_fn) { 2440 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2441 } 2442 2443 switch (op_after_disable) { 2444 case OP_COMPLETE_PENDING_DESTRUCT: 2445 nvme_ctrlr_unregister(nvme_ctrlr); 2446 break; 2447 default: 2448 break; 2449 } 2450 2451 } 2452 2453 static void 2454 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2455 { 2456 /* Make sure we clear any pending resets before returning. 
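 * bdev_nvme_complete_pending_resets() runs on every ctrlr channel first; the final
 * callback, _bdev_nvme_disable_ctrlr_complete(), then marks the controller disabled.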
*/ 2457 spdk_for_each_channel(nvme_ctrlr, 2458 bdev_nvme_complete_pending_resets, 2459 NULL, 2460 _bdev_nvme_disable_ctrlr_complete); 2461 } 2462 2463 static void 2464 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2465 { 2466 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2467 2468 assert(status == 0); 2469 2470 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2471 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2472 } else { 2473 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2474 } 2475 } 2476 2477 static void 2478 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2479 { 2480 spdk_for_each_channel(nvme_ctrlr, 2481 bdev_nvme_reset_destroy_qpair, 2482 NULL, 2483 bdev_nvme_disable_destroy_qpairs_done); 2484 } 2485 2486 static void 2487 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2488 { 2489 struct nvme_ctrlr *nvme_ctrlr = ctx; 2490 2491 assert(nvme_ctrlr->resetting == true); 2492 assert(nvme_ctrlr->thread == spdk_get_thread()); 2493 2494 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2495 2496 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2497 } 2498 2499 static void 2500 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2501 { 2502 struct nvme_ctrlr *nvme_ctrlr = ctx; 2503 2504 assert(nvme_ctrlr->resetting == true); 2505 assert(nvme_ctrlr->thread == spdk_get_thread()); 2506 2507 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2508 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2509 } else { 2510 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2511 } 2512 } 2513 2514 static int 2515 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2516 { 2517 spdk_msg_fn msg_fn; 2518 2519 pthread_mutex_lock(&nvme_ctrlr->mutex); 2520 if (nvme_ctrlr->destruct) { 2521 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2522 return -ENXIO; 2523 } 2524 2525 if (nvme_ctrlr->resetting) { 2526 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2527 return -EBUSY; 2528 } 2529 2530 if (nvme_ctrlr->disabled) { 2531 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2532 return -EALREADY; 2533 } 2534 2535 nvme_ctrlr->resetting = true; 2536 nvme_ctrlr->dont_retry = true; 2537 2538 if (nvme_ctrlr->reconnect_is_delayed) { 2539 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2540 nvme_ctrlr->reconnect_is_delayed = false; 2541 } else { 2542 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2543 } 2544 2545 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2546 2547 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2548 2549 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2550 return 0; 2551 } 2552 2553 static int 2554 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2555 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2556 { 2557 int rc; 2558 2559 switch (op) { 2560 case NVME_CTRLR_OP_RESET: 2561 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2562 break; 2563 case NVME_CTRLR_OP_ENABLE: 2564 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2565 break; 2566 case NVME_CTRLR_OP_DISABLE: 2567 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2568 break; 2569 default: 2570 rc = -EINVAL; 2571 break; 2572 } 2573 2574 if (rc == 0) { 2575 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2576 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2577 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2578 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2579 } 2580 return rc; 2581 } 2582 2583 struct nvme_ctrlr_op_rpc_ctx { 2584 struct nvme_ctrlr *nvme_ctrlr; 2585 struct spdk_thread *orig_thread; 2586 enum nvme_ctrlr_op op; 2587 int rc; 2588 
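/* Caller-provided completion callback and argument; invoked on orig_thread
 * once the requested controller operation(s) finish.
 */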
bdev_nvme_ctrlr_op_cb cb_fn; 2589 void *cb_arg; 2590 }; 2591 2592 static void 2593 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2594 { 2595 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2596 2597 assert(ctx != NULL); 2598 assert(ctx->cb_fn != NULL); 2599 2600 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2601 2602 free(ctx); 2603 } 2604 2605 static void 2606 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2607 { 2608 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2609 2610 ctx->rc = rc; 2611 2612 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2613 } 2614 2615 void 2616 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2617 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2618 { 2619 struct nvme_ctrlr_op_rpc_ctx *ctx; 2620 int rc; 2621 2622 assert(cb_fn != NULL); 2623 2624 ctx = calloc(1, sizeof(*ctx)); 2625 if (ctx == NULL) { 2626 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2627 cb_fn(cb_arg, -ENOMEM); 2628 return; 2629 } 2630 2631 ctx->orig_thread = spdk_get_thread(); 2632 ctx->cb_fn = cb_fn; 2633 ctx->cb_arg = cb_arg; 2634 2635 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2636 if (rc == 0) { 2637 return; 2638 } else if (rc == -EALREADY) { 2639 rc = 0; 2640 } 2641 2642 nvme_ctrlr_op_rpc_complete(ctx, rc); 2643 } 2644 2645 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2646 2647 static void 2648 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2649 { 2650 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2651 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2652 int rc; 2653 2654 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2655 ctx->nvme_ctrlr = NULL; 2656 2657 if (ctx->rc != 0) { 2658 goto complete; 2659 } 2660 2661 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2662 if (next_nvme_ctrlr == NULL) { 2663 goto complete; 2664 } 2665 2666 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2667 if (rc == 0) { 2668 ctx->nvme_ctrlr = next_nvme_ctrlr; 2669 return; 2670 } else if (rc == -EALREADY) { 2671 ctx->nvme_ctrlr = next_nvme_ctrlr; 2672 rc = 0; 2673 } 2674 2675 ctx->rc = rc; 2676 2677 complete: 2678 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2679 free(ctx); 2680 } 2681 2682 static void 2683 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2684 { 2685 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2686 2687 ctx->rc = rc; 2688 2689 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2690 } 2691 2692 void 2693 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2694 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2695 { 2696 struct nvme_ctrlr_op_rpc_ctx *ctx; 2697 struct nvme_ctrlr *nvme_ctrlr; 2698 int rc; 2699 2700 assert(cb_fn != NULL); 2701 2702 ctx = calloc(1, sizeof(*ctx)); 2703 if (ctx == NULL) { 2704 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2705 cb_fn(cb_arg, -ENOMEM); 2706 return; 2707 } 2708 2709 ctx->orig_thread = spdk_get_thread(); 2710 ctx->op = op; 2711 ctx->cb_fn = cb_fn; 2712 ctx->cb_arg = cb_arg; 2713 2714 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2715 assert(nvme_ctrlr != NULL); 2716 2717 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2718 if (rc == 0) { 2719 ctx->nvme_ctrlr = nvme_ctrlr; 2720 return; 2721 } else if (rc == -EALREADY) { 2722 ctx->nvme_ctrlr = nvme_ctrlr; 2723 rc = 0; 2724 } 2725 2726 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2727 } 2728 2729 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2730 2731 static void 2732 
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2733 { 2734 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2735 enum spdk_bdev_io_status io_status; 2736 2737 if (bio->cpl.cdw0 == 0) { 2738 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2739 } else { 2740 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2741 } 2742 2743 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2744 } 2745 2746 static void 2747 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2748 { 2749 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2750 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2751 2752 bdev_nvme_abort_retry_ios(nbdev_ch); 2753 2754 spdk_for_each_channel_continue(i, 0); 2755 } 2756 2757 static void 2758 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2759 { 2760 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2761 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2762 2763 /* Abort all queued I/Os for retry. */ 2764 spdk_for_each_channel(nbdev, 2765 bdev_nvme_abort_bdev_channel, 2766 bio, 2767 _bdev_nvme_reset_io_complete); 2768 } 2769 2770 static void 2771 _bdev_nvme_reset_io_continue(void *ctx) 2772 { 2773 struct nvme_bdev_io *bio = ctx; 2774 struct nvme_io_path *prev_io_path, *next_io_path; 2775 int rc; 2776 2777 prev_io_path = bio->io_path; 2778 bio->io_path = NULL; 2779 2780 if (bio->cpl.cdw0 != 0) { 2781 goto complete; 2782 } 2783 2784 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2785 if (next_io_path == NULL) { 2786 goto complete; 2787 } 2788 2789 rc = _bdev_nvme_reset_io(next_io_path, bio); 2790 if (rc == 0) { 2791 return; 2792 } 2793 2794 bio->cpl.cdw0 = 1; 2795 2796 complete: 2797 bdev_nvme_reset_io_complete(bio); 2798 } 2799 2800 static void 2801 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2802 { 2803 struct nvme_bdev_io *bio = cb_arg; 2804 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2805 2806 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2807 2808 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2809 } 2810 2811 static int 2812 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2813 { 2814 struct nvme_ctrlr_channel *ctrlr_ch; 2815 int rc; 2816 2817 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2818 bdev_nvme_reset_io_continue, bio); 2819 if (rc != 0 && rc != -EBUSY) { 2820 return rc; 2821 } 2822 2823 assert(bio->io_path == NULL); 2824 bio->io_path = io_path; 2825 2826 if (rc == -EBUSY) { 2827 ctrlr_ch = io_path->qpair->ctrlr_ch; 2828 assert(ctrlr_ch != NULL); 2829 /* 2830 * Reset call is queued only if it is from the app framework. This is on purpose so that 2831 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2832 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2833 */ 2834 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2835 } 2836 2837 return 0; 2838 } 2839 2840 static void 2841 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2842 { 2843 struct nvme_io_path *io_path; 2844 int rc; 2845 2846 bio->cpl.cdw0 = 0; 2847 2848 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2849 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2850 assert(io_path != NULL); 2851 2852 rc = _bdev_nvme_reset_io(io_path, bio); 2853 if (rc != 0) { 2854 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. 
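 * nvme_ctrlr_op() returns -EALREADY in that case; it is converted to success so
 * that bdev_nvme_reset_io_continue() advances to the next io_path.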
*/ 2855 rc = (rc == -EALREADY) ? 0 : rc; 2856 2857 bdev_nvme_reset_io_continue(bio, rc); 2858 } 2859 } 2860 2861 static int 2862 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2863 { 2864 if (nvme_ctrlr->destruct) { 2865 /* Don't bother resetting if the controller is in the process of being destructed. */ 2866 return -ENXIO; 2867 } 2868 2869 if (nvme_ctrlr->resetting) { 2870 if (!nvme_ctrlr->in_failover) { 2871 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2872 2873 /* Defer failover until reset completes. */ 2874 nvme_ctrlr->pending_failover = true; 2875 return -EINPROGRESS; 2876 } else { 2877 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2878 return -EBUSY; 2879 } 2880 } 2881 2882 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2883 2884 if (nvme_ctrlr->reconnect_is_delayed) { 2885 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2886 2887 /* We rely on the next reconnect for the failover. */ 2888 return -EALREADY; 2889 } 2890 2891 if (nvme_ctrlr->disabled) { 2892 SPDK_NOTICELOG("Controller is disabled.\n"); 2893 2894 /* We rely on the enablement for the failover. */ 2895 return -EALREADY; 2896 } 2897 2898 nvme_ctrlr->resetting = true; 2899 nvme_ctrlr->in_failover = true; 2900 2901 assert(nvme_ctrlr->reset_start_tsc == 0); 2902 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2903 2904 return 0; 2905 } 2906 2907 static int 2908 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2909 { 2910 int rc; 2911 2912 pthread_mutex_lock(&nvme_ctrlr->mutex); 2913 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2914 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2915 2916 if (rc == 0) { 2917 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2918 } else if (rc == -EALREADY) { 2919 rc = 0; 2920 } 2921 2922 return rc; 2923 } 2924 2925 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2926 uint64_t num_blocks); 2927 2928 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2929 uint64_t num_blocks); 2930 2931 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2932 uint64_t src_offset_blocks, 2933 uint64_t num_blocks); 2934 2935 static void 2936 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2937 bool success) 2938 { 2939 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2940 int ret; 2941 2942 if (!success) { 2943 ret = -EINVAL; 2944 goto exit; 2945 } 2946 2947 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2948 ret = -ENXIO; 2949 goto exit; 2950 } 2951 2952 ret = bdev_nvme_readv(bio, 2953 bdev_io->u.bdev.iovs, 2954 bdev_io->u.bdev.iovcnt, 2955 bdev_io->u.bdev.md_buf, 2956 bdev_io->u.bdev.num_blocks, 2957 bdev_io->u.bdev.offset_blocks, 2958 bdev_io->u.bdev.dif_check_flags, 2959 bdev_io->u.bdev.memory_domain, 2960 bdev_io->u.bdev.memory_domain_ctx, 2961 bdev_io->u.bdev.accel_sequence); 2962 2963 exit: 2964 if (spdk_unlikely(ret != 0)) { 2965 bdev_nvme_io_complete(bio, ret); 2966 } 2967 } 2968 2969 static inline void 2970 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2971 { 2972 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2973 struct spdk_bdev *bdev = bdev_io->bdev; 2974 struct nvme_bdev_io *nbdev_io_to_abort; 2975 int rc = 0; 2976 2977 switch (bdev_io->type) { 2978 case SPDK_BDEV_IO_TYPE_READ: 2979 if (bdev_io->u.bdev.iovs && 
bdev_io->u.bdev.iovs[0].iov_base) { 2980 2981 rc = bdev_nvme_readv(nbdev_io, 2982 bdev_io->u.bdev.iovs, 2983 bdev_io->u.bdev.iovcnt, 2984 bdev_io->u.bdev.md_buf, 2985 bdev_io->u.bdev.num_blocks, 2986 bdev_io->u.bdev.offset_blocks, 2987 bdev_io->u.bdev.dif_check_flags, 2988 bdev_io->u.bdev.memory_domain, 2989 bdev_io->u.bdev.memory_domain_ctx, 2990 bdev_io->u.bdev.accel_sequence); 2991 } else { 2992 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2993 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2994 rc = 0; 2995 } 2996 break; 2997 case SPDK_BDEV_IO_TYPE_WRITE: 2998 rc = bdev_nvme_writev(nbdev_io, 2999 bdev_io->u.bdev.iovs, 3000 bdev_io->u.bdev.iovcnt, 3001 bdev_io->u.bdev.md_buf, 3002 bdev_io->u.bdev.num_blocks, 3003 bdev_io->u.bdev.offset_blocks, 3004 bdev_io->u.bdev.dif_check_flags, 3005 bdev_io->u.bdev.memory_domain, 3006 bdev_io->u.bdev.memory_domain_ctx, 3007 bdev_io->u.bdev.accel_sequence); 3008 break; 3009 case SPDK_BDEV_IO_TYPE_COMPARE: 3010 rc = bdev_nvme_comparev(nbdev_io, 3011 bdev_io->u.bdev.iovs, 3012 bdev_io->u.bdev.iovcnt, 3013 bdev_io->u.bdev.md_buf, 3014 bdev_io->u.bdev.num_blocks, 3015 bdev_io->u.bdev.offset_blocks, 3016 bdev_io->u.bdev.dif_check_flags); 3017 break; 3018 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3019 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3020 bdev_io->u.bdev.iovs, 3021 bdev_io->u.bdev.iovcnt, 3022 bdev_io->u.bdev.fused_iovs, 3023 bdev_io->u.bdev.fused_iovcnt, 3024 bdev_io->u.bdev.md_buf, 3025 bdev_io->u.bdev.num_blocks, 3026 bdev_io->u.bdev.offset_blocks, 3027 bdev_io->u.bdev.dif_check_flags); 3028 break; 3029 case SPDK_BDEV_IO_TYPE_UNMAP: 3030 rc = bdev_nvme_unmap(nbdev_io, 3031 bdev_io->u.bdev.offset_blocks, 3032 bdev_io->u.bdev.num_blocks); 3033 break; 3034 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3035 rc = bdev_nvme_write_zeroes(nbdev_io, 3036 bdev_io->u.bdev.offset_blocks, 3037 bdev_io->u.bdev.num_blocks); 3038 break; 3039 case SPDK_BDEV_IO_TYPE_RESET: 3040 nbdev_io->io_path = NULL; 3041 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3042 return; 3043 3044 case SPDK_BDEV_IO_TYPE_FLUSH: 3045 bdev_nvme_io_complete(nbdev_io, 0); 3046 return; 3047 3048 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3049 rc = bdev_nvme_zone_appendv(nbdev_io, 3050 bdev_io->u.bdev.iovs, 3051 bdev_io->u.bdev.iovcnt, 3052 bdev_io->u.bdev.md_buf, 3053 bdev_io->u.bdev.num_blocks, 3054 bdev_io->u.bdev.offset_blocks, 3055 bdev_io->u.bdev.dif_check_flags); 3056 break; 3057 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3058 rc = bdev_nvme_get_zone_info(nbdev_io, 3059 bdev_io->u.zone_mgmt.zone_id, 3060 bdev_io->u.zone_mgmt.num_zones, 3061 bdev_io->u.zone_mgmt.buf); 3062 break; 3063 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3064 rc = bdev_nvme_zone_management(nbdev_io, 3065 bdev_io->u.zone_mgmt.zone_id, 3066 bdev_io->u.zone_mgmt.zone_action); 3067 break; 3068 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3069 nbdev_io->io_path = NULL; 3070 bdev_nvme_admin_passthru(nbdev_ch, 3071 nbdev_io, 3072 &bdev_io->u.nvme_passthru.cmd, 3073 bdev_io->u.nvme_passthru.buf, 3074 bdev_io->u.nvme_passthru.nbytes); 3075 return; 3076 3077 case SPDK_BDEV_IO_TYPE_NVME_IO: 3078 rc = bdev_nvme_io_passthru(nbdev_io, 3079 &bdev_io->u.nvme_passthru.cmd, 3080 bdev_io->u.nvme_passthru.buf, 3081 bdev_io->u.nvme_passthru.nbytes); 3082 break; 3083 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3084 rc = bdev_nvme_io_passthru_md(nbdev_io, 3085 &bdev_io->u.nvme_passthru.cmd, 3086 bdev_io->u.nvme_passthru.buf, 3087 bdev_io->u.nvme_passthru.nbytes, 3088 bdev_io->u.nvme_passthru.md_buf, 3089 bdev_io->u.nvme_passthru.md_len); 3090 break; 3091 case 
SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3092 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3093 &bdev_io->u.nvme_passthru.cmd, 3094 bdev_io->u.nvme_passthru.iovs, 3095 bdev_io->u.nvme_passthru.iovcnt, 3096 bdev_io->u.nvme_passthru.nbytes, 3097 bdev_io->u.nvme_passthru.md_buf, 3098 bdev_io->u.nvme_passthru.md_len); 3099 break; 3100 case SPDK_BDEV_IO_TYPE_ABORT: 3101 nbdev_io->io_path = NULL; 3102 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3103 bdev_nvme_abort(nbdev_ch, 3104 nbdev_io, 3105 nbdev_io_to_abort); 3106 return; 3107 3108 case SPDK_BDEV_IO_TYPE_COPY: 3109 rc = bdev_nvme_copy(nbdev_io, 3110 bdev_io->u.bdev.offset_blocks, 3111 bdev_io->u.bdev.copy.src_offset_blocks, 3112 bdev_io->u.bdev.num_blocks); 3113 break; 3114 default: 3115 rc = -EINVAL; 3116 break; 3117 } 3118 3119 if (spdk_unlikely(rc != 0)) { 3120 bdev_nvme_io_complete(nbdev_io, rc); 3121 } 3122 } 3123 3124 static void 3125 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3126 { 3127 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3128 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3129 3130 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3131 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3132 } else { 3133 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3134 * We need to update submit_tsc here. 3135 */ 3136 nbdev_io->submit_tsc = spdk_get_ticks(); 3137 } 3138 3139 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3140 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3141 if (spdk_unlikely(!nbdev_io->io_path)) { 3142 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3143 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3144 return; 3145 } 3146 3147 /* Admin commands do not use the optimal I/O path. 3148 * Simply fall through even if it is not found. 3149 */ 3150 } 3151 3152 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3153 } 3154 3155 static bool 3156 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3157 { 3158 struct nvme_bdev *nbdev = ctx; 3159 struct nvme_ns *nvme_ns; 3160 struct spdk_nvme_ns *ns; 3161 struct spdk_nvme_ctrlr *ctrlr; 3162 const struct spdk_nvme_ctrlr_data *cdata; 3163 3164 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3165 assert(nvme_ns != NULL); 3166 ns = nvme_ns->ns; 3167 if (ns == NULL) { 3168 return false; 3169 } 3170 3171 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3172 3173 switch (io_type) { 3174 case SPDK_BDEV_IO_TYPE_READ: 3175 case SPDK_BDEV_IO_TYPE_WRITE: 3176 case SPDK_BDEV_IO_TYPE_RESET: 3177 case SPDK_BDEV_IO_TYPE_FLUSH: 3178 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3179 case SPDK_BDEV_IO_TYPE_NVME_IO: 3180 case SPDK_BDEV_IO_TYPE_ABORT: 3181 return true; 3182 3183 case SPDK_BDEV_IO_TYPE_COMPARE: 3184 return spdk_nvme_ns_supports_compare(ns); 3185 3186 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3187 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3188 3189 case SPDK_BDEV_IO_TYPE_UNMAP: 3190 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3191 return cdata->oncs.dsm; 3192 3193 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3194 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3195 return cdata->oncs.write_zeroes; 3196 3197 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3198 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3199 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3200 return true; 3201 } 3202 return false; 3203 3204 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3205 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3206 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3207 3208 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3209 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3210 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3211 3212 case SPDK_BDEV_IO_TYPE_COPY: 3213 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3214 return cdata->oncs.copy; 3215 3216 default: 3217 return false; 3218 } 3219 } 3220 3221 static int 3222 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3223 { 3224 struct nvme_qpair *nvme_qpair; 3225 struct spdk_io_channel *pg_ch; 3226 int rc; 3227 3228 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3229 if (!nvme_qpair) { 3230 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3231 return -1; 3232 } 3233 3234 TAILQ_INIT(&nvme_qpair->io_path_list); 3235 3236 nvme_qpair->ctrlr = nvme_ctrlr; 3237 nvme_qpair->ctrlr_ch = ctrlr_ch; 3238 3239 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3240 if (!pg_ch) { 3241 free(nvme_qpair); 3242 return -1; 3243 } 3244 3245 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3246 3247 #ifdef SPDK_CONFIG_VTUNE 3248 nvme_qpair->group->collect_spin_stat = true; 3249 #else 3250 nvme_qpair->group->collect_spin_stat = false; 3251 #endif 3252 3253 if (!nvme_ctrlr->disabled) { 3254 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3255 * be created when it's enabled. 3256 */ 3257 rc = bdev_nvme_create_qpair(nvme_qpair); 3258 if (rc != 0) { 3259 /* nvme_ctrlr can't create IO qpair if connection is down. 3260 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3261 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3262 * submitted IO will be queued until IO qpair is successfully created. 3263 * 3264 * Hence, if both are satisfied, ignore the failure. 
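 * Otherwise (either option is zero), propagate the error and fail creation of
 * this ctrlr channel.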
3265 */ 3266 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3267 spdk_put_io_channel(pg_ch); 3268 free(nvme_qpair); 3269 return rc; 3270 } 3271 } 3272 } 3273 3274 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3275 3276 ctrlr_ch->qpair = nvme_qpair; 3277 3278 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3279 nvme_qpair->ctrlr->ref++; 3280 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3281 3282 return 0; 3283 } 3284 3285 static int 3286 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3287 { 3288 struct nvme_ctrlr *nvme_ctrlr = io_device; 3289 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3290 3291 TAILQ_INIT(&ctrlr_ch->pending_resets); 3292 3293 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3294 } 3295 3296 static void 3297 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3298 { 3299 struct nvme_io_path *io_path, *next; 3300 3301 assert(nvme_qpair->group != NULL); 3302 3303 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3304 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3305 nvme_io_path_free(io_path); 3306 } 3307 3308 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3309 3310 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3311 3312 nvme_ctrlr_release(nvme_qpair->ctrlr); 3313 3314 free(nvme_qpair); 3315 } 3316 3317 static void 3318 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3319 { 3320 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3321 struct nvme_qpair *nvme_qpair; 3322 3323 nvme_qpair = ctrlr_ch->qpair; 3324 assert(nvme_qpair != NULL); 3325 3326 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3327 3328 if (nvme_qpair->qpair != NULL) { 3329 if (ctrlr_ch->reset_iter == NULL) { 3330 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3331 } else { 3332 /* Skip current ctrlr_channel in a full reset sequence because 3333 * it is being deleted now. The qpair is already being disconnected. 3334 * We do not have to restart disconnecting it. 3335 */ 3336 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3337 } 3338 3339 /* We cannot release a reference to the poll group now. 3340 * The qpair may be disconnected asynchronously later. 3341 * We need to poll it until it is actually disconnected. 3342 * Just detach the qpair from the deleting ctrlr_channel. 
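 * With ctrlr_ch set to NULL the qpair is no longer tied to this channel and is
 * freed once the disconnect actually completes.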
3343 */ 3344 nvme_qpair->ctrlr_ch = NULL; 3345 } else { 3346 assert(ctrlr_ch->reset_iter == NULL); 3347 3348 nvme_qpair_delete(nvme_qpair); 3349 } 3350 } 3351 3352 static inline struct spdk_io_channel * 3353 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3354 { 3355 if (spdk_unlikely(!group->accel_channel)) { 3356 group->accel_channel = spdk_accel_get_io_channel(); 3357 if (!group->accel_channel) { 3358 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3359 group); 3360 return NULL; 3361 } 3362 } 3363 3364 return group->accel_channel; 3365 } 3366 3367 static void 3368 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3369 uint32_t iov_cnt, uint32_t seed, 3370 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3371 { 3372 struct spdk_io_channel *accel_ch; 3373 struct nvme_poll_group *group = ctx; 3374 int rc; 3375 3376 assert(cb_fn != NULL); 3377 3378 accel_ch = bdev_nvme_get_accel_channel(group); 3379 if (spdk_unlikely(accel_ch == NULL)) { 3380 cb_fn(cb_arg, -ENOMEM); 3381 return; 3382 } 3383 3384 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3385 if (rc) { 3386 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3387 if (rc == -ENOMEM || rc == -EINVAL) { 3388 cb_fn(cb_arg, rc); 3389 } 3390 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3391 } 3392 } 3393 3394 static void 3395 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3396 { 3397 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3398 } 3399 3400 static void 3401 bdev_nvme_abort_sequence(void *seq) 3402 { 3403 spdk_accel_sequence_abort(seq); 3404 } 3405 3406 static void 3407 bdev_nvme_reverse_sequence(void *seq) 3408 { 3409 spdk_accel_sequence_reverse(seq); 3410 } 3411 3412 static int 3413 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3414 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3415 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3416 { 3417 struct spdk_io_channel *ch; 3418 struct nvme_poll_group *group = ctx; 3419 3420 ch = bdev_nvme_get_accel_channel(group); 3421 if (spdk_unlikely(ch == NULL)) { 3422 return -ENOMEM; 3423 } 3424 3425 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3426 domain, domain_ctx, seed, cb_fn, cb_arg); 3427 } 3428 3429 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3430 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3431 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3432 .append_crc32c = bdev_nvme_append_crc32c, 3433 .finish_sequence = bdev_nvme_finish_sequence, 3434 .reverse_sequence = bdev_nvme_reverse_sequence, 3435 .abort_sequence = bdev_nvme_abort_sequence, 3436 }; 3437 3438 static int 3439 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3440 { 3441 struct nvme_poll_group *group = ctx_buf; 3442 3443 TAILQ_INIT(&group->qpair_list); 3444 3445 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3446 if (group->group == NULL) { 3447 return -1; 3448 } 3449 3450 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3451 3452 if (group->poller == NULL) { 3453 spdk_nvme_poll_group_destroy(group->group); 3454 return -1; 3455 } 3456 3457 return 0; 3458 } 3459 3460 static void 3461 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3462 { 3463 struct 
nvme_poll_group *group = ctx_buf; 3464 3465 assert(TAILQ_EMPTY(&group->qpair_list)); 3466 3467 if (group->accel_channel) { 3468 spdk_put_io_channel(group->accel_channel); 3469 } 3470 3471 spdk_poller_unregister(&group->poller); 3472 if (spdk_nvme_poll_group_destroy(group->group)) { 3473 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3474 assert(false); 3475 } 3476 } 3477 3478 static struct spdk_io_channel * 3479 bdev_nvme_get_io_channel(void *ctx) 3480 { 3481 struct nvme_bdev *nvme_bdev = ctx; 3482 3483 return spdk_get_io_channel(nvme_bdev); 3484 } 3485 3486 static void * 3487 bdev_nvme_get_module_ctx(void *ctx) 3488 { 3489 struct nvme_bdev *nvme_bdev = ctx; 3490 struct nvme_ns *nvme_ns; 3491 3492 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3493 return NULL; 3494 } 3495 3496 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3497 if (!nvme_ns) { 3498 return NULL; 3499 } 3500 3501 return nvme_ns->ns; 3502 } 3503 3504 static const char * 3505 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3506 { 3507 switch (ana_state) { 3508 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3509 return "optimized"; 3510 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3511 return "non_optimized"; 3512 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3513 return "inaccessible"; 3514 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3515 return "persistent_loss"; 3516 case SPDK_NVME_ANA_CHANGE_STATE: 3517 return "change"; 3518 default: 3519 return NULL; 3520 } 3521 } 3522 3523 static int 3524 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3525 { 3526 struct spdk_memory_domain **_domains = NULL; 3527 struct nvme_bdev *nbdev = ctx; 3528 struct nvme_ns *nvme_ns; 3529 int i = 0, _array_size = array_size; 3530 int rc = 0; 3531 3532 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3533 if (domains && array_size >= i) { 3534 _domains = &domains[i]; 3535 } else { 3536 _domains = NULL; 3537 } 3538 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3539 if (rc > 0) { 3540 i += rc; 3541 if (_array_size >= rc) { 3542 _array_size -= rc; 3543 } else { 3544 _array_size = 0; 3545 } 3546 } else if (rc < 0) { 3547 return rc; 3548 } 3549 } 3550 3551 return i; 3552 } 3553 3554 static const char * 3555 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3556 { 3557 if (nvme_ctrlr->destruct) { 3558 return "deleting"; 3559 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3560 return "failed"; 3561 } else if (nvme_ctrlr->resetting) { 3562 return "resetting"; 3563 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3564 return "reconnect_is_delayed"; 3565 } else if (nvme_ctrlr->disabled) { 3566 return "disabled"; 3567 } else { 3568 return "enabled"; 3569 } 3570 } 3571 3572 void 3573 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3574 { 3575 struct spdk_nvme_transport_id *trid; 3576 const struct spdk_nvme_ctrlr_opts *opts; 3577 const struct spdk_nvme_ctrlr_data *cdata; 3578 struct nvme_path_id *path_id; 3579 3580 spdk_json_write_object_begin(w); 3581 3582 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3583 3584 #ifdef SPDK_CONFIG_NVME_CUSE 3585 size_t cuse_name_size = 128; 3586 char cuse_name[cuse_name_size]; 3587 3588 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3589 if (rc == 0) { 3590 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3591 } 3592 #endif 3593 trid = &nvme_ctrlr->active_path_id->trid; 3594 
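/* The active path is written first as "trid"; any remaining path IDs follow
 * under "alternate_trids". Illustrative, abridged output (example values only;
 * the exact keys come from nvme_bdev_dump_trid_json()):
 *   "trid": { "trtype": "TCP", "adrfam": "IPv4", "traddr": "10.0.0.1", "trsvcid": "4420" },
 *   "alternate_trids": [ { "trtype": "TCP", "traddr": "10.0.0.2", ... } ]
 */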
spdk_json_write_named_object_begin(w, "trid"); 3595 nvme_bdev_dump_trid_json(trid, w); 3596 spdk_json_write_object_end(w); 3597 3598 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3599 if (path_id != NULL) { 3600 spdk_json_write_named_array_begin(w, "alternate_trids"); 3601 do { 3602 trid = &path_id->trid; 3603 spdk_json_write_object_begin(w); 3604 nvme_bdev_dump_trid_json(trid, w); 3605 spdk_json_write_object_end(w); 3606 3607 path_id = TAILQ_NEXT(path_id, link); 3608 } while (path_id != NULL); 3609 spdk_json_write_array_end(w); 3610 } 3611 3612 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3613 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3614 3615 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3616 spdk_json_write_named_object_begin(w, "host"); 3617 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3618 spdk_json_write_named_string(w, "addr", opts->src_addr); 3619 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3620 spdk_json_write_object_end(w); 3621 3622 spdk_json_write_object_end(w); 3623 } 3624 3625 static void 3626 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3627 struct nvme_ns *nvme_ns) 3628 { 3629 struct spdk_nvme_ns *ns; 3630 struct spdk_nvme_ctrlr *ctrlr; 3631 const struct spdk_nvme_ctrlr_data *cdata; 3632 const struct spdk_nvme_transport_id *trid; 3633 union spdk_nvme_vs_register vs; 3634 const struct spdk_nvme_ns_data *nsdata; 3635 char buf[128]; 3636 3637 ns = nvme_ns->ns; 3638 if (ns == NULL) { 3639 return; 3640 } 3641 3642 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3643 3644 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3645 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3646 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3647 3648 spdk_json_write_object_begin(w); 3649 3650 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3651 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3652 } 3653 3654 spdk_json_write_named_object_begin(w, "trid"); 3655 3656 nvme_bdev_dump_trid_json(trid, w); 3657 3658 spdk_json_write_object_end(w); 3659 3660 #ifdef SPDK_CONFIG_NVME_CUSE 3661 size_t cuse_name_size = 128; 3662 char cuse_name[cuse_name_size]; 3663 3664 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3665 cuse_name, &cuse_name_size); 3666 if (rc == 0) { 3667 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3668 } 3669 #endif 3670 3671 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3672 3673 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3674 3675 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3676 3677 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3678 spdk_str_trim(buf); 3679 spdk_json_write_named_string(w, "model_number", buf); 3680 3681 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3682 spdk_str_trim(buf); 3683 spdk_json_write_named_string(w, "serial_number", buf); 3684 3685 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3686 spdk_str_trim(buf); 3687 spdk_json_write_named_string(w, "firmware_revision", buf); 3688 3689 if (cdata->subnqn[0] != '\0') { 3690 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3691 } 3692 3693 spdk_json_write_named_object_begin(w, "oacs"); 3694 3695 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3696 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3697 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3698 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3699 3700 spdk_json_write_object_end(w); 3701 3702 
spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3703 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3704 3705 spdk_json_write_object_end(w); 3706 3707 spdk_json_write_named_object_begin(w, "vs"); 3708 3709 spdk_json_write_name(w, "nvme_version"); 3710 if (vs.bits.ter) { 3711 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3712 } else { 3713 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3714 } 3715 3716 spdk_json_write_object_end(w); 3717 3718 nsdata = spdk_nvme_ns_get_data(ns); 3719 3720 spdk_json_write_named_object_begin(w, "ns_data"); 3721 3722 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3723 3724 if (cdata->cmic.ana_reporting) { 3725 spdk_json_write_named_string(w, "ana_state", 3726 _nvme_ana_state_str(nvme_ns->ana_state)); 3727 } 3728 3729 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3730 3731 spdk_json_write_object_end(w); 3732 3733 if (cdata->oacs.security) { 3734 spdk_json_write_named_object_begin(w, "security"); 3735 3736 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3737 3738 spdk_json_write_object_end(w); 3739 } 3740 3741 spdk_json_write_object_end(w); 3742 } 3743 3744 static const char * 3745 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3746 { 3747 switch (nbdev->mp_policy) { 3748 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3749 return "active_passive"; 3750 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3751 return "active_active"; 3752 default: 3753 assert(false); 3754 return "invalid"; 3755 } 3756 } 3757 3758 static int 3759 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3760 { 3761 struct nvme_bdev *nvme_bdev = ctx; 3762 struct nvme_ns *nvme_ns; 3763 3764 pthread_mutex_lock(&nvme_bdev->mutex); 3765 spdk_json_write_named_array_begin(w, "nvme"); 3766 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3767 nvme_namespace_info_json(w, nvme_ns); 3768 } 3769 spdk_json_write_array_end(w); 3770 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3771 pthread_mutex_unlock(&nvme_bdev->mutex); 3772 3773 return 0; 3774 } 3775 3776 static void 3777 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3778 { 3779 /* No config per bdev needed */ 3780 } 3781 3782 static uint64_t 3783 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3784 { 3785 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3786 struct nvme_io_path *io_path; 3787 struct nvme_poll_group *group; 3788 uint64_t spin_time = 0; 3789 3790 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3791 group = io_path->qpair->group; 3792 3793 if (!group || !group->collect_spin_stat) { 3794 continue; 3795 } 3796 3797 if (group->end_ticks != 0) { 3798 group->spin_ticks += (group->end_ticks - group->start_ticks); 3799 group->end_ticks = 0; 3800 } 3801 3802 spin_time += group->spin_ticks; 3803 group->start_ticks = 0; 3804 group->spin_ticks = 0; 3805 } 3806 3807 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3808 } 3809 3810 static void 3811 bdev_nvme_reset_device_stat(void *ctx) 3812 { 3813 struct nvme_bdev *nbdev = ctx; 3814 3815 if (nbdev->err_stat != NULL) { 3816 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3817 } 3818 } 3819 3820 /* JSON string should be lowercases and underscore delimited string. 
*/ 3821 static void 3822 bdev_nvme_format_nvme_status(char *dst, const char *src) 3823 { 3824 char tmp[256]; 3825 3826 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3827 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3828 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3829 spdk_strlwr(dst); 3830 } 3831 3832 static void 3833 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3834 { 3835 struct nvme_bdev *nbdev = ctx; 3836 struct spdk_nvme_status status = {}; 3837 uint16_t sct, sc; 3838 char status_json[256]; 3839 const char *status_str; 3840 3841 if (nbdev->err_stat == NULL) { 3842 return; 3843 } 3844 3845 spdk_json_write_named_object_begin(w, "nvme_error"); 3846 3847 spdk_json_write_named_object_begin(w, "status_type"); 3848 for (sct = 0; sct < 8; sct++) { 3849 if (nbdev->err_stat->status_type[sct] == 0) { 3850 continue; 3851 } 3852 status.sct = sct; 3853 3854 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3855 assert(status_str != NULL); 3856 bdev_nvme_format_nvme_status(status_json, status_str); 3857 3858 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3859 } 3860 spdk_json_write_object_end(w); 3861 3862 spdk_json_write_named_object_begin(w, "status_code"); 3863 for (sct = 0; sct < 4; sct++) { 3864 status.sct = sct; 3865 for (sc = 0; sc < 256; sc++) { 3866 if (nbdev->err_stat->status[sct][sc] == 0) { 3867 continue; 3868 } 3869 status.sc = sc; 3870 3871 status_str = spdk_nvme_cpl_get_status_string(&status); 3872 assert(status_str != NULL); 3873 bdev_nvme_format_nvme_status(status_json, status_str); 3874 3875 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3876 } 3877 } 3878 spdk_json_write_object_end(w); 3879 3880 spdk_json_write_object_end(w); 3881 } 3882 3883 static bool 3884 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3885 { 3886 struct nvme_bdev *nbdev = ctx; 3887 struct spdk_nvme_ctrlr *ctrlr; 3888 3889 if (!g_opts.allow_accel_sequence) { 3890 return false; 3891 } 3892 3893 switch (type) { 3894 case SPDK_BDEV_IO_TYPE_WRITE: 3895 case SPDK_BDEV_IO_TYPE_READ: 3896 break; 3897 default: 3898 return false; 3899 } 3900 3901 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3902 assert(ctrlr != NULL); 3903 3904 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3905 } 3906 3907 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3908 .destruct = bdev_nvme_destruct, 3909 .submit_request = bdev_nvme_submit_request, 3910 .io_type_supported = bdev_nvme_io_type_supported, 3911 .get_io_channel = bdev_nvme_get_io_channel, 3912 .dump_info_json = bdev_nvme_dump_info_json, 3913 .write_config_json = bdev_nvme_write_config_json, 3914 .get_spin_time = bdev_nvme_get_spin_time, 3915 .get_module_ctx = bdev_nvme_get_module_ctx, 3916 .get_memory_domains = bdev_nvme_get_memory_domains, 3917 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3918 .reset_device_stat = bdev_nvme_reset_device_stat, 3919 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3920 }; 3921 3922 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3923 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3924 3925 static int 3926 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3927 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3928 { 3929 struct spdk_nvme_ana_group_descriptor *copied_desc; 3930 uint8_t *orig_desc; 3931 uint32_t i, desc_size, copy_len; 3932 int rc = 0; 3933 3934 if (nvme_ctrlr->ana_log_page == NULL) { 3935 return 
-EINVAL; 3936 } 3937 3938 copied_desc = nvme_ctrlr->copied_ana_desc; 3939 3940 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3941 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3942 3943 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3944 memcpy(copied_desc, orig_desc, copy_len); 3945 3946 rc = cb_fn(copied_desc, cb_arg); 3947 if (rc != 0) { 3948 break; 3949 } 3950 3951 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3952 copied_desc->num_of_nsid * sizeof(uint32_t); 3953 orig_desc += desc_size; 3954 copy_len -= desc_size; 3955 } 3956 3957 return rc; 3958 } 3959 3960 static int 3961 nvme_ns_ana_transition_timedout(void *ctx) 3962 { 3963 struct nvme_ns *nvme_ns = ctx; 3964 3965 spdk_poller_unregister(&nvme_ns->anatt_timer); 3966 nvme_ns->ana_transition_timedout = true; 3967 3968 return SPDK_POLLER_BUSY; 3969 } 3970 3971 static void 3972 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3973 const struct spdk_nvme_ana_group_descriptor *desc) 3974 { 3975 const struct spdk_nvme_ctrlr_data *cdata; 3976 3977 nvme_ns->ana_group_id = desc->ana_group_id; 3978 nvme_ns->ana_state = desc->ana_state; 3979 nvme_ns->ana_state_updating = false; 3980 3981 switch (nvme_ns->ana_state) { 3982 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3983 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3984 nvme_ns->ana_transition_timedout = false; 3985 spdk_poller_unregister(&nvme_ns->anatt_timer); 3986 break; 3987 3988 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3989 case SPDK_NVME_ANA_CHANGE_STATE: 3990 if (nvme_ns->anatt_timer != NULL) { 3991 break; 3992 } 3993 3994 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3995 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3996 nvme_ns, 3997 cdata->anatt * SPDK_SEC_TO_USEC); 3998 break; 3999 default: 4000 break; 4001 } 4002 } 4003 4004 static int 4005 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4006 { 4007 struct nvme_ns *nvme_ns = cb_arg; 4008 uint32_t i; 4009 4010 assert(nvme_ns->ns != NULL); 4011 4012 for (i = 0; i < desc->num_of_nsid; i++) { 4013 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4014 continue; 4015 } 4016 4017 _nvme_ns_set_ana_state(nvme_ns, desc); 4018 return 1; 4019 } 4020 4021 return 0; 4022 } 4023 4024 static int 4025 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4026 { 4027 int rc = 0; 4028 struct spdk_uuid new_uuid, namespace_uuid; 4029 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4030 /* This namespace UUID was generated using uuid_generate() method. 
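 * It is used below as the fixed namespace argument to spdk_uuid_generate_sha1(), so the
 * SHA-1 name-based UUID derived from "<serial number><nsid>" is deterministic: the same
 * controller and namespace always map to the same bdev UUID across restarts.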
*/ 4031 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4032 int size; 4033 4034 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4035 4036 spdk_uuid_set_null(&new_uuid); 4037 spdk_uuid_set_null(&namespace_uuid); 4038 4039 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4040 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4041 return -EINVAL; 4042 } 4043 4044 spdk_uuid_parse(&namespace_uuid, namespace_str); 4045 4046 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4047 if (rc == 0) { 4048 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4049 } 4050 4051 return rc; 4052 } 4053 4054 static int 4055 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4056 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4057 uint32_t prchk_flags, void *ctx) 4058 { 4059 const struct spdk_uuid *uuid; 4060 const uint8_t *nguid; 4061 const struct spdk_nvme_ctrlr_data *cdata; 4062 const struct spdk_nvme_ns_data *nsdata; 4063 const struct spdk_nvme_ctrlr_opts *opts; 4064 enum spdk_nvme_csi csi; 4065 uint32_t atomic_bs, phys_bs, bs; 4066 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4067 int rc; 4068 4069 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4070 csi = spdk_nvme_ns_get_csi(ns); 4071 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4072 4073 switch (csi) { 4074 case SPDK_NVME_CSI_NVM: 4075 disk->product_name = "NVMe disk"; 4076 break; 4077 case SPDK_NVME_CSI_ZNS: 4078 disk->product_name = "NVMe ZNS disk"; 4079 disk->zoned = true; 4080 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4081 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4082 spdk_nvme_ns_get_extended_sector_size(ns); 4083 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4084 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4085 break; 4086 default: 4087 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4088 return -ENOTSUP; 4089 } 4090 4091 nguid = spdk_nvme_ns_get_nguid(ns); 4092 if (!nguid) { 4093 uuid = spdk_nvme_ns_get_uuid(ns); 4094 if (uuid) { 4095 disk->uuid = *uuid; 4096 } else if (g_opts.generate_uuids) { 4097 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4098 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4099 if (rc < 0) { 4100 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4101 return rc; 4102 } 4103 } 4104 } else { 4105 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4106 } 4107 4108 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4109 if (!disk->name) { 4110 return -ENOMEM; 4111 } 4112 4113 disk->write_cache = 0; 4114 if (cdata->vwc.present) { 4115 /* Enable if the Volatile Write Cache exists */ 4116 disk->write_cache = 1; 4117 } 4118 if (cdata->oncs.write_zeroes) { 4119 disk->max_write_zeroes = UINT16_MAX + 1; 4120 } 4121 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4122 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4123 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4124 /* NVMe driver will split one request into multiple requests 4125 * based on MDTS and stripe boundary, the bdev layer will use 4126 * max_segment_size and max_num_segments to split one big IO 4127 * into multiple requests, then small request can't run out 4128 * of NVMe internal requests data structure. 
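 * For example (illustrative numbers only): with io_queue_requests = 512, the code below caps
 * max_num_segments at 256, so even a bdev I/O that is split into the maximum number of
 * segments cannot consume every request object on the I/O qpair.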
4129 */ 4130 if (opts && opts->io_queue_requests) { 4131 disk->max_num_segments = opts->io_queue_requests / 2; 4132 } 4133 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4134 /* The nvme driver will try to split I/O that have too many 4135 * SGEs, but it doesn't work if that last SGE doesn't end on 4136 * an aggregate total that is block aligned. The bdev layer has 4137 * a more robust splitting framework, so use that instead for 4138 * this case. (See issue #3269.) 4139 */ 4140 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4141 4142 if (disk->max_num_segments == 0) { 4143 disk->max_num_segments = max_sges; 4144 } else { 4145 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4146 } 4147 } 4148 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4149 4150 nsdata = spdk_nvme_ns_get_data(ns); 4151 bs = spdk_nvme_ns_get_sector_size(ns); 4152 atomic_bs = bs; 4153 phys_bs = bs; 4154 if (nsdata->nabo == 0) { 4155 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4156 atomic_bs = bs * (1 + nsdata->nawupf); 4157 } else { 4158 atomic_bs = bs * (1 + cdata->awupf); 4159 } 4160 } 4161 if (nsdata->nsfeat.optperf) { 4162 phys_bs = bs * (1 + nsdata->npwg); 4163 } 4164 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4165 4166 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4167 if (disk->md_len != 0) { 4168 disk->md_interleave = nsdata->flbas.extended; 4169 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4170 if (disk->dif_type != SPDK_DIF_DISABLE) { 4171 disk->dif_is_head_of_md = nsdata->dps.md_start; 4172 disk->dif_check_flags = prchk_flags; 4173 } 4174 } 4175 4176 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4177 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4178 disk->acwu = 0; 4179 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4180 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4181 } else { 4182 disk->acwu = cdata->acwu + 1; /* 0-based */ 4183 } 4184 4185 if (cdata->oncs.copy) { 4186 /* For now bdev interface allows only single segment copy */ 4187 disk->max_copy = nsdata->mssrl; 4188 } 4189 4190 disk->ctxt = ctx; 4191 disk->fn_table = &nvmelib_fn_table; 4192 disk->module = &nvme_if; 4193 4194 return 0; 4195 } 4196 4197 static struct nvme_bdev * 4198 nvme_bdev_alloc(void) 4199 { 4200 struct nvme_bdev *bdev; 4201 int rc; 4202 4203 bdev = calloc(1, sizeof(*bdev)); 4204 if (!bdev) { 4205 SPDK_ERRLOG("bdev calloc() failed\n"); 4206 return NULL; 4207 } 4208 4209 if (g_opts.nvme_error_stat) { 4210 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4211 if (!bdev->err_stat) { 4212 SPDK_ERRLOG("err_stat calloc() failed\n"); 4213 free(bdev); 4214 return NULL; 4215 } 4216 } 4217 4218 rc = pthread_mutex_init(&bdev->mutex, NULL); 4219 if (rc != 0) { 4220 free(bdev->err_stat); 4221 free(bdev); 4222 return NULL; 4223 } 4224 4225 bdev->ref = 1; 4226 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4227 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4228 bdev->rr_min_io = UINT32_MAX; 4229 TAILQ_INIT(&bdev->nvme_ns_list); 4230 4231 return bdev; 4232 } 4233 4234 static int 4235 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4236 { 4237 struct nvme_bdev *bdev; 4238 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4239 int rc; 4240 4241 bdev = nvme_bdev_alloc(); 4242 if (bdev == NULL) { 4243 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4244 return -ENOMEM; 4245 } 4246 4247 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4248 4249 rc = 
nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4250 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4251 if (rc != 0) { 4252 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4253 nvme_bdev_free(bdev); 4254 return rc; 4255 } 4256 4257 spdk_io_device_register(bdev, 4258 bdev_nvme_create_bdev_channel_cb, 4259 bdev_nvme_destroy_bdev_channel_cb, 4260 sizeof(struct nvme_bdev_channel), 4261 bdev->disk.name); 4262 4263 nvme_ns->bdev = bdev; 4264 bdev->nsid = nvme_ns->id; 4265 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4266 4267 bdev->nbdev_ctrlr = nbdev_ctrlr; 4268 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4269 4270 rc = spdk_bdev_register(&bdev->disk); 4271 if (rc != 0) { 4272 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4273 spdk_io_device_unregister(bdev, NULL); 4274 nvme_ns->bdev = NULL; 4275 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4276 nvme_bdev_free(bdev); 4277 return rc; 4278 } 4279 4280 return 0; 4281 } 4282 4283 static bool 4284 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4285 { 4286 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4287 const struct spdk_uuid *uuid1, *uuid2; 4288 4289 nsdata1 = spdk_nvme_ns_get_data(ns1); 4290 nsdata2 = spdk_nvme_ns_get_data(ns2); 4291 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4292 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4293 4294 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4295 nsdata1->eui64 == nsdata2->eui64 && 4296 ((uuid1 == NULL && uuid2 == NULL) || 4297 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4298 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4299 } 4300 4301 static bool 4302 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4303 struct spdk_nvme_ctrlr_opts *opts) 4304 { 4305 struct nvme_probe_skip_entry *entry; 4306 4307 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4308 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4309 return false; 4310 } 4311 } 4312 4313 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4314 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4315 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4316 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4317 opts->disable_read_ana_log_page = true; 4318 4319 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4320 4321 return true; 4322 } 4323 4324 static void 4325 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4326 { 4327 struct nvme_ctrlr *nvme_ctrlr = ctx; 4328 4329 if (spdk_nvme_cpl_is_error(cpl)) { 4330 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4331 cpl->status.sct); 4332 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4333 } else if (cpl->cdw0 & 0x1) { 4334 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4335 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4336 } 4337 } 4338 4339 static void 4340 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4341 struct spdk_nvme_qpair *qpair, uint16_t cid) 4342 { 4343 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4344 union spdk_nvme_csts_register csts; 4345 int rc; 4346 4347 assert(nvme_ctrlr->ctrlr == ctrlr); 4348 4349 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4350 4351 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4352 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) 
Otherwise we 4353 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4354 * completion recursively. 4355 */ 4356 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4357 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4358 if (csts.bits.cfs) { 4359 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4360 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4361 return; 4362 } 4363 } 4364 4365 switch (g_opts.action_on_timeout) { 4366 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4367 if (qpair) { 4368 /* Don't send abort to ctrlr when ctrlr is not available. */ 4369 pthread_mutex_lock(&nvme_ctrlr->mutex); 4370 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4371 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4372 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4373 return; 4374 } 4375 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4376 4377 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4378 nvme_abort_cpl, nvme_ctrlr); 4379 if (rc == 0) { 4380 return; 4381 } 4382 4383 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4384 } 4385 4386 /* FALLTHROUGH */ 4387 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4388 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4389 break; 4390 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4391 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4392 break; 4393 default: 4394 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4395 break; 4396 } 4397 } 4398 4399 static struct nvme_ns * 4400 nvme_ns_alloc(void) 4401 { 4402 struct nvme_ns *nvme_ns; 4403 4404 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4405 if (nvme_ns == NULL) { 4406 return NULL; 4407 } 4408 4409 if (g_opts.io_path_stat) { 4410 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4411 if (nvme_ns->stat == NULL) { 4412 free(nvme_ns); 4413 return NULL; 4414 } 4415 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4416 } 4417 4418 return nvme_ns; 4419 } 4420 4421 static void 4422 nvme_ns_free(struct nvme_ns *nvme_ns) 4423 { 4424 free(nvme_ns->stat); 4425 free(nvme_ns); 4426 } 4427 4428 static void 4429 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4430 { 4431 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4432 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4433 4434 if (rc == 0) { 4435 nvme_ns->probe_ctx = NULL; 4436 pthread_mutex_lock(&nvme_ctrlr->mutex); 4437 nvme_ctrlr->ref++; 4438 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4439 } else { 4440 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4441 nvme_ns_free(nvme_ns); 4442 } 4443 4444 if (ctx) { 4445 ctx->populates_in_progress--; 4446 if (ctx->populates_in_progress == 0) { 4447 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4448 } 4449 } 4450 } 4451 4452 static void 4453 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4454 { 4455 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4456 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4457 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4458 int rc; 4459 4460 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4461 if (rc != 0) { 4462 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4463 } 4464 4465 spdk_for_each_channel_continue(i, rc); 4466 } 4467 4468 static void 4469 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4470 { 4471 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4472 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4473 struct 
nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4474 struct nvme_io_path *io_path; 4475 4476 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4477 if (io_path != NULL) { 4478 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4479 } 4480 4481 spdk_for_each_channel_continue(i, 0); 4482 } 4483 4484 static void 4485 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4486 { 4487 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4488 4489 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4490 } 4491 4492 static void 4493 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4494 { 4495 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4496 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4497 4498 if (status == 0) { 4499 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4500 } else { 4501 /* Delete the added io_paths and fail populating the namespace. */ 4502 spdk_for_each_channel(bdev, 4503 bdev_nvme_delete_io_path, 4504 nvme_ns, 4505 bdev_nvme_add_io_path_failed); 4506 } 4507 } 4508 4509 static int 4510 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4511 { 4512 struct nvme_ns *tmp_ns; 4513 const struct spdk_nvme_ns_data *nsdata; 4514 4515 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4516 if (!nsdata->nmic.can_share) { 4517 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4518 return -EINVAL; 4519 } 4520 4521 pthread_mutex_lock(&bdev->mutex); 4522 4523 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4524 assert(tmp_ns != NULL); 4525 4526 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4527 pthread_mutex_unlock(&bdev->mutex); 4528 SPDK_ERRLOG("Namespaces are not identical.\n"); 4529 return -EINVAL; 4530 } 4531 4532 bdev->ref++; 4533 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4534 nvme_ns->bdev = bdev; 4535 4536 pthread_mutex_unlock(&bdev->mutex); 4537 4538 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
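 * spdk_for_each_channel() below visits every existing nvme_bdev_channel on its owning thread.
 * If adding the path fails on any channel, bdev_nvme_add_io_path_done() removes the paths that
 * were already added and completes the populate with an error.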
*/ 4539 spdk_for_each_channel(bdev, 4540 bdev_nvme_add_io_path, 4541 nvme_ns, 4542 bdev_nvme_add_io_path_done); 4543 4544 return 0; 4545 } 4546 4547 static void 4548 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4549 { 4550 struct spdk_nvme_ns *ns; 4551 struct nvme_bdev *bdev; 4552 int rc = 0; 4553 4554 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4555 if (!ns) { 4556 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4557 rc = -EINVAL; 4558 goto done; 4559 } 4560 4561 nvme_ns->ns = ns; 4562 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4563 4564 if (nvme_ctrlr->ana_log_page != NULL) { 4565 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4566 } 4567 4568 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4569 if (bdev == NULL) { 4570 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4571 } else { 4572 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4573 if (rc == 0) { 4574 return; 4575 } 4576 } 4577 done: 4578 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4579 } 4580 4581 static void 4582 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4583 { 4584 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4585 4586 assert(nvme_ctrlr != NULL); 4587 4588 pthread_mutex_lock(&nvme_ctrlr->mutex); 4589 4590 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4591 4592 if (nvme_ns->bdev != NULL) { 4593 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4594 return; 4595 } 4596 4597 nvme_ns_free(nvme_ns); 4598 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4599 4600 nvme_ctrlr_release(nvme_ctrlr); 4601 } 4602 4603 static void 4604 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4605 { 4606 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4607 4608 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4609 } 4610 4611 static void 4612 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4613 { 4614 struct nvme_bdev *bdev; 4615 4616 spdk_poller_unregister(&nvme_ns->anatt_timer); 4617 4618 bdev = nvme_ns->bdev; 4619 if (bdev != NULL) { 4620 pthread_mutex_lock(&bdev->mutex); 4621 4622 assert(bdev->ref > 0); 4623 bdev->ref--; 4624 if (bdev->ref == 0) { 4625 pthread_mutex_unlock(&bdev->mutex); 4626 4627 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4628 } else { 4629 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4630 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4631 * and clear nvme_ns->bdev here. 4632 */ 4633 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4634 nvme_ns->bdev = NULL; 4635 4636 pthread_mutex_unlock(&bdev->mutex); 4637 4638 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4639 * we call depopulate_namespace_done() to avoid use-after-free. 4640 */ 4641 spdk_for_each_channel(bdev, 4642 bdev_nvme_delete_io_path, 4643 nvme_ns, 4644 bdev_nvme_delete_io_path_done); 4645 return; 4646 } 4647 } 4648 4649 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4650 } 4651 4652 static void 4653 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4654 struct nvme_async_probe_ctx *ctx) 4655 { 4656 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4657 struct nvme_ns *nvme_ns, *next; 4658 struct spdk_nvme_ns *ns; 4659 struct nvme_bdev *bdev; 4660 uint32_t nsid; 4661 int rc; 4662 uint64_t num_sectors; 4663 4664 if (ctx) { 4665 /* Initialize this count to 1 to handle the populate functions 4666 * calling nvme_ctrlr_populate_namespace_done() immediately. 
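 * The matching decrement after both namespace loops releases this initial count, and
 * nvme_ctrlr_populate_namespaces_done() runs only once the count drops to zero.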
4667 */ 4668 ctx->populates_in_progress = 1; 4669 } 4670 4671 /* First loop over our existing namespaces and see if they have been 4672 * removed. */ 4673 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4674 while (nvme_ns != NULL) { 4675 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4676 4677 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4678 /* NS is still there or added again. Its attributes may have changed. */ 4679 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4680 if (nvme_ns->ns != ns) { 4681 assert(nvme_ns->ns == NULL); 4682 nvme_ns->ns = ns; 4683 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4684 } 4685 4686 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4687 bdev = nvme_ns->bdev; 4688 assert(bdev != NULL); 4689 if (bdev->disk.blockcnt != num_sectors) { 4690 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4691 nvme_ns->id, 4692 bdev->disk.name, 4693 bdev->disk.blockcnt, 4694 num_sectors); 4695 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4696 if (rc != 0) { 4697 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4698 bdev->disk.name, rc); 4699 } 4700 } 4701 } else { 4702 /* Namespace was removed */ 4703 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4704 } 4705 4706 nvme_ns = next; 4707 } 4708 4709 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4710 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4711 while (nsid != 0) { 4712 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4713 4714 if (nvme_ns == NULL) { 4715 /* Found a new one */ 4716 nvme_ns = nvme_ns_alloc(); 4717 if (nvme_ns == NULL) { 4718 SPDK_ERRLOG("Failed to allocate namespace\n"); 4719 /* This just fails to attach the namespace. It may work on a future attempt. */ 4720 continue; 4721 } 4722 4723 nvme_ns->id = nsid; 4724 nvme_ns->ctrlr = nvme_ctrlr; 4725 4726 nvme_ns->bdev = NULL; 4727 4728 if (ctx) { 4729 ctx->populates_in_progress++; 4730 } 4731 nvme_ns->probe_ctx = ctx; 4732 4733 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4734 4735 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4736 } 4737 4738 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4739 } 4740 4741 if (ctx) { 4742 /* Decrement this count now that the loop is over to account 4743 * for the one we started with. If the count is then 0, we 4744 * know any populate_namespace functions completed immediately, 4745 * so we'll kick the callback here. 
4746 */ 4747 ctx->populates_in_progress--; 4748 if (ctx->populates_in_progress == 0) { 4749 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4750 } 4751 } 4752 4753 } 4754 4755 static void 4756 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4757 { 4758 struct nvme_ns *nvme_ns, *tmp; 4759 4760 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4761 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4762 } 4763 } 4764 4765 static uint32_t 4766 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4767 { 4768 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4769 const struct spdk_nvme_ctrlr_data *cdata; 4770 uint32_t nsid, ns_count = 0; 4771 4772 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4773 4774 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4775 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4776 ns_count++; 4777 } 4778 4779 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4780 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4781 sizeof(uint32_t); 4782 } 4783 4784 static int 4785 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4786 void *cb_arg) 4787 { 4788 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4789 struct nvme_ns *nvme_ns; 4790 uint32_t i, nsid; 4791 4792 for (i = 0; i < desc->num_of_nsid; i++) { 4793 nsid = desc->nsid[i]; 4794 if (nsid == 0) { 4795 continue; 4796 } 4797 4798 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4799 4800 assert(nvme_ns != NULL); 4801 if (nvme_ns == NULL) { 4802 /* Target told us that an inactive namespace had an ANA change */ 4803 continue; 4804 } 4805 4806 _nvme_ns_set_ana_state(nvme_ns, desc); 4807 } 4808 4809 return 0; 4810 } 4811 4812 static void 4813 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4814 { 4815 struct nvme_ns *nvme_ns; 4816 4817 spdk_free(nvme_ctrlr->ana_log_page); 4818 nvme_ctrlr->ana_log_page = NULL; 4819 4820 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4821 nvme_ns != NULL; 4822 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4823 nvme_ns->ana_state_updating = false; 4824 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4825 } 4826 } 4827 4828 static void 4829 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4830 { 4831 struct nvme_ctrlr *nvme_ctrlr = ctx; 4832 4833 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4834 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4835 nvme_ctrlr); 4836 } else { 4837 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4838 } 4839 4840 pthread_mutex_lock(&nvme_ctrlr->mutex); 4841 4842 assert(nvme_ctrlr->ana_log_page_updating == true); 4843 nvme_ctrlr->ana_log_page_updating = false; 4844 4845 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4846 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4847 4848 nvme_ctrlr_unregister(nvme_ctrlr); 4849 } else { 4850 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4851 4852 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4853 } 4854 } 4855 4856 static int 4857 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4858 { 4859 uint32_t ana_log_page_size; 4860 int rc; 4861 4862 if (nvme_ctrlr->ana_log_page == NULL) { 4863 return -EINVAL; 4864 } 4865 4866 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4867 4868 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4869 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4870 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4871 
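/* max_ana_log_page_size was sized at attach time in nvme_ctrlr_init_ana_log_page() from
 * NANAGRPID and MNAN; a larger read would not fit the preallocated buffer, so fail this refresh.
 */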
return -EINVAL; 4872 } 4873 4874 pthread_mutex_lock(&nvme_ctrlr->mutex); 4875 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4876 nvme_ctrlr->ana_log_page_updating) { 4877 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4878 return -EBUSY; 4879 } 4880 4881 nvme_ctrlr->ana_log_page_updating = true; 4882 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4883 4884 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4885 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4886 SPDK_NVME_GLOBAL_NS_TAG, 4887 nvme_ctrlr->ana_log_page, 4888 ana_log_page_size, 0, 4889 nvme_ctrlr_read_ana_log_page_done, 4890 nvme_ctrlr); 4891 if (rc != 0) { 4892 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4893 } 4894 4895 return rc; 4896 } 4897 4898 static void 4899 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4900 { 4901 } 4902 4903 struct bdev_nvme_set_preferred_path_ctx { 4904 struct spdk_bdev_desc *desc; 4905 struct nvme_ns *nvme_ns; 4906 bdev_nvme_set_preferred_path_cb cb_fn; 4907 void *cb_arg; 4908 }; 4909 4910 static void 4911 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4912 { 4913 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4914 4915 assert(ctx != NULL); 4916 assert(ctx->desc != NULL); 4917 assert(ctx->cb_fn != NULL); 4918 4919 spdk_bdev_close(ctx->desc); 4920 4921 ctx->cb_fn(ctx->cb_arg, status); 4922 4923 free(ctx); 4924 } 4925 4926 static void 4927 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4928 { 4929 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4930 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4931 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4932 struct nvme_io_path *io_path, *prev; 4933 4934 prev = NULL; 4935 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4936 if (io_path->nvme_ns == ctx->nvme_ns) { 4937 break; 4938 } 4939 prev = io_path; 4940 } 4941 4942 if (io_path != NULL) { 4943 if (prev != NULL) { 4944 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4945 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4946 } 4947 4948 /* We can set io_path to nbdev_ch->current_io_path directly here. 4949 * However, it needs to be conditional. To simplify the code, 4950 * just clear nbdev_ch->current_io_path and let find_io_path() 4951 * fill it. 4952 * 4953 * Automatic failback may be disabled. Hence even if the io_path is 4954 * already at the head, clear nbdev_ch->current_io_path. 4955 */ 4956 bdev_nvme_clear_current_io_path(nbdev_ch); 4957 } 4958 4959 spdk_for_each_channel_continue(i, 0); 4960 } 4961 4962 static struct nvme_ns * 4963 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4964 { 4965 struct nvme_ns *nvme_ns, *prev; 4966 const struct spdk_nvme_ctrlr_data *cdata; 4967 4968 prev = NULL; 4969 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4970 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4971 4972 if (cdata->cntlid == cntlid) { 4973 break; 4974 } 4975 prev = nvme_ns; 4976 } 4977 4978 if (nvme_ns != NULL && prev != NULL) { 4979 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4980 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4981 } 4982 4983 return nvme_ns; 4984 } 4985 4986 /* This function supports only multipath mode. There is only a single I/O path 4987 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4988 * head of the I/O path list for each NVMe bdev channel. 
4989 * 4990 * NVMe bdev channel may be acquired after completing this function. move the 4991 * matched namespace to the head of the namespace list for the NVMe bdev too. 4992 */ 4993 void 4994 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4995 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4996 { 4997 struct bdev_nvme_set_preferred_path_ctx *ctx; 4998 struct spdk_bdev *bdev; 4999 struct nvme_bdev *nbdev; 5000 int rc = 0; 5001 5002 assert(cb_fn != NULL); 5003 5004 ctx = calloc(1, sizeof(*ctx)); 5005 if (ctx == NULL) { 5006 SPDK_ERRLOG("Failed to alloc context.\n"); 5007 rc = -ENOMEM; 5008 goto err_alloc; 5009 } 5010 5011 ctx->cb_fn = cb_fn; 5012 ctx->cb_arg = cb_arg; 5013 5014 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5015 if (rc != 0) { 5016 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5017 goto err_open; 5018 } 5019 5020 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5021 5022 if (bdev->module != &nvme_if) { 5023 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5024 rc = -ENODEV; 5025 goto err_bdev; 5026 } 5027 5028 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5029 5030 pthread_mutex_lock(&nbdev->mutex); 5031 5032 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5033 if (ctx->nvme_ns == NULL) { 5034 pthread_mutex_unlock(&nbdev->mutex); 5035 5036 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5037 rc = -ENODEV; 5038 goto err_bdev; 5039 } 5040 5041 pthread_mutex_unlock(&nbdev->mutex); 5042 5043 spdk_for_each_channel(nbdev, 5044 _bdev_nvme_set_preferred_path, 5045 ctx, 5046 bdev_nvme_set_preferred_path_done); 5047 return; 5048 5049 err_bdev: 5050 spdk_bdev_close(ctx->desc); 5051 err_open: 5052 free(ctx); 5053 err_alloc: 5054 cb_fn(cb_arg, rc); 5055 } 5056 5057 struct bdev_nvme_set_multipath_policy_ctx { 5058 struct spdk_bdev_desc *desc; 5059 bdev_nvme_set_multipath_policy_cb cb_fn; 5060 void *cb_arg; 5061 }; 5062 5063 static void 5064 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5065 { 5066 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5067 5068 assert(ctx != NULL); 5069 assert(ctx->desc != NULL); 5070 assert(ctx->cb_fn != NULL); 5071 5072 spdk_bdev_close(ctx->desc); 5073 5074 ctx->cb_fn(ctx->cb_arg, status); 5075 5076 free(ctx); 5077 } 5078 5079 static void 5080 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5081 { 5082 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5083 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5084 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5085 5086 nbdev_ch->mp_policy = nbdev->mp_policy; 5087 nbdev_ch->mp_selector = nbdev->mp_selector; 5088 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5089 bdev_nvme_clear_current_io_path(nbdev_ch); 5090 5091 spdk_for_each_channel_continue(i, 0); 5092 } 5093 5094 void 5095 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5096 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5097 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5098 { 5099 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5100 struct spdk_bdev *bdev; 5101 struct nvme_bdev *nbdev; 5102 int rc; 5103 5104 assert(cb_fn != NULL); 5105 5106 switch (policy) { 5107 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5108 break; 5109 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5110 switch (selector) { 5111 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5112 if (rr_min_io 
== UINT32_MAX) { 5113 rr_min_io = 1; 5114 } else if (rr_min_io == 0) { 5115 rc = -EINVAL; 5116 goto exit; 5117 } 5118 break; 5119 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5120 break; 5121 default: 5122 rc = -EINVAL; 5123 goto exit; 5124 } 5125 break; 5126 default: 5127 rc = -EINVAL; 5128 goto exit; 5129 } 5130 5131 ctx = calloc(1, sizeof(*ctx)); 5132 if (ctx == NULL) { 5133 SPDK_ERRLOG("Failed to alloc context.\n"); 5134 rc = -ENOMEM; 5135 goto exit; 5136 } 5137 5138 ctx->cb_fn = cb_fn; 5139 ctx->cb_arg = cb_arg; 5140 5141 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5142 if (rc != 0) { 5143 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5144 rc = -ENODEV; 5145 goto err_open; 5146 } 5147 5148 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5149 if (bdev->module != &nvme_if) { 5150 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5151 rc = -ENODEV; 5152 goto err_module; 5153 } 5154 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5155 5156 pthread_mutex_lock(&nbdev->mutex); 5157 nbdev->mp_policy = policy; 5158 nbdev->mp_selector = selector; 5159 nbdev->rr_min_io = rr_min_io; 5160 pthread_mutex_unlock(&nbdev->mutex); 5161 5162 spdk_for_each_channel(nbdev, 5163 _bdev_nvme_set_multipath_policy, 5164 ctx, 5165 bdev_nvme_set_multipath_policy_done); 5166 return; 5167 5168 err_module: 5169 spdk_bdev_close(ctx->desc); 5170 err_open: 5171 free(ctx); 5172 exit: 5173 cb_fn(cb_arg, rc); 5174 } 5175 5176 static void 5177 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5178 { 5179 struct nvme_ctrlr *nvme_ctrlr = arg; 5180 union spdk_nvme_async_event_completion event; 5181 5182 if (spdk_nvme_cpl_is_error(cpl)) { 5183 SPDK_WARNLOG("AER request execute failed\n"); 5184 return; 5185 } 5186 5187 event.raw = cpl->cdw0; 5188 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5189 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5190 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5191 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5192 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5193 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5194 } 5195 } 5196 5197 static void 5198 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5199 { 5200 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5201 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5202 free(ctx); 5203 } 5204 5205 static void 5206 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5207 { 5208 if (ctx->cb_fn) { 5209 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5210 } 5211 5212 ctx->namespaces_populated = true; 5213 if (ctx->probe_done) { 5214 /* The probe was already completed, so we need to free the context 5215 * here. This can happen for cases like OCSSD, where we need to 5216 * send additional commands to the SSD after attach. 
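 * If the probe has not finished yet, bdev_nvme_async_poll() frees the context instead once it
 * observes namespaces_populated.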
5217 */ 5218 free_nvme_async_probe_ctx(ctx); 5219 } 5220 } 5221 5222 static void 5223 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5224 struct nvme_async_probe_ctx *ctx) 5225 { 5226 spdk_io_device_register(nvme_ctrlr, 5227 bdev_nvme_create_ctrlr_channel_cb, 5228 bdev_nvme_destroy_ctrlr_channel_cb, 5229 sizeof(struct nvme_ctrlr_channel), 5230 nvme_ctrlr->nbdev_ctrlr->name); 5231 5232 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5233 } 5234 5235 static void 5236 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5237 { 5238 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5239 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5240 5241 nvme_ctrlr->probe_ctx = NULL; 5242 5243 if (spdk_nvme_cpl_is_error(cpl)) { 5244 nvme_ctrlr_delete(nvme_ctrlr); 5245 5246 if (ctx != NULL) { 5247 ctx->reported_bdevs = 0; 5248 populate_namespaces_cb(ctx, -1); 5249 } 5250 return; 5251 } 5252 5253 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5254 } 5255 5256 static int 5257 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5258 struct nvme_async_probe_ctx *ctx) 5259 { 5260 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5261 const struct spdk_nvme_ctrlr_data *cdata; 5262 uint32_t ana_log_page_size; 5263 5264 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5265 5266 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5267 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5268 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5269 sizeof(uint32_t); 5270 5271 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5272 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5273 if (nvme_ctrlr->ana_log_page == NULL) { 5274 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5275 return -ENXIO; 5276 } 5277 5278 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5279 * Hence copy each descriptor to a temporary area when parsing it. 5280 * 5281 * Allocate a buffer whose size is as large as ANA log page buffer because 5282 * we do not know the size of a descriptor until actually reading it. 5283 */ 5284 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5285 if (nvme_ctrlr->copied_ana_desc == NULL) { 5286 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5287 return -ENOMEM; 5288 } 5289 5290 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5291 5292 nvme_ctrlr->probe_ctx = ctx; 5293 5294 /* Then, set the read size only to include the current active namespaces. */ 5295 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5296 5297 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5298 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5299 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5300 return -EINVAL; 5301 } 5302 5303 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5304 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5305 SPDK_NVME_GLOBAL_NS_TAG, 5306 nvme_ctrlr->ana_log_page, 5307 ana_log_page_size, 0, 5308 nvme_ctrlr_init_ana_log_page_done, 5309 nvme_ctrlr); 5310 } 5311 5312 /* hostnqn and subnqn were already verified before attaching a controller. 5313 * Hence check only the multipath capability and cntlid here. 
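 * Every controller already grouped under this nvme_bdev_ctrlr, as well as the newly attached
 * one, must report CMIC.multi_ctrlr, and the new CNTLID must not duplicate an existing one.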
5314 */ 5315 static bool 5316 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5317 { 5318 struct nvme_ctrlr *tmp; 5319 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5320 5321 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5322 5323 if (!cdata->cmic.multi_ctrlr) { 5324 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5325 return false; 5326 } 5327 5328 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5329 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5330 5331 if (!tmp_cdata->cmic.multi_ctrlr) { 5332 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5333 return false; 5334 } 5335 if (cdata->cntlid == tmp_cdata->cntlid) { 5336 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5337 return false; 5338 } 5339 } 5340 5341 return true; 5342 } 5343 5344 static int 5345 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5346 { 5347 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5348 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5349 int rc = 0; 5350 5351 pthread_mutex_lock(&g_bdev_nvme_mutex); 5352 5353 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5354 if (nbdev_ctrlr != NULL) { 5355 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5356 rc = -EINVAL; 5357 goto exit; 5358 } 5359 } else { 5360 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5361 if (nbdev_ctrlr == NULL) { 5362 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5363 rc = -ENOMEM; 5364 goto exit; 5365 } 5366 nbdev_ctrlr->name = strdup(name); 5367 if (nbdev_ctrlr->name == NULL) { 5368 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5369 free(nbdev_ctrlr); 5370 goto exit; 5371 } 5372 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5373 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5374 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5375 } 5376 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5377 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5378 exit: 5379 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5380 return rc; 5381 } 5382 5383 static int 5384 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5385 const char *name, 5386 const struct spdk_nvme_transport_id *trid, 5387 struct nvme_async_probe_ctx *ctx) 5388 { 5389 struct nvme_ctrlr *nvme_ctrlr; 5390 struct nvme_path_id *path_id; 5391 const struct spdk_nvme_ctrlr_data *cdata; 5392 int rc; 5393 5394 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5395 if (nvme_ctrlr == NULL) { 5396 SPDK_ERRLOG("Failed to allocate device struct\n"); 5397 return -ENOMEM; 5398 } 5399 5400 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5401 if (rc != 0) { 5402 free(nvme_ctrlr); 5403 return rc; 5404 } 5405 5406 TAILQ_INIT(&nvme_ctrlr->trids); 5407 RB_INIT(&nvme_ctrlr->namespaces); 5408 5409 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5410 if (ctx != NULL) { 5411 if (ctx->drv_opts.tls_psk != NULL) { 5412 nvme_ctrlr->psk = spdk_keyring_get_key( 5413 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5414 if (nvme_ctrlr->psk == NULL) { 5415 /* Could only happen if the key was removed in the meantime */ 5416 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5417 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5418 rc = -ENOKEY; 5419 goto err; 5420 } 5421 } 5422 5423 if (ctx->drv_opts.dhchap_key != NULL) { 5424 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5425 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5426 if (nvme_ctrlr->dhchap_key == NULL) { 5427 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5428 
spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5429 rc = -ENOKEY; 5430 goto err; 5431 } 5432 } 5433 } 5434 5435 path_id = calloc(1, sizeof(*path_id)); 5436 if (path_id == NULL) { 5437 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5438 rc = -ENOMEM; 5439 goto err; 5440 } 5441 5442 path_id->trid = *trid; 5443 if (ctx != NULL) { 5444 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5445 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5446 } 5447 nvme_ctrlr->active_path_id = path_id; 5448 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5449 5450 nvme_ctrlr->thread = spdk_get_thread(); 5451 nvme_ctrlr->ctrlr = ctrlr; 5452 nvme_ctrlr->ref = 1; 5453 5454 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5455 SPDK_ERRLOG("OCSSDs are not supported"); 5456 rc = -ENOTSUP; 5457 goto err; 5458 } 5459 5460 if (ctx != NULL) { 5461 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5462 } else { 5463 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5464 } 5465 5466 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5467 g_opts.nvme_adminq_poll_period_us); 5468 5469 if (g_opts.timeout_us > 0) { 5470 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5471 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5472 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5473 g_opts.timeout_us : g_opts.timeout_admin_us; 5474 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5475 adm_timeout_us, timeout_cb, nvme_ctrlr); 5476 } 5477 5478 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5479 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5480 5481 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5482 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5483 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5484 } 5485 5486 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5487 if (rc != 0) { 5488 goto err; 5489 } 5490 5491 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5492 5493 if (cdata->cmic.ana_reporting) { 5494 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5495 if (rc == 0) { 5496 return 0; 5497 } 5498 } else { 5499 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5500 return 0; 5501 } 5502 5503 err: 5504 nvme_ctrlr_delete(nvme_ctrlr); 5505 return rc; 5506 } 5507 5508 void 5509 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5510 { 5511 opts->prchk_flags = 0; 5512 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5513 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5514 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5515 } 5516 5517 static void 5518 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5519 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5520 { 5521 char *name; 5522 5523 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5524 if (!name) { 5525 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5526 return; 5527 } 5528 5529 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5530 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5531 } else { 5532 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5533 } 5534 5535 free(name); 5536 } 5537 5538 static void 5539 _nvme_ctrlr_destruct(void *ctx) 5540 { 5541 struct nvme_ctrlr *nvme_ctrlr = ctx; 5542 5543 
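/* Tear down in two steps: depopulate every namespace that was added for this controller,
 * then drop the reference taken in nvme_ctrlr_create().
 */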
nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5544 nvme_ctrlr_release(nvme_ctrlr); 5545 } 5546 5547 static int 5548 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5549 { 5550 struct nvme_probe_skip_entry *entry; 5551 5552 /* The controller's destruction was already started */ 5553 if (nvme_ctrlr->destruct) { 5554 return -EALREADY; 5555 } 5556 5557 if (!hotplug && 5558 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5559 entry = calloc(1, sizeof(*entry)); 5560 if (!entry) { 5561 return -ENOMEM; 5562 } 5563 entry->trid = nvme_ctrlr->active_path_id->trid; 5564 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5565 } 5566 5567 nvme_ctrlr->destruct = true; 5568 return 0; 5569 } 5570 5571 static int 5572 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5573 { 5574 int rc; 5575 5576 pthread_mutex_lock(&nvme_ctrlr->mutex); 5577 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5578 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5579 5580 if (rc == 0) { 5581 _nvme_ctrlr_destruct(nvme_ctrlr); 5582 } else if (rc == -EALREADY) { 5583 rc = 0; 5584 } 5585 5586 return rc; 5587 } 5588 5589 static void 5590 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5591 { 5592 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5593 5594 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5595 } 5596 5597 static int 5598 bdev_nvme_hotplug_probe(void *arg) 5599 { 5600 if (g_hotplug_probe_ctx == NULL) { 5601 spdk_poller_unregister(&g_hotplug_probe_poller); 5602 return SPDK_POLLER_IDLE; 5603 } 5604 5605 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5606 g_hotplug_probe_ctx = NULL; 5607 spdk_poller_unregister(&g_hotplug_probe_poller); 5608 } 5609 5610 return SPDK_POLLER_BUSY; 5611 } 5612 5613 static int 5614 bdev_nvme_hotplug(void *arg) 5615 { 5616 struct spdk_nvme_transport_id trid_pcie; 5617 5618 if (g_hotplug_probe_ctx) { 5619 return SPDK_POLLER_BUSY; 5620 } 5621 5622 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5623 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5624 5625 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5626 hotplug_probe_cb, attach_cb, NULL); 5627 5628 if (g_hotplug_probe_ctx) { 5629 assert(g_hotplug_probe_poller == NULL); 5630 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5631 } 5632 5633 return SPDK_POLLER_BUSY; 5634 } 5635 5636 void 5637 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5638 { 5639 *opts = g_opts; 5640 } 5641 5642 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5643 uint32_t reconnect_delay_sec, 5644 uint32_t fast_io_fail_timeout_sec); 5645 5646 static int 5647 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5648 { 5649 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5650 /* Can't set timeout_admin_us without also setting timeout_us */ 5651 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5652 return -EINVAL; 5653 } 5654 5655 if (opts->bdev_retry_count < -1) { 5656 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5657 return -EINVAL; 5658 } 5659 5660 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5661 opts->reconnect_delay_sec, 5662 opts->fast_io_fail_timeout_sec)) { 5663 return -EINVAL; 5664 } 5665 5666 return 0; 5667 } 5668 5669 int 5670 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5671 { 5672 int ret; 5673 5674 ret = 
bdev_nvme_validate_opts(opts); 5675 if (ret) { 5676 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5677 return ret; 5678 } 5679 5680 if (g_bdev_nvme_init_thread != NULL) { 5681 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5682 return -EPERM; 5683 } 5684 } 5685 5686 if (opts->rdma_srq_size != 0 || 5687 opts->rdma_max_cq_size != 0 || 5688 opts->rdma_cm_event_timeout_ms != 0) { 5689 struct spdk_nvme_transport_opts drv_opts; 5690 5691 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5692 if (opts->rdma_srq_size != 0) { 5693 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5694 } 5695 if (opts->rdma_max_cq_size != 0) { 5696 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5697 } 5698 if (opts->rdma_cm_event_timeout_ms != 0) { 5699 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5700 } 5701 5702 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5703 if (ret) { 5704 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5705 return ret; 5706 } 5707 } 5708 5709 g_opts = *opts; 5710 5711 return 0; 5712 } 5713 5714 struct set_nvme_hotplug_ctx { 5715 uint64_t period_us; 5716 bool enabled; 5717 spdk_msg_fn fn; 5718 void *fn_ctx; 5719 }; 5720 5721 static void 5722 set_nvme_hotplug_period_cb(void *_ctx) 5723 { 5724 struct set_nvme_hotplug_ctx *ctx = _ctx; 5725 5726 spdk_poller_unregister(&g_hotplug_poller); 5727 if (ctx->enabled) { 5728 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5729 } 5730 5731 g_nvme_hotplug_poll_period_us = ctx->period_us; 5732 g_nvme_hotplug_enabled = ctx->enabled; 5733 if (ctx->fn) { 5734 ctx->fn(ctx->fn_ctx); 5735 } 5736 5737 free(ctx); 5738 } 5739 5740 int 5741 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5742 { 5743 struct set_nvme_hotplug_ctx *ctx; 5744 5745 if (enabled == true && !spdk_process_is_primary()) { 5746 return -EPERM; 5747 } 5748 5749 ctx = calloc(1, sizeof(*ctx)); 5750 if (ctx == NULL) { 5751 return -ENOMEM; 5752 } 5753 5754 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5755 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5756 ctx->enabled = enabled; 5757 ctx->fn = cb; 5758 ctx->fn_ctx = cb_ctx; 5759 5760 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5761 return 0; 5762 } 5763 5764 static void 5765 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5766 struct nvme_async_probe_ctx *ctx) 5767 { 5768 struct nvme_ns *nvme_ns; 5769 struct nvme_bdev *nvme_bdev; 5770 size_t j; 5771 5772 assert(nvme_ctrlr != NULL); 5773 5774 if (ctx->names == NULL) { 5775 ctx->reported_bdevs = 0; 5776 populate_namespaces_cb(ctx, 0); 5777 return; 5778 } 5779 5780 /* 5781 * Report the new bdevs that were created in this call. 5782 * There can be more than one bdev per NVMe controller. 5783 */ 5784 j = 0; 5785 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5786 while (nvme_ns != NULL) { 5787 nvme_bdev = nvme_ns->bdev; 5788 if (j < ctx->max_bdevs) { 5789 ctx->names[j] = nvme_bdev->disk.name; 5790 j++; 5791 } else { 5792 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. 
Unable to return all names of created bdevs\n", 5793 ctx->max_bdevs); 5794 ctx->reported_bdevs = 0; 5795 populate_namespaces_cb(ctx, -ERANGE); 5796 return; 5797 } 5798 5799 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5800 } 5801 5802 ctx->reported_bdevs = j; 5803 populate_namespaces_cb(ctx, 0); 5804 } 5805 5806 static int 5807 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5808 struct spdk_nvme_ctrlr *new_ctrlr, 5809 struct spdk_nvme_transport_id *trid) 5810 { 5811 struct nvme_path_id *tmp_trid; 5812 5813 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5814 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5815 return -ENOTSUP; 5816 } 5817 5818 /* Currently we only support failover to the same transport type. */ 5819 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5820 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5821 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5822 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5823 return -EINVAL; 5824 } 5825 5826 5827 /* Currently we only support failover to the same NQN. */ 5828 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5829 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5830 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5831 return -EINVAL; 5832 } 5833 5834 /* Skip all the other checks if we've already registered this path. */ 5835 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5836 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5837 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5838 trid->subnqn); 5839 return -EEXIST; 5840 } 5841 } 5842 5843 return 0; 5844 } 5845 5846 static int 5847 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5848 struct spdk_nvme_ctrlr *new_ctrlr) 5849 { 5850 struct nvme_ns *nvme_ns; 5851 struct spdk_nvme_ns *new_ns; 5852 5853 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5854 while (nvme_ns != NULL) { 5855 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5856 assert(new_ns != NULL); 5857 5858 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5859 return -EINVAL; 5860 } 5861 5862 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5863 } 5864 5865 return 0; 5866 } 5867 5868 static int 5869 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5870 struct spdk_nvme_transport_id *trid) 5871 { 5872 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5873 5874 new_trid = calloc(1, sizeof(*new_trid)); 5875 if (new_trid == NULL) { 5876 return -ENOMEM; 5877 } 5878 new_trid->trid = *trid; 5879 5880 active_id = nvme_ctrlr->active_path_id; 5881 assert(active_id != NULL); 5882 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5883 5884 /* Skip the active trid so that it is not replaced until it fails. */ 5885 tmp_trid = TAILQ_NEXT(active_id, link); 5886 if (tmp_trid == NULL) { 5887 goto add_tail; 5888 } 5889 5890 /* A trid is considered failed if its last failed time (last_failed_tsc) is non-zero. 5891 * Insert the new alternate trid before any failed trid.
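 * Never-failed trids therefore stay ahead of failed ones, in registration order, so failover
 * tries paths that have not failed yet first.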
5892 */ 5893 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5894 if (tmp_trid->last_failed_tsc != 0) { 5895 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5896 return 0; 5897 } 5898 } 5899 5900 add_tail: 5901 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5902 return 0; 5903 } 5904 5905 /* This is the case that a secondary path is added to an existing 5906 * nvme_ctrlr for failover. After checking if it can access the same 5907 * namespaces as the primary path, it is disconnected until failover occurs. 5908 */ 5909 static int 5910 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5911 struct spdk_nvme_ctrlr *new_ctrlr, 5912 struct spdk_nvme_transport_id *trid) 5913 { 5914 int rc; 5915 5916 assert(nvme_ctrlr != NULL); 5917 5918 pthread_mutex_lock(&nvme_ctrlr->mutex); 5919 5920 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5921 if (rc != 0) { 5922 goto exit; 5923 } 5924 5925 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5926 if (rc != 0) { 5927 goto exit; 5928 } 5929 5930 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5931 5932 exit: 5933 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5934 5935 spdk_nvme_detach(new_ctrlr); 5936 5937 return rc; 5938 } 5939 5940 static void 5941 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5942 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5943 { 5944 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5945 struct nvme_async_probe_ctx *ctx; 5946 int rc; 5947 5948 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5949 ctx->ctrlr_attached = true; 5950 5951 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5952 if (rc != 0) { 5953 ctx->reported_bdevs = 0; 5954 populate_namespaces_cb(ctx, rc); 5955 } 5956 } 5957 5958 static void 5959 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5960 struct spdk_nvme_ctrlr *ctrlr, 5961 const struct spdk_nvme_ctrlr_opts *opts) 5962 { 5963 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5964 struct nvme_ctrlr *nvme_ctrlr; 5965 struct nvme_async_probe_ctx *ctx; 5966 int rc; 5967 5968 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5969 ctx->ctrlr_attached = true; 5970 5971 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5972 if (nvme_ctrlr) { 5973 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5974 } else { 5975 rc = -ENODEV; 5976 } 5977 5978 ctx->reported_bdevs = 0; 5979 populate_namespaces_cb(ctx, rc); 5980 } 5981 5982 static int 5983 bdev_nvme_async_poll(void *arg) 5984 { 5985 struct nvme_async_probe_ctx *ctx = arg; 5986 int rc; 5987 5988 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5989 if (spdk_unlikely(rc != -EAGAIN)) { 5990 ctx->probe_done = true; 5991 spdk_poller_unregister(&ctx->poller); 5992 if (!ctx->ctrlr_attached) { 5993 /* The probe is done, but no controller was attached. 5994 * That means we had a failure, so report -EIO back to 5995 * the caller (usually the RPC). populate_namespaces_cb() 5996 * will take care of freeing the nvme_async_probe_ctx. 5997 */ 5998 ctx->reported_bdevs = 0; 5999 populate_namespaces_cb(ctx, -EIO); 6000 } else if (ctx->namespaces_populated) { 6001 /* The namespaces for the attached controller were all 6002 * populated and the response was already sent to the 6003 * caller (usually the RPC). So free the context here. 
6004 */ 6005 free_nvme_async_probe_ctx(ctx); 6006 } 6007 } 6008 6009 return SPDK_POLLER_BUSY; 6010 } 6011 6012 static bool 6013 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6014 uint32_t reconnect_delay_sec, 6015 uint32_t fast_io_fail_timeout_sec) 6016 { 6017 if (ctrlr_loss_timeout_sec < -1) { 6018 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6019 return false; 6020 } else if (ctrlr_loss_timeout_sec == -1) { 6021 if (reconnect_delay_sec == 0) { 6022 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6023 return false; 6024 } else if (fast_io_fail_timeout_sec != 0 && 6025 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6026 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6027 return false; 6028 } 6029 } else if (ctrlr_loss_timeout_sec != 0) { 6030 if (reconnect_delay_sec == 0) { 6031 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6032 return false; 6033 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6034 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6035 return false; 6036 } else if (fast_io_fail_timeout_sec != 0) { 6037 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6038 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6039 return false; 6040 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6041 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6042 return false; 6043 } 6044 } 6045 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6046 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6047 return false; 6048 } 6049 6050 return true; 6051 } 6052 6053 static int 6054 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 6055 { 6056 FILE *psk_file; 6057 struct stat statbuf; 6058 int rc; 6059 #define TCP_PSK_INVALID_PERMISSIONS 0177 6060 6061 if (stat(fname, &statbuf) != 0) { 6062 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 6063 return -EACCES; 6064 } 6065 6066 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 6067 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 6068 return -EPERM; 6069 } 6070 if ((size_t)statbuf.st_size >= bufsz) { 6071 SPDK_ERRLOG("Invalid PSK: too long\n"); 6072 return -EINVAL; 6073 } 6074 psk_file = fopen(fname, "r"); 6075 if (psk_file == NULL) { 6076 SPDK_ERRLOG("Could not open PSK file\n"); 6077 return -EINVAL; 6078 } 6079 6080 memset(buf, 0, bufsz); 6081 rc = fread(buf, 1, statbuf.st_size, psk_file); 6082 if (rc != statbuf.st_size) { 6083 SPDK_ERRLOG("Failed to read PSK\n"); 6084 fclose(psk_file); 6085 return -EINVAL; 6086 } 6087 6088 fclose(psk_file); 6089 return 0; 6090 } 6091 6092 int 6093 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6094 const char *base_name, 6095 const char **names, 6096 uint32_t count, 6097 spdk_bdev_create_nvme_fn cb_fn, 6098 void *cb_ctx, 6099 struct spdk_nvme_ctrlr_opts *drv_opts, 6100 struct nvme_ctrlr_opts *bdev_opts, 6101 bool multipath) 6102 { 6103 struct nvme_probe_skip_entry *entry, *tmp; 6104 struct nvme_async_probe_ctx *ctx; 6105 spdk_nvme_attach_cb attach_cb; 6106 int rc, len; 6107 6108 /* TODO expand this check to include both the host and target TRIDs. 6109 * Only if both are the same should we fail.
6110 */ 6111 if (nvme_ctrlr_get(trid) != NULL) { 6112 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6113 return -EEXIST; 6114 } 6115 6116 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6117 6118 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6119 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6120 return -EINVAL; 6121 } 6122 6123 if (bdev_opts != NULL && 6124 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6125 bdev_opts->reconnect_delay_sec, 6126 bdev_opts->fast_io_fail_timeout_sec)) { 6127 return -EINVAL; 6128 } 6129 6130 ctx = calloc(1, sizeof(*ctx)); 6131 if (!ctx) { 6132 return -ENOMEM; 6133 } 6134 ctx->base_name = base_name; 6135 ctx->names = names; 6136 ctx->max_bdevs = count; 6137 ctx->cb_fn = cb_fn; 6138 ctx->cb_ctx = cb_ctx; 6139 ctx->trid = *trid; 6140 6141 if (bdev_opts) { 6142 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6143 } else { 6144 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6145 } 6146 6147 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6148 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6149 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6150 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6151 free(entry); 6152 break; 6153 } 6154 } 6155 } 6156 6157 if (drv_opts) { 6158 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6159 } else { 6160 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6161 } 6162 6163 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6164 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6165 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6166 ctx->drv_opts.disable_read_ana_log_page = true; 6167 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6168 6169 if (ctx->bdev_opts.psk[0] != '\0') { 6170 /* Try to use the keyring first */ 6171 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6172 if (ctx->drv_opts.tls_psk == NULL) { 6173 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6174 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6175 if (rc != 0) { 6176 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6177 free_nvme_async_probe_ctx(ctx); 6178 return rc; 6179 } 6180 } 6181 } 6182 6183 if (ctx->bdev_opts.dhchap_key != NULL) { 6184 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6185 if (ctx->drv_opts.dhchap_key == NULL) { 6186 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6187 ctx->bdev_opts.dhchap_key); 6188 free_nvme_async_probe_ctx(ctx); 6189 return -ENOKEY; 6190 } 6191 6192 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6193 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6194 } 6195 6196 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6197 attach_cb = connect_attach_cb; 6198 } else { 6199 attach_cb = connect_set_failover_cb; 6200 } 6201 6202 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6203 if (ctx->probe_ctx == NULL) { 6204 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6205 free_nvme_async_probe_ctx(ctx); 6206 return -ENODEV; 6207 } 6208 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6209 6210 return 0; 6211 } 6212 6213 struct bdev_nvme_delete_ctx { 6214 char *name; 6215 struct nvme_path_id path_id; 6216 bdev_nvme_delete_done_fn delete_done; 6217 void *delete_done_ctx; 
6218 uint64_t timeout_ticks; 6219 struct spdk_poller *poller; 6220 }; 6221 6222 static void 6223 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6224 { 6225 if (ctx != NULL) { 6226 free(ctx->name); 6227 free(ctx); 6228 } 6229 } 6230 6231 static bool 6232 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6233 { 6234 if (path_id->trid.trtype != 0) { 6235 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6236 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6237 return false; 6238 } 6239 } else { 6240 if (path_id->trid.trtype != p->trid.trtype) { 6241 return false; 6242 } 6243 } 6244 } 6245 6246 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6247 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6248 return false; 6249 } 6250 } 6251 6252 if (path_id->trid.adrfam != 0) { 6253 if (path_id->trid.adrfam != p->trid.adrfam) { 6254 return false; 6255 } 6256 } 6257 6258 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6259 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6260 return false; 6261 } 6262 } 6263 6264 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6265 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6266 return false; 6267 } 6268 } 6269 6270 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6271 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6272 return false; 6273 } 6274 } 6275 6276 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6277 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6278 return false; 6279 } 6280 } 6281 6282 return true; 6283 } 6284 6285 static bool 6286 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6287 { 6288 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6289 struct nvme_ctrlr *ctrlr; 6290 struct nvme_path_id *p; 6291 6292 pthread_mutex_lock(&g_bdev_nvme_mutex); 6293 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6294 if (!nbdev_ctrlr) { 6295 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6296 return false; 6297 } 6298 6299 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6300 pthread_mutex_lock(&ctrlr->mutex); 6301 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6302 if (nvme_path_id_compare(p, path_id)) { 6303 pthread_mutex_unlock(&ctrlr->mutex); 6304 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6305 return true; 6306 } 6307 } 6308 pthread_mutex_unlock(&ctrlr->mutex); 6309 } 6310 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6311 6312 return false; 6313 } 6314 6315 static int 6316 bdev_nvme_delete_complete_poll(void *arg) 6317 { 6318 struct bdev_nvme_delete_ctx *ctx = arg; 6319 int rc = 0; 6320 6321 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6322 if (ctx->timeout_ticks > spdk_get_ticks()) { 6323 return SPDK_POLLER_BUSY; 6324 } 6325 6326 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6327 rc = -ETIMEDOUT; 6328 } 6329 6330 spdk_poller_unregister(&ctx->poller); 6331 6332 ctx->delete_done(ctx->delete_done_ctx, rc); 6333 free_bdev_nvme_delete_ctx(ctx); 6334 6335 return SPDK_POLLER_BUSY; 6336 } 6337 6338 static int 6339 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6340 { 6341 struct nvme_path_id *p, *t; 6342 spdk_msg_fn msg_fn; 6343 int rc = -ENXIO; 6344 6345 pthread_mutex_lock(&nvme_ctrlr->mutex); 6346 6347 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6348 if (p == 
TAILQ_FIRST(&nvme_ctrlr->trids)) { 6349 break; 6350 } 6351 6352 if (!nvme_path_id_compare(p, path_id)) { 6353 continue; 6354 } 6355 6356 /* We are not using the specified path. */ 6357 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6358 free(p); 6359 rc = 0; 6360 } 6361 6362 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6363 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6364 return rc; 6365 } 6366 6367 /* If we made it here, then this path is a match! Now we need to remove it. */ 6368 6369 /* This is the active path in use right now. The active path is always the first in the list. */ 6370 assert(p == nvme_ctrlr->active_path_id); 6371 6372 if (!TAILQ_NEXT(p, link)) { 6373 /* The current path is the only path. */ 6374 msg_fn = _nvme_ctrlr_destruct; 6375 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6376 } else { 6377 /* There is an alternative path. */ 6378 msg_fn = _bdev_nvme_reset_ctrlr; 6379 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6380 } 6381 6382 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6383 6384 if (rc == 0) { 6385 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6386 } else if (rc == -EALREADY) { 6387 rc = 0; 6388 } 6389 6390 return rc; 6391 } 6392 6393 int 6394 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6395 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6396 { 6397 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6398 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6399 struct bdev_nvme_delete_ctx *ctx = NULL; 6400 int rc = -ENXIO, _rc; 6401 6402 if (name == NULL || path_id == NULL) { 6403 rc = -EINVAL; 6404 goto exit; 6405 } 6406 6407 pthread_mutex_lock(&g_bdev_nvme_mutex); 6408 6409 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6410 if (nbdev_ctrlr == NULL) { 6411 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6412 6413 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6414 rc = -ENODEV; 6415 goto exit; 6416 } 6417 6418 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6419 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6420 if (_rc < 0 && _rc != -ENXIO) { 6421 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6422 rc = _rc; 6423 goto exit; 6424 } else if (_rc == 0) { 6425 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6426 * was deleted successfully. To remember the successful deletion, 6427 * overwrite rc only if _rc is zero. 6428 */ 6429 rc = 0; 6430 } 6431 } 6432 6433 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6434 6435 if (rc != 0 || delete_done == NULL) { 6436 goto exit; 6437 } 6438 6439 ctx = calloc(1, sizeof(*ctx)); 6440 if (ctx == NULL) { 6441 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6442 rc = -ENOMEM; 6443 goto exit; 6444 } 6445 6446 ctx->name = strdup(name); 6447 if (ctx->name == NULL) { 6448 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6449 rc = -ENOMEM; 6450 goto exit; 6451 } 6452 6453 ctx->delete_done = delete_done; 6454 ctx->delete_done_ctx = delete_done_ctx; 6455 ctx->path_id = *path_id; 6456 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6457 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6458 if (ctx->poller == NULL) { 6459 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6460 rc = -ENOMEM; 6461 goto exit; 6462 } 6463 6464 exit: 6465 if (rc != 0) { 6466 free_bdev_nvme_delete_ctx(ctx); 6467 } 6468 6469 return rc; 6470 } 6471 6472 #define DISCOVERY_INFOLOG(ctx, format, ...) 
\ 6473 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6474 6475 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6476 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6477 6478 struct discovery_entry_ctx { 6479 char name[128]; 6480 struct spdk_nvme_transport_id trid; 6481 struct spdk_nvme_ctrlr_opts drv_opts; 6482 struct spdk_nvmf_discovery_log_page_entry entry; 6483 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6484 struct discovery_ctx *ctx; 6485 }; 6486 6487 struct discovery_ctx { 6488 char *name; 6489 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6490 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6491 void *cb_ctx; 6492 struct spdk_nvme_probe_ctx *probe_ctx; 6493 struct spdk_nvme_detach_ctx *detach_ctx; 6494 struct spdk_nvme_ctrlr *ctrlr; 6495 struct spdk_nvme_transport_id trid; 6496 struct discovery_entry_ctx *entry_ctx_in_use; 6497 struct spdk_poller *poller; 6498 struct spdk_nvme_ctrlr_opts drv_opts; 6499 struct nvme_ctrlr_opts bdev_opts; 6500 struct spdk_nvmf_discovery_log_page *log_page; 6501 TAILQ_ENTRY(discovery_ctx) tailq; 6502 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6503 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6504 int rc; 6505 bool wait_for_attach; 6506 uint64_t timeout_ticks; 6507 /* Denotes that the discovery service is being started. We're waiting 6508 * for the initial connection to the discovery controller to be 6509 * established and attach discovered NVM ctrlrs. 6510 */ 6511 bool initializing; 6512 /* Denotes if a discovery is currently in progress for this context. 6513 * That includes connecting to newly discovered subsystems. Used to 6514 * ensure we do not start a new discovery until an existing one is 6515 * complete. 6516 */ 6517 bool in_progress; 6518 6519 /* Denotes if another discovery is needed after the one in progress 6520 * completes. Set when we receive an AER completion while a discovery 6521 * is already in progress. 6522 */ 6523 bool pending; 6524 6525 /* Signal to the discovery context poller that it should stop the 6526 * discovery service, including detaching from the current discovery 6527 * controller. 6528 */ 6529 bool stop; 6530 6531 struct spdk_thread *calling_thread; 6532 uint32_t index; 6533 uint32_t attach_in_progress; 6534 char *hostnqn; 6535 6536 /* Denotes if the discovery service was started by the mdns discovery. 
6537 */ 6538 bool from_mdns_discovery_service; 6539 }; 6540 6541 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6542 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6543 6544 static void get_discovery_log_page(struct discovery_ctx *ctx); 6545 6546 static void 6547 free_discovery_ctx(struct discovery_ctx *ctx) 6548 { 6549 free(ctx->log_page); 6550 free(ctx->hostnqn); 6551 free(ctx->name); 6552 free(ctx); 6553 } 6554 6555 static void 6556 discovery_complete(struct discovery_ctx *ctx) 6557 { 6558 ctx->initializing = false; 6559 ctx->in_progress = false; 6560 if (ctx->pending) { 6561 ctx->pending = false; 6562 get_discovery_log_page(ctx); 6563 } 6564 } 6565 6566 static void 6567 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6568 struct spdk_nvmf_discovery_log_page_entry *entry) 6569 { 6570 char *space; 6571 6572 trid->trtype = entry->trtype; 6573 trid->adrfam = entry->adrfam; 6574 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6575 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6576 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6577 * before call to this function trid->subnqn is zeroed out, we need 6578 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6579 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6580 */ 6581 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6582 6583 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6584 * But the log page entries typically pad them with spaces, not zeroes. 6585 * So add a NULL terminator to each of these fields at the appropriate 6586 * location. 6587 */ 6588 space = strchr(trid->traddr, ' '); 6589 if (space) { 6590 *space = 0; 6591 } 6592 space = strchr(trid->trsvcid, ' '); 6593 if (space) { 6594 *space = 0; 6595 } 6596 space = strchr(trid->subnqn, ' '); 6597 if (space) { 6598 *space = 0; 6599 } 6600 } 6601 6602 static void 6603 _stop_discovery(void *_ctx) 6604 { 6605 struct discovery_ctx *ctx = _ctx; 6606 6607 if (ctx->attach_in_progress > 0) { 6608 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6609 return; 6610 } 6611 6612 ctx->stop = true; 6613 6614 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6615 struct discovery_entry_ctx *entry_ctx; 6616 struct nvme_path_id path = {}; 6617 6618 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6619 path.trid = entry_ctx->trid; 6620 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6621 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6622 free(entry_ctx); 6623 } 6624 6625 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6626 struct discovery_entry_ctx *entry_ctx; 6627 6628 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6629 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6630 free(entry_ctx); 6631 } 6632 6633 free(ctx->entry_ctx_in_use); 6634 ctx->entry_ctx_in_use = NULL; 6635 } 6636 6637 static void 6638 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6639 { 6640 ctx->stop_cb_fn = cb_fn; 6641 ctx->cb_ctx = cb_ctx; 6642 6643 if (ctx->attach_in_progress > 0) { 6644 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6645 ctx->attach_in_progress); 6646 } 6647 6648 _stop_discovery(ctx); 6649 } 6650 6651 static void 6652 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6653 { 6654 struct discovery_ctx *d_ctx; 6655 struct nvme_path_id *path_id; 6656 struct spdk_nvme_transport_id 
trid = {}; 6657 struct discovery_entry_ctx *entry_ctx, *tmp; 6658 6659 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6660 6661 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6662 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6663 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6664 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6665 continue; 6666 } 6667 6668 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6669 free(entry_ctx); 6670 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6671 trid.subnqn, trid.traddr, trid.trsvcid); 6672 6673 /* Fail discovery ctrlr to force reattach attempt */ 6674 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6675 } 6676 } 6677 } 6678 6679 static void 6680 discovery_remove_controllers(struct discovery_ctx *ctx) 6681 { 6682 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6683 struct discovery_entry_ctx *entry_ctx, *tmp; 6684 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6685 struct spdk_nvme_transport_id old_trid = {}; 6686 uint64_t numrec, i; 6687 bool found; 6688 6689 numrec = from_le64(&log_page->numrec); 6690 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6691 found = false; 6692 old_entry = &entry_ctx->entry; 6693 build_trid_from_log_page_entry(&old_trid, old_entry); 6694 for (i = 0; i < numrec; i++) { 6695 new_entry = &log_page->entries[i]; 6696 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6697 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6698 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6699 found = true; 6700 break; 6701 } 6702 } 6703 if (!found) { 6704 struct nvme_path_id path = {}; 6705 6706 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6707 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6708 6709 path.trid = entry_ctx->trid; 6710 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6711 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6712 free(entry_ctx); 6713 } 6714 } 6715 free(log_page); 6716 ctx->log_page = NULL; 6717 discovery_complete(ctx); 6718 } 6719 6720 static void 6721 complete_discovery_start(struct discovery_ctx *ctx, int status) 6722 { 6723 ctx->timeout_ticks = 0; 6724 ctx->rc = status; 6725 if (ctx->start_cb_fn) { 6726 ctx->start_cb_fn(ctx->cb_ctx, status); 6727 ctx->start_cb_fn = NULL; 6728 ctx->cb_ctx = NULL; 6729 } 6730 } 6731 6732 static void 6733 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6734 { 6735 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6736 struct discovery_ctx *ctx = entry_ctx->ctx; 6737 6738 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6739 ctx->attach_in_progress--; 6740 if (ctx->attach_in_progress == 0) { 6741 complete_discovery_start(ctx, ctx->rc); 6742 if (ctx->initializing && ctx->rc != 0) { 6743 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6744 stop_discovery(ctx, NULL, ctx->cb_ctx); 6745 } else { 6746 discovery_remove_controllers(ctx); 6747 } 6748 } 6749 } 6750 6751 static struct discovery_entry_ctx * 6752 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6753 { 6754 struct discovery_entry_ctx *new_ctx; 6755 6756 new_ctx = calloc(1, sizeof(*new_ctx)); 6757 if (new_ctx == NULL) { 6758 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6759 return NULL; 6760 } 6761 6762 new_ctx->ctx = ctx; 6763 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6764 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
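/* Start from the default controller options, then carry over the hostnqn configured for
 * this discovery service so that connections made to this entry present the same host identity.
 */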
6765 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6766 return new_ctx; 6767 } 6768 6769 static void 6770 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6771 struct spdk_nvmf_discovery_log_page *log_page) 6772 { 6773 struct discovery_ctx *ctx = cb_arg; 6774 struct discovery_entry_ctx *entry_ctx, *tmp; 6775 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6776 uint64_t numrec, i; 6777 bool found; 6778 6779 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6780 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6781 return; 6782 } 6783 6784 ctx->log_page = log_page; 6785 assert(ctx->attach_in_progress == 0); 6786 numrec = from_le64(&log_page->numrec); 6787 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6788 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6789 free(entry_ctx); 6790 } 6791 for (i = 0; i < numrec; i++) { 6792 found = false; 6793 new_entry = &log_page->entries[i]; 6794 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6795 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6796 struct discovery_entry_ctx *new_ctx; 6797 struct spdk_nvme_transport_id trid = {}; 6798 6799 build_trid_from_log_page_entry(&trid, new_entry); 6800 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6801 if (new_ctx == NULL) { 6802 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6803 break; 6804 } 6805 6806 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6807 continue; 6808 } 6809 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6810 old_entry = &entry_ctx->entry; 6811 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6812 found = true; 6813 break; 6814 } 6815 } 6816 if (!found) { 6817 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6818 struct discovery_ctx *d_ctx; 6819 6820 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6821 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6822 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6823 sizeof(new_entry->subnqn))) { 6824 break; 6825 } 6826 } 6827 if (subnqn_ctx) { 6828 break; 6829 } 6830 } 6831 6832 new_ctx = calloc(1, sizeof(*new_ctx)); 6833 if (new_ctx == NULL) { 6834 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6835 break; 6836 } 6837 6838 new_ctx->ctx = ctx; 6839 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6840 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6841 if (subnqn_ctx) { 6842 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6843 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6844 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6845 new_ctx->name); 6846 } else { 6847 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6848 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6849 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6850 new_ctx->name); 6851 } 6852 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6853 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6854 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6855 discovery_attach_controller_done, new_ctx, 6856 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6857 if (rc == 0) { 6858 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6859 ctx->attach_in_progress++; 6860 } else { 6861 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6862 } 6863 } 6864 } 6865 6866 if (ctx->attach_in_progress == 0) { 6867 discovery_remove_controllers(ctx); 6868 } 6869 } 6870 6871 static void 6872 get_discovery_log_page(struct discovery_ctx *ctx) 6873 { 6874 int rc; 6875 6876 assert(ctx->in_progress == false); 6877 ctx->in_progress = true; 6878 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6879 if (rc != 0) { 6880 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6881 } 6882 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6883 } 6884 6885 static void 6886 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6887 { 6888 struct discovery_ctx *ctx = arg; 6889 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6890 6891 if (spdk_nvme_cpl_is_error(cpl)) { 6892 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6893 return; 6894 } 6895 6896 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6897 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6898 return; 6899 } 6900 6901 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6902 if (ctx->in_progress) { 6903 ctx->pending = true; 6904 return; 6905 } 6906 6907 get_discovery_log_page(ctx); 6908 } 6909 6910 static void 6911 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6912 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6913 { 6914 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6915 struct discovery_ctx *ctx; 6916 6917 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6918 6919 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6920 ctx->probe_ctx = NULL; 6921 ctx->ctrlr = ctrlr; 6922 6923 if (ctx->rc != 0) { 6924 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6925 ctx->rc); 6926 return; 6927 } 6928 6929 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6930 } 6931 6932 static int 6933 discovery_poller(void *arg) 6934 { 6935 struct discovery_ctx *ctx = arg; 6936 struct spdk_nvme_transport_id *trid; 6937 int rc; 6938 6939 if (ctx->detach_ctx) { 6940 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6941 if (rc != -EAGAIN) { 6942 ctx->detach_ctx = NULL; 6943 ctx->ctrlr = NULL; 6944 } 6945 } else if (ctx->stop) { 6946 if (ctx->ctrlr != NULL) { 6947 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6948 if (rc == 0) { 6949 return SPDK_POLLER_BUSY; 6950 } 6951 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6952 } 6953 spdk_poller_unregister(&ctx->poller); 6954 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6955 assert(ctx->start_cb_fn == NULL); 6956 if (ctx->stop_cb_fn != NULL) { 6957 ctx->stop_cb_fn(ctx->cb_ctx); 6958 } 6959 free_discovery_ctx(ctx); 6960 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6961 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6962 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6963 assert(ctx->initializing); 6964 spdk_poller_unregister(&ctx->poller); 6965 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6966 complete_discovery_start(ctx, -ETIMEDOUT); 6967 stop_discovery(ctx, NULL, NULL); 6968 free_discovery_ctx(ctx); 6969 return SPDK_POLLER_BUSY; 6970 } 6971 6972 assert(ctx->entry_ctx_in_use == NULL); 6973 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6974 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6975 trid = &ctx->entry_ctx_in_use->trid; 6976 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6977 if 
(ctx->probe_ctx) { 6978 spdk_poller_unregister(&ctx->poller); 6979 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6980 } else { 6981 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6982 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6983 ctx->entry_ctx_in_use = NULL; 6984 } 6985 } else if (ctx->probe_ctx) { 6986 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6987 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6988 complete_discovery_start(ctx, -ETIMEDOUT); 6989 return SPDK_POLLER_BUSY; 6990 } 6991 6992 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6993 if (rc != -EAGAIN) { 6994 if (ctx->rc != 0) { 6995 assert(ctx->initializing); 6996 stop_discovery(ctx, NULL, ctx->cb_ctx); 6997 } else { 6998 assert(rc == 0); 6999 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7000 ctx->rc = rc; 7001 get_discovery_log_page(ctx); 7002 } 7003 } 7004 } else { 7005 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7006 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7007 complete_discovery_start(ctx, -ETIMEDOUT); 7008 /* We need to wait until all NVM ctrlrs are attached before we stop the 7009 * discovery service to make sure we don't detach a ctrlr that is still 7010 * being attached. 7011 */ 7012 if (ctx->attach_in_progress == 0) { 7013 stop_discovery(ctx, NULL, ctx->cb_ctx); 7014 return SPDK_POLLER_BUSY; 7015 } 7016 } 7017 7018 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7019 if (rc < 0) { 7020 spdk_poller_unregister(&ctx->poller); 7021 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7022 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7023 ctx->entry_ctx_in_use = NULL; 7024 7025 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7026 if (rc != 0) { 7027 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7028 ctx->ctrlr = NULL; 7029 } 7030 } 7031 } 7032 7033 return SPDK_POLLER_BUSY; 7034 } 7035 7036 static void 7037 start_discovery_poller(void *arg) 7038 { 7039 struct discovery_ctx *ctx = arg; 7040 7041 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7042 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7043 } 7044 7045 int 7046 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7047 const char *base_name, 7048 struct spdk_nvme_ctrlr_opts *drv_opts, 7049 struct nvme_ctrlr_opts *bdev_opts, 7050 uint64_t attach_timeout, 7051 bool from_mdns, 7052 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7053 { 7054 struct discovery_ctx *ctx; 7055 struct discovery_entry_ctx *discovery_entry_ctx; 7056 7057 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7058 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7059 if (strcmp(ctx->name, base_name) == 0) { 7060 return -EEXIST; 7061 } 7062 7063 if (ctx->entry_ctx_in_use != NULL) { 7064 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7065 return -EEXIST; 7066 } 7067 } 7068 7069 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7070 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7071 return -EEXIST; 7072 } 7073 } 7074 } 7075 7076 ctx = calloc(1, sizeof(*ctx)); 7077 if (ctx == NULL) { 7078 return -ENOMEM; 7079 } 7080 7081 ctx->name = strdup(base_name); 7082 if (ctx->name == NULL) { 7083 free_discovery_ctx(ctx); 7084 return -ENOMEM; 7085 } 7086 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
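/* Keep private copies of the caller's options; from_discovery_service is forced to true below
 * so controllers created through this context can be identified as coming from a discovery service.
 */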
7087 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7088 ctx->from_mdns_discovery_service = from_mdns; 7089 ctx->bdev_opts.from_discovery_service = true; 7090 ctx->calling_thread = spdk_get_thread(); 7091 ctx->start_cb_fn = cb_fn; 7092 ctx->cb_ctx = cb_ctx; 7093 ctx->initializing = true; 7094 if (ctx->start_cb_fn) { 7095 /* We can use this when dumping json to denote if this RPC parameter 7096 * was specified or not. 7097 */ 7098 ctx->wait_for_attach = true; 7099 } 7100 if (attach_timeout != 0) { 7101 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7102 spdk_get_ticks_hz() / 1000ull; 7103 } 7104 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7105 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7106 memcpy(&ctx->trid, trid, sizeof(*trid)); 7107 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7108 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7109 if (ctx->hostnqn == NULL) { 7110 free_discovery_ctx(ctx); 7111 return -ENOMEM; 7112 } 7113 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7114 if (discovery_entry_ctx == NULL) { 7115 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7116 free_discovery_ctx(ctx); 7117 return -ENOMEM; 7118 } 7119 7120 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7121 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7122 return 0; 7123 } 7124 7125 int 7126 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7127 { 7128 struct discovery_ctx *ctx; 7129 7130 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7131 if (strcmp(name, ctx->name) == 0) { 7132 if (ctx->stop) { 7133 return -EALREADY; 7134 } 7135 /* If we're still starting the discovery service and ->rc is non-zero, we're 7136 * going to stop it as soon as we can 7137 */ 7138 if (ctx->initializing && ctx->rc != 0) { 7139 return -EALREADY; 7140 } 7141 stop_discovery(ctx, cb_fn, cb_ctx); 7142 return 0; 7143 } 7144 } 7145 7146 return -ENOENT; 7147 } 7148 7149 static int 7150 bdev_nvme_library_init(void) 7151 { 7152 g_bdev_nvme_init_thread = spdk_get_thread(); 7153 7154 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7155 bdev_nvme_destroy_poll_group_cb, 7156 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7157 7158 return 0; 7159 } 7160 7161 static void 7162 bdev_nvme_fini_destruct_ctrlrs(void) 7163 { 7164 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7165 struct nvme_ctrlr *nvme_ctrlr; 7166 7167 pthread_mutex_lock(&g_bdev_nvme_mutex); 7168 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7169 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7170 pthread_mutex_lock(&nvme_ctrlr->mutex); 7171 if (nvme_ctrlr->destruct) { 7172 /* This controller's destruction was already started 7173 * before the application started shutting down 7174 */ 7175 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7176 continue; 7177 } 7178 nvme_ctrlr->destruct = true; 7179 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7180 7181 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7182 nvme_ctrlr); 7183 } 7184 } 7185 7186 g_bdev_nvme_module_finish = true; 7187 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7188 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7189 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7190 spdk_bdev_module_fini_done(); 7191 return; 7192 } 7193 7194 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7195 } 7196 7197 static void 7198 check_discovery_fini(void *arg) 7199 { 7200 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7201 bdev_nvme_fini_destruct_ctrlrs(); 
7202 } 7203 } 7204 7205 static void 7206 bdev_nvme_library_fini(void) 7207 { 7208 struct nvme_probe_skip_entry *entry, *entry_tmp; 7209 struct discovery_ctx *ctx; 7210 7211 spdk_poller_unregister(&g_hotplug_poller); 7212 free(g_hotplug_probe_ctx); 7213 g_hotplug_probe_ctx = NULL; 7214 7215 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7216 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7217 free(entry); 7218 } 7219 7220 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7221 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7222 bdev_nvme_fini_destruct_ctrlrs(); 7223 } else { 7224 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7225 stop_discovery(ctx, check_discovery_fini, NULL); 7226 } 7227 } 7228 } 7229 7230 static void 7231 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7232 { 7233 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7234 struct spdk_bdev *bdev = bdev_io->bdev; 7235 struct spdk_dif_ctx dif_ctx; 7236 struct spdk_dif_error err_blk = {}; 7237 int rc; 7238 struct spdk_dif_ctx_init_ext_opts dif_opts; 7239 7240 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7241 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7242 rc = spdk_dif_ctx_init(&dif_ctx, 7243 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7244 bdev->dif_is_head_of_md, bdev->dif_type, 7245 bdev_io->u.bdev.dif_check_flags, 7246 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7247 if (rc != 0) { 7248 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7249 return; 7250 } 7251 7252 if (bdev->md_interleave) { 7253 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7254 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7255 } else { 7256 struct iovec md_iov = { 7257 .iov_base = bdev_io->u.bdev.md_buf, 7258 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7259 }; 7260 7261 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7262 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7263 } 7264 7265 if (rc != 0) { 7266 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7267 err_blk.err_type, err_blk.err_offset); 7268 } else { 7269 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7270 } 7271 } 7272 7273 static void 7274 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7275 { 7276 struct nvme_bdev_io *bio = ref; 7277 7278 if (spdk_nvme_cpl_is_success(cpl)) { 7279 /* Run PI verification for read data buffer. */ 7280 bdev_nvme_verify_pi_error(bio); 7281 } 7282 7283 /* Return original completion status */ 7284 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7285 } 7286 7287 static void 7288 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7289 { 7290 struct nvme_bdev_io *bio = ref; 7291 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7292 int ret; 7293 7294 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7295 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7296 cpl->status.sct, cpl->status.sc); 7297 7298 /* Save completion status to use after verifying PI error. */ 7299 bio->cpl = *cpl; 7300 7301 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7302 /* Read without PI checking to verify PI error. 
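 * The original completion status was saved to bio->cpl above; bdev_nvme_no_pi_readv_done()
 * reports that saved status back to the bdev layer once this verification read completes.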
*/ 7303 ret = bdev_nvme_no_pi_readv(bio, 7304 bdev_io->u.bdev.iovs, 7305 bdev_io->u.bdev.iovcnt, 7306 bdev_io->u.bdev.md_buf, 7307 bdev_io->u.bdev.num_blocks, 7308 bdev_io->u.bdev.offset_blocks); 7309 if (ret == 0) { 7310 return; 7311 } 7312 } 7313 } 7314 7315 bdev_nvme_io_complete_nvme_status(bio, cpl); 7316 } 7317 7318 static void 7319 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7320 { 7321 struct nvme_bdev_io *bio = ref; 7322 7323 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7324 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7325 cpl->status.sct, cpl->status.sc); 7326 /* Run PI verification for write data buffer if PI error is detected. */ 7327 bdev_nvme_verify_pi_error(bio); 7328 } 7329 7330 bdev_nvme_io_complete_nvme_status(bio, cpl); 7331 } 7332 7333 static void 7334 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7335 { 7336 struct nvme_bdev_io *bio = ref; 7337 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7338 7339 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7340 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7341 */ 7342 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7343 7344 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7345 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7346 cpl->status.sct, cpl->status.sc); 7347 /* Run PI verification for zone append data buffer if PI error is detected. */ 7348 bdev_nvme_verify_pi_error(bio); 7349 } 7350 7351 bdev_nvme_io_complete_nvme_status(bio, cpl); 7352 } 7353 7354 static void 7355 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7356 { 7357 struct nvme_bdev_io *bio = ref; 7358 7359 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7360 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7361 cpl->status.sct, cpl->status.sc); 7362 /* Run PI verification for compare data buffer if PI error is detected. */ 7363 bdev_nvme_verify_pi_error(bio); 7364 } 7365 7366 bdev_nvme_io_complete_nvme_status(bio, cpl); 7367 } 7368 7369 static void 7370 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7371 { 7372 struct nvme_bdev_io *bio = ref; 7373 7374 /* Compare operation completion */ 7375 if (!bio->first_fused_completed) { 7376 /* Save compare result for write callback */ 7377 bio->cpl = *cpl; 7378 bio->first_fused_completed = true; 7379 return; 7380 } 7381 7382 /* Write operation completion */ 7383 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7384 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7385 * complete the IO with the compare operation's status. 
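 * The write half of a fused compare-and-write is not expected to succeed when its compare
 * failed, so an unexpected success below is only logged.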
7386 */ 7387 if (!spdk_nvme_cpl_is_error(cpl)) { 7388 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7389 } 7390 7391 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7392 } else { 7393 bdev_nvme_io_complete_nvme_status(bio, cpl); 7394 } 7395 } 7396 7397 static void 7398 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7399 { 7400 struct nvme_bdev_io *bio = ref; 7401 7402 bdev_nvme_io_complete_nvme_status(bio, cpl); 7403 } 7404 7405 static int 7406 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7407 { 7408 switch (desc->zt) { 7409 case SPDK_NVME_ZONE_TYPE_SEQWR: 7410 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7411 break; 7412 default: 7413 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7414 return -EIO; 7415 } 7416 7417 switch (desc->zs) { 7418 case SPDK_NVME_ZONE_STATE_EMPTY: 7419 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7420 break; 7421 case SPDK_NVME_ZONE_STATE_IOPEN: 7422 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7423 break; 7424 case SPDK_NVME_ZONE_STATE_EOPEN: 7425 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7426 break; 7427 case SPDK_NVME_ZONE_STATE_CLOSED: 7428 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7429 break; 7430 case SPDK_NVME_ZONE_STATE_RONLY: 7431 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7432 break; 7433 case SPDK_NVME_ZONE_STATE_FULL: 7434 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7435 break; 7436 case SPDK_NVME_ZONE_STATE_OFFLINE: 7437 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7438 break; 7439 default: 7440 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7441 return -EIO; 7442 } 7443 7444 info->zone_id = desc->zslba; 7445 info->write_pointer = desc->wp; 7446 info->capacity = desc->zcap; 7447 7448 return 0; 7449 } 7450 7451 static void 7452 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7453 { 7454 struct nvme_bdev_io *bio = ref; 7455 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7456 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7457 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7458 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7459 uint64_t max_zones_per_buf, i; 7460 uint32_t zone_report_bufsize; 7461 struct spdk_nvme_ns *ns; 7462 struct spdk_nvme_qpair *qpair; 7463 int ret; 7464 7465 if (spdk_nvme_cpl_is_error(cpl)) { 7466 goto out_complete_io_nvme_cpl; 7467 } 7468 7469 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7470 ret = -ENXIO; 7471 goto out_complete_io_ret; 7472 } 7473 7474 ns = bio->io_path->nvme_ns->ns; 7475 qpair = bio->io_path->qpair->qpair; 7476 7477 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7478 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7479 sizeof(bio->zone_report_buf->descs[0]); 7480 7481 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7482 ret = -EINVAL; 7483 goto out_complete_io_ret; 7484 } 7485 7486 if (!bio->zone_report_buf->nr_zones) { 7487 ret = -EINVAL; 7488 goto out_complete_io_ret; 7489 } 7490 7491 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7492 ret = fill_zone_from_report(&info[bio->handled_zones], 7493 &bio->zone_report_buf->descs[i]); 7494 if (ret) { 7495 goto out_complete_io_ret; 7496 } 7497 bio->handled_zones++; 7498 } 7499 7500 if (bio->handled_zones < zones_to_copy) { 7501 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7502 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7503 
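/* More zones remain than fit in one report buffer: clear the buffer and issue another zone
 * report starting at the SLBA of the next unhandled zone, reusing this callback until the
 * requested number of zones has been copied.
 */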
7504 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7505 ret = spdk_nvme_zns_report_zones(ns, qpair, 7506 bio->zone_report_buf, zone_report_bufsize, 7507 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7508 bdev_nvme_get_zone_info_done, bio); 7509 if (!ret) { 7510 return; 7511 } else { 7512 goto out_complete_io_ret; 7513 } 7514 } 7515 7516 out_complete_io_nvme_cpl: 7517 free(bio->zone_report_buf); 7518 bio->zone_report_buf = NULL; 7519 bdev_nvme_io_complete_nvme_status(bio, cpl); 7520 return; 7521 7522 out_complete_io_ret: 7523 free(bio->zone_report_buf); 7524 bio->zone_report_buf = NULL; 7525 bdev_nvme_io_complete(bio, ret); 7526 } 7527 7528 static void 7529 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7530 { 7531 struct nvme_bdev_io *bio = ref; 7532 7533 bdev_nvme_io_complete_nvme_status(bio, cpl); 7534 } 7535 7536 static void 7537 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7538 { 7539 struct nvme_bdev_io *bio = ctx; 7540 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7541 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7542 7543 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7544 7545 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7546 } 7547 7548 static void 7549 bdev_nvme_abort_complete(void *ctx) 7550 { 7551 struct nvme_bdev_io *bio = ctx; 7552 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7553 7554 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7555 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7556 } else { 7557 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7558 } 7559 } 7560 7561 static void 7562 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7563 { 7564 struct nvme_bdev_io *bio = ref; 7565 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7566 7567 bio->cpl = *cpl; 7568 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7569 } 7570 7571 static void 7572 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7573 { 7574 struct nvme_bdev_io *bio = ref; 7575 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7576 7577 bio->cpl = *cpl; 7578 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7579 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7580 } 7581 7582 static void 7583 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7584 { 7585 struct nvme_bdev_io *bio = ref; 7586 struct iovec *iov; 7587 7588 bio->iov_offset = sgl_offset; 7589 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7590 iov = &bio->iovs[bio->iovpos]; 7591 if (bio->iov_offset < iov->iov_len) { 7592 break; 7593 } 7594 7595 bio->iov_offset -= iov->iov_len; 7596 } 7597 } 7598 7599 static int 7600 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7601 { 7602 struct nvme_bdev_io *bio = ref; 7603 struct iovec *iov; 7604 7605 assert(bio->iovpos < bio->iovcnt); 7606 7607 iov = &bio->iovs[bio->iovpos]; 7608 7609 *address = iov->iov_base; 7610 *length = iov->iov_len; 7611 7612 if (bio->iov_offset) { 7613 assert(bio->iov_offset <= iov->iov_len); 7614 *address += bio->iov_offset; 7615 *length -= bio->iov_offset; 7616 } 7617 7618 bio->iov_offset += *length; 7619 if (bio->iov_offset == iov->iov_len) { 7620 bio->iovpos++; 7621 bio->iov_offset = 0; 7622 } 7623 7624 return 0; 7625 } 7626 7627 static void 7628 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7629 { 7630 struct nvme_bdev_io *bio = ref; 7631 struct iovec *iov; 7632 7633 bio->fused_iov_offset = sgl_offset; 
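/* Walk the fused (write) iovec array until the iovec containing sgl_offset is found, leaving
 * fused_iovpos/fused_iov_offset positioned for bdev_nvme_queued_next_fused_sge().
 */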
7634 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7635 iov = &bio->fused_iovs[bio->fused_iovpos]; 7636 if (bio->fused_iov_offset < iov->iov_len) { 7637 break; 7638 } 7639 7640 bio->fused_iov_offset -= iov->iov_len; 7641 } 7642 } 7643 7644 static int 7645 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7646 { 7647 struct nvme_bdev_io *bio = ref; 7648 struct iovec *iov; 7649 7650 assert(bio->fused_iovpos < bio->fused_iovcnt); 7651 7652 iov = &bio->fused_iovs[bio->fused_iovpos]; 7653 7654 *address = iov->iov_base; 7655 *length = iov->iov_len; 7656 7657 if (bio->fused_iov_offset) { 7658 assert(bio->fused_iov_offset <= iov->iov_len); 7659 *address += bio->fused_iov_offset; 7660 *length -= bio->fused_iov_offset; 7661 } 7662 7663 bio->fused_iov_offset += *length; 7664 if (bio->fused_iov_offset == iov->iov_len) { 7665 bio->fused_iovpos++; 7666 bio->fused_iov_offset = 0; 7667 } 7668 7669 return 0; 7670 } 7671 7672 static int 7673 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7674 void *md, uint64_t lba_count, uint64_t lba) 7675 { 7676 int rc; 7677 7678 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7679 lba_count, lba); 7680 7681 bio->iovs = iov; 7682 bio->iovcnt = iovcnt; 7683 bio->iovpos = 0; 7684 bio->iov_offset = 0; 7685 7686 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7687 bio->io_path->qpair->qpair, 7688 lba, lba_count, 7689 bdev_nvme_no_pi_readv_done, bio, 0, 7690 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7691 md, 0, 0); 7692 7693 if (rc != 0 && rc != -ENOMEM) { 7694 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7695 } 7696 return rc; 7697 } 7698 7699 static int 7700 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7701 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7702 struct spdk_memory_domain *domain, void *domain_ctx, 7703 struct spdk_accel_sequence *seq) 7704 { 7705 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7706 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7707 int rc; 7708 7709 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7710 lba_count, lba); 7711 7712 bio->iovs = iov; 7713 bio->iovcnt = iovcnt; 7714 bio->iovpos = 0; 7715 bio->iov_offset = 0; 7716 7717 if (domain != NULL || seq != NULL) { 7718 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7719 bio->ext_opts.memory_domain = domain; 7720 bio->ext_opts.memory_domain_ctx = domain_ctx; 7721 bio->ext_opts.io_flags = flags; 7722 bio->ext_opts.metadata = md; 7723 bio->ext_opts.accel_sequence = seq; 7724 7725 if (iovcnt == 1) { 7726 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7727 bio, &bio->ext_opts); 7728 } else { 7729 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7730 bdev_nvme_readv_done, bio, 7731 bdev_nvme_queued_reset_sgl, 7732 bdev_nvme_queued_next_sge, 7733 &bio->ext_opts); 7734 } 7735 } else if (iovcnt == 1) { 7736 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7737 md, lba, lba_count, bdev_nvme_readv_done, 7738 bio, flags, 0, 0); 7739 } else { 7740 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7741 bdev_nvme_readv_done, bio, flags, 7742 bdev_nvme_queued_reset_sgl, 7743 bdev_nvme_queued_next_sge, md, 0, 0); 7744 } 7745 7746 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7747 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7748 } 7749 
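/* -ENOMEM is intentionally not logged; it is returned as-is so that upper layers can retry
 * the I/O once request objects become available.
 */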
return rc; 7750 } 7751 7752 static int 7753 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7754 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7755 struct spdk_memory_domain *domain, void *domain_ctx, 7756 struct spdk_accel_sequence *seq) 7757 { 7758 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7759 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7760 int rc; 7761 7762 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7763 lba_count, lba); 7764 7765 bio->iovs = iov; 7766 bio->iovcnt = iovcnt; 7767 bio->iovpos = 0; 7768 bio->iov_offset = 0; 7769 7770 if (domain != NULL || seq != NULL) { 7771 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7772 bio->ext_opts.memory_domain = domain; 7773 bio->ext_opts.memory_domain_ctx = domain_ctx; 7774 bio->ext_opts.io_flags = flags; 7775 bio->ext_opts.metadata = md; 7776 bio->ext_opts.accel_sequence = seq; 7777 7778 if (iovcnt == 1) { 7779 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7780 bio, &bio->ext_opts); 7781 } else { 7782 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7783 bdev_nvme_writev_done, bio, 7784 bdev_nvme_queued_reset_sgl, 7785 bdev_nvme_queued_next_sge, 7786 &bio->ext_opts); 7787 } 7788 } else if (iovcnt == 1) { 7789 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7790 md, lba, lba_count, bdev_nvme_writev_done, 7791 bio, flags, 0, 0); 7792 } else { 7793 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7794 bdev_nvme_writev_done, bio, flags, 7795 bdev_nvme_queued_reset_sgl, 7796 bdev_nvme_queued_next_sge, md, 0, 0); 7797 } 7798 7799 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7800 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7801 } 7802 return rc; 7803 } 7804 7805 static int 7806 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7807 void *md, uint64_t lba_count, uint64_t zslba, 7808 uint32_t flags) 7809 { 7810 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7811 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7812 int rc; 7813 7814 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7815 lba_count, zslba); 7816 7817 bio->iovs = iov; 7818 bio->iovcnt = iovcnt; 7819 bio->iovpos = 0; 7820 bio->iov_offset = 0; 7821 7822 if (iovcnt == 1) { 7823 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7824 lba_count, 7825 bdev_nvme_zone_appendv_done, bio, 7826 flags, 7827 0, 0); 7828 } else { 7829 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7830 bdev_nvme_zone_appendv_done, bio, flags, 7831 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7832 md, 0, 0); 7833 } 7834 7835 if (rc != 0 && rc != -ENOMEM) { 7836 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7837 } 7838 return rc; 7839 } 7840 7841 static int 7842 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7843 void *md, uint64_t lba_count, uint64_t lba, 7844 uint32_t flags) 7845 { 7846 int rc; 7847 7848 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7849 lba_count, lba); 7850 7851 bio->iovs = iov; 7852 bio->iovcnt = iovcnt; 7853 bio->iovpos = 0; 7854 bio->iov_offset = 0; 7855 7856 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7857 bio->io_path->qpair->qpair, 7858 lba, lba_count, 7859 bdev_nvme_comparev_done, bio, flags, 7860 bdev_nvme_queued_reset_sgl, 
bdev_nvme_queued_next_sge, 7861 md, 0, 0); 7862 7863 if (rc != 0 && rc != -ENOMEM) { 7864 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7865 } 7866 return rc; 7867 } 7868 7869 static int 7870 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7871 struct iovec *write_iov, int write_iovcnt, 7872 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7873 { 7874 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7875 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7876 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7877 int rc; 7878 7879 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7880 lba_count, lba); 7881 7882 bio->iovs = cmp_iov; 7883 bio->iovcnt = cmp_iovcnt; 7884 bio->iovpos = 0; 7885 bio->iov_offset = 0; 7886 bio->fused_iovs = write_iov; 7887 bio->fused_iovcnt = write_iovcnt; 7888 bio->fused_iovpos = 0; 7889 bio->fused_iov_offset = 0; 7890 7891 if (bdev_io->num_retries == 0) { 7892 bio->first_fused_submitted = false; 7893 bio->first_fused_completed = false; 7894 } 7895 7896 if (!bio->first_fused_submitted) { 7897 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7898 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7899 7900 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7901 bdev_nvme_comparev_and_writev_done, bio, flags, 7902 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7903 if (rc == 0) { 7904 bio->first_fused_submitted = true; 7905 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7906 } else { 7907 if (rc != -ENOMEM) { 7908 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7909 } 7910 return rc; 7911 } 7912 } 7913 7914 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7915 7916 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7917 bdev_nvme_comparev_and_writev_done, bio, flags, 7918 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7919 if (rc != 0 && rc != -ENOMEM) { 7920 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7921 rc = 0; 7922 } 7923 7924 return rc; 7925 } 7926 7927 static int 7928 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7929 { 7930 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7931 struct spdk_nvme_dsm_range *range; 7932 uint64_t offset, remaining; 7933 uint64_t num_ranges_u64; 7934 uint16_t num_ranges; 7935 int rc; 7936 7937 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7938 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7939 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7940 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7941 return -EINVAL; 7942 } 7943 num_ranges = (uint16_t)num_ranges_u64; 7944 7945 offset = offset_blocks; 7946 remaining = num_blocks; 7947 range = &dsm_ranges[0]; 7948 7949 /* Fill max-size ranges until the remaining blocks fit into one range */ 7950 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7951 range->attributes.raw = 0; 7952 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7953 range->starting_lba = offset; 7954 7955 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7956 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7957 range++; 7958 } 7959 7960 /* Final range describes the remaining blocks */ 7961 range->attributes.raw = 0; 7962 range->length = remaining; 7963 range->starting_lba = offset; 7964 7965 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7966 
bio->io_path->qpair->qpair, 7967 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7968 dsm_ranges, num_ranges, 7969 bdev_nvme_queued_done, bio); 7970 7971 return rc; 7972 } 7973 7974 static int 7975 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7976 { 7977 if (num_blocks > UINT16_MAX + 1) { 7978 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7979 return -EINVAL; 7980 } 7981 7982 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7983 bio->io_path->qpair->qpair, 7984 offset_blocks, num_blocks, 7985 bdev_nvme_queued_done, bio, 7986 0); 7987 } 7988 7989 static int 7990 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7991 struct spdk_bdev_zone_info *info) 7992 { 7993 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7994 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7995 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7996 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7997 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7998 7999 if (zone_id % zone_size != 0) { 8000 return -EINVAL; 8001 } 8002 8003 if (num_zones > total_zones || !num_zones) { 8004 return -EINVAL; 8005 } 8006 8007 assert(!bio->zone_report_buf); 8008 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8009 if (!bio->zone_report_buf) { 8010 return -ENOMEM; 8011 } 8012 8013 bio->handled_zones = 0; 8014 8015 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8016 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8017 bdev_nvme_get_zone_info_done, bio); 8018 } 8019 8020 static int 8021 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8022 enum spdk_bdev_zone_action action) 8023 { 8024 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8025 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8026 8027 switch (action) { 8028 case SPDK_BDEV_ZONE_CLOSE: 8029 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8030 bdev_nvme_zone_management_done, bio); 8031 case SPDK_BDEV_ZONE_FINISH: 8032 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8033 bdev_nvme_zone_management_done, bio); 8034 case SPDK_BDEV_ZONE_OPEN: 8035 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8036 bdev_nvme_zone_management_done, bio); 8037 case SPDK_BDEV_ZONE_RESET: 8038 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8039 bdev_nvme_zone_management_done, bio); 8040 case SPDK_BDEV_ZONE_OFFLINE: 8041 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8042 bdev_nvme_zone_management_done, bio); 8043 default: 8044 return -EINVAL; 8045 } 8046 } 8047 8048 static void 8049 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8050 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8051 { 8052 struct nvme_io_path *io_path; 8053 struct nvme_ctrlr *nvme_ctrlr; 8054 uint32_t max_xfer_size; 8055 int rc = -ENXIO; 8056 8057 /* Choose the first ctrlr which is not failed. */ 8058 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8059 nvme_ctrlr = io_path->qpair->ctrlr; 8060 8061 /* We should skip any unavailable nvme_ctrlr rather than checking 8062 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
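	 * If submission fails on one controller, the next available one is tried;
	 * once all paths are exhausted, the request is completed with the last
	 * error (or -ENXIO if no controller was available at all).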
8063 */ 8064 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8065 continue; 8066 } 8067 8068 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8069 8070 if (nbytes > max_xfer_size) { 8071 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8072 rc = -EINVAL; 8073 goto err; 8074 } 8075 8076 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8077 bdev_nvme_admin_passthru_done, bio); 8078 if (rc == 0) { 8079 return; 8080 } 8081 } 8082 8083 err: 8084 bdev_nvme_admin_complete(bio, rc); 8085 } 8086 8087 static int 8088 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8089 void *buf, size_t nbytes) 8090 { 8091 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8092 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8093 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8094 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8095 8096 if (nbytes > max_xfer_size) { 8097 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8098 return -EINVAL; 8099 } 8100 8101 /* 8102 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8103 * so fill it out automatically. 8104 */ 8105 cmd->nsid = spdk_nvme_ns_get_id(ns); 8106 8107 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8108 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8109 } 8110 8111 static int 8112 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8113 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8114 { 8115 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8116 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8117 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8118 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8119 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8120 8121 if (nbytes > max_xfer_size) { 8122 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8123 return -EINVAL; 8124 } 8125 8126 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8127 SPDK_ERRLOG("invalid meta data buffer size\n"); 8128 return -EINVAL; 8129 } 8130 8131 /* 8132 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8133 * so fill it out automatically. 
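	 * Any nsid supplied by the caller in the passthrough command is overwritten.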
8134 */ 8135 cmd->nsid = spdk_nvme_ns_get_id(ns); 8136 8137 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8138 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8139 } 8140 8141 static int 8142 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8143 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8144 size_t nbytes, void *md_buf, size_t md_len) 8145 { 8146 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8147 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8148 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8149 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8150 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8151 8152 bio->iovs = iov; 8153 bio->iovcnt = iovcnt; 8154 bio->iovpos = 0; 8155 bio->iov_offset = 0; 8156 8157 if (nbytes > max_xfer_size) { 8158 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8159 return -EINVAL; 8160 } 8161 8162 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8163 SPDK_ERRLOG("invalid meta data buffer size\n"); 8164 return -EINVAL; 8165 } 8166 8167 /* 8168 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8169 * require a nsid, so fill it out automatically. 8170 */ 8171 cmd->nsid = spdk_nvme_ns_get_id(ns); 8172 8173 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8174 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8175 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8176 } 8177 8178 static void 8179 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8180 struct nvme_bdev_io *bio_to_abort) 8181 { 8182 struct nvme_io_path *io_path; 8183 int rc = 0; 8184 8185 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8186 if (rc == 0) { 8187 bdev_nvme_admin_complete(bio, 0); 8188 return; 8189 } 8190 8191 io_path = bio_to_abort->io_path; 8192 if (io_path != NULL) { 8193 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8194 io_path->qpair->qpair, 8195 bio_to_abort, 8196 bdev_nvme_abort_done, bio); 8197 } else { 8198 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8199 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8200 NULL, 8201 bio_to_abort, 8202 bdev_nvme_abort_done, bio); 8203 8204 if (rc != -ENOENT) { 8205 break; 8206 } 8207 } 8208 } 8209 8210 if (rc != 0) { 8211 /* If no command was found or there was any error, complete the abort 8212 * request with failure. 
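		 * A final value of -ENOENT means the I/O to abort was not found on
		 * any of the paths that were tried.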
8213 */ 8214 bdev_nvme_admin_complete(bio, rc); 8215 } 8216 } 8217 8218 static int 8219 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8220 uint64_t num_blocks) 8221 { 8222 struct spdk_nvme_scc_source_range range = { 8223 .slba = src_offset_blocks, 8224 .nlb = num_blocks - 1 8225 }; 8226 8227 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8228 bio->io_path->qpair->qpair, 8229 &range, 1, dst_offset_blocks, 8230 bdev_nvme_queued_done, bio); 8231 } 8232 8233 static void 8234 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8235 { 8236 const char *action; 8237 uint32_t i; 8238 8239 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8240 action = "reset"; 8241 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8242 action = "abort"; 8243 } else { 8244 action = "none"; 8245 } 8246 8247 spdk_json_write_object_begin(w); 8248 8249 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8250 8251 spdk_json_write_named_object_begin(w, "params"); 8252 spdk_json_write_named_string(w, "action_on_timeout", action); 8253 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8254 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8255 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8256 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8257 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8258 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8259 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8260 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8261 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8262 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8263 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8264 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8265 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8266 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8267 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8268 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8269 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8270 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8271 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8272 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8273 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8274 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8275 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8276 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8277 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8278 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8279 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8280 for (i = 0; i < 32; ++i) { 8281 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8282 
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8283 } 8284 } 8285 spdk_json_write_array_end(w); 8286 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8287 for (i = 0; i < 32; ++i) { 8288 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8289 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8290 } 8291 } 8292 8293 spdk_json_write_array_end(w); 8294 spdk_json_write_object_end(w); 8295 8296 spdk_json_write_object_end(w); 8297 } 8298 8299 static void 8300 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8301 { 8302 struct spdk_nvme_transport_id trid; 8303 8304 spdk_json_write_object_begin(w); 8305 8306 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8307 8308 spdk_json_write_named_object_begin(w, "params"); 8309 spdk_json_write_named_string(w, "name", ctx->name); 8310 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8311 8312 trid = ctx->trid; 8313 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8314 nvme_bdev_dump_trid_json(&trid, w); 8315 8316 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8317 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8318 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8319 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8320 ctx->bdev_opts.fast_io_fail_timeout_sec); 8321 spdk_json_write_object_end(w); 8322 8323 spdk_json_write_object_end(w); 8324 } 8325 8326 #ifdef SPDK_CONFIG_NVME_CUSE 8327 static void 8328 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8329 struct nvme_ctrlr *nvme_ctrlr) 8330 { 8331 size_t cuse_name_size = 128; 8332 char cuse_name[cuse_name_size]; 8333 8334 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8335 cuse_name, &cuse_name_size) != 0) { 8336 return; 8337 } 8338 8339 spdk_json_write_object_begin(w); 8340 8341 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8342 8343 spdk_json_write_named_object_begin(w, "params"); 8344 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8345 spdk_json_write_object_end(w); 8346 8347 spdk_json_write_object_end(w); 8348 } 8349 #endif 8350 8351 static void 8352 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8353 struct nvme_ctrlr *nvme_ctrlr) 8354 { 8355 struct spdk_nvme_transport_id *trid; 8356 const struct spdk_nvme_ctrlr_opts *opts; 8357 8358 if (nvme_ctrlr->opts.from_discovery_service) { 8359 /* Do not emit an RPC for this - it will be implicitly 8360 * covered by a separate bdev_nvme_start_discovery or 8361 * bdev_nvme_start_mdns_discovery RPC. 
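		 * Replaying that RPC re-attaches controllers created by the
		 * discovery service.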
8362 */ 8363 return; 8364 } 8365 8366 trid = &nvme_ctrlr->active_path_id->trid; 8367 8368 spdk_json_write_object_begin(w); 8369 8370 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8371 8372 spdk_json_write_named_object_begin(w, "params"); 8373 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8374 nvme_bdev_dump_trid_json(trid, w); 8375 spdk_json_write_named_bool(w, "prchk_reftag", 8376 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8377 spdk_json_write_named_bool(w, "prchk_guard", 8378 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8379 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8380 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8381 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8382 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8383 if (nvme_ctrlr->psk != NULL) { 8384 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8385 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8386 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8387 } 8388 8389 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8390 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8391 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8392 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8393 if (opts->src_addr[0] != '\0') { 8394 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8395 } 8396 if (opts->src_svcid[0] != '\0') { 8397 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8398 } 8399 8400 spdk_json_write_object_end(w); 8401 8402 spdk_json_write_object_end(w); 8403 } 8404 8405 static void 8406 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8407 { 8408 spdk_json_write_object_begin(w); 8409 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8410 8411 spdk_json_write_named_object_begin(w, "params"); 8412 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8413 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8414 spdk_json_write_object_end(w); 8415 8416 spdk_json_write_object_end(w); 8417 } 8418 8419 static int 8420 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8421 { 8422 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8423 struct nvme_ctrlr *nvme_ctrlr; 8424 struct discovery_ctx *ctx; 8425 8426 bdev_nvme_opts_config_json(w); 8427 8428 pthread_mutex_lock(&g_bdev_nvme_mutex); 8429 8430 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8431 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8432 nvme_ctrlr_config_json(w, nvme_ctrlr); 8433 8434 #ifdef SPDK_CONFIG_NVME_CUSE 8435 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8436 #endif 8437 } 8438 } 8439 8440 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8441 if (!ctx->from_mdns_discovery_service) { 8442 bdev_nvme_discovery_config_json(w, ctx); 8443 } 8444 } 8445 8446 bdev_nvme_mdns_discovery_config_json(w); 8447 8448 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8449 * before enabling hotplug poller. 
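	 * Enabling hotplug earlier could let the poller probe devices while the
	 * explicit attach RPCs above are still being replayed.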
8450 */ 8451 bdev_nvme_hotplug_config_json(w); 8452 8453 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8454 return 0; 8455 } 8456 8457 struct spdk_nvme_ctrlr * 8458 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8459 { 8460 struct nvme_bdev *nbdev; 8461 struct nvme_ns *nvme_ns; 8462 8463 if (!bdev || bdev->module != &nvme_if) { 8464 return NULL; 8465 } 8466 8467 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8468 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8469 assert(nvme_ns != NULL); 8470 8471 return nvme_ns->ctrlr->ctrlr; 8472 } 8473 8474 void 8475 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8476 { 8477 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8478 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8479 const struct spdk_nvme_ctrlr_data *cdata; 8480 const struct spdk_nvme_transport_id *trid; 8481 const struct nvme_bdev_channel *nbdev_ch; 8482 const char *adrfam_str; 8483 bool current; 8484 8485 spdk_json_write_object_begin(w); 8486 8487 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8488 8489 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8490 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8491 8492 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8493 nbdev_ch = io_path->nbdev_ch; 8494 if (nbdev_ch == NULL) { 8495 current = false; 8496 } else { 8497 current = (io_path == nbdev_ch->current_io_path); 8498 } 8499 spdk_json_write_named_bool(w, "current", current); 8500 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8501 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8502 8503 spdk_json_write_named_object_begin(w, "transport"); 8504 spdk_json_write_named_string(w, "trtype", trid->trstring); 8505 spdk_json_write_named_string(w, "traddr", trid->traddr); 8506 if (trid->trsvcid[0] != '\0') { 8507 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8508 } 8509 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8510 if (adrfam_str) { 8511 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8512 } 8513 spdk_json_write_object_end(w); 8514 8515 spdk_json_write_object_end(w); 8516 } 8517 8518 void 8519 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8520 { 8521 struct discovery_ctx *ctx; 8522 struct discovery_entry_ctx *entry_ctx; 8523 8524 spdk_json_write_array_begin(w); 8525 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8526 spdk_json_write_object_begin(w); 8527 spdk_json_write_named_string(w, "name", ctx->name); 8528 8529 spdk_json_write_named_object_begin(w, "trid"); 8530 nvme_bdev_dump_trid_json(&ctx->trid, w); 8531 spdk_json_write_object_end(w); 8532 8533 spdk_json_write_named_array_begin(w, "referrals"); 8534 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8535 spdk_json_write_object_begin(w); 8536 spdk_json_write_named_object_begin(w, "trid"); 8537 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8538 spdk_json_write_object_end(w); 8539 spdk_json_write_object_end(w); 8540 } 8541 spdk_json_write_array_end(w); 8542 8543 spdk_json_write_object_end(w); 8544 } 8545 spdk_json_write_array_end(w); 8546 } 8547 8548 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8549 8550 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8551 { 8552 struct spdk_trace_tpoint_opts opts[] = { 8553 { 8554 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8555 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8556 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8557 }, 8558 { 8559 "BDEV_NVME_IO_DONE", 
TRACE_BDEV_NVME_IO_DONE, 8560 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8561 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8562 } 8563 }; 8564 8565 8566 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8567 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8568 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8569 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8570 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8571 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8572 } 8573