1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 36 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 37 38 #define NSID_STR_LEN 10 39 40 #define SPDK_CONTROLLER_NAME_MAX 512 41 42 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 43 44 struct nvme_bdev_io { 45 /** array of iovecs to transfer. */ 46 struct iovec *iovs; 47 48 /** Number of iovecs in iovs array. */ 49 int iovcnt; 50 51 /** Current iovec position. */ 52 int iovpos; 53 54 /** Offset in current iovec. */ 55 uint32_t iov_offset; 56 57 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 58 * being reset in a reset I/O. 59 */ 60 struct nvme_io_path *io_path; 61 62 /** array of iovecs to transfer. */ 63 struct iovec *fused_iovs; 64 65 /** Number of iovecs in iovs array. */ 66 int fused_iovcnt; 67 68 /** Current iovec position. */ 69 int fused_iovpos; 70 71 /** Offset in current iovec. 
*/ 72 uint32_t fused_iov_offset; 73 74 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 75 struct spdk_nvme_cpl cpl; 76 77 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 78 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 79 80 /** Keeps track if first of fused commands was submitted */ 81 bool first_fused_submitted; 82 83 /** Keeps track if first of fused commands was completed */ 84 bool first_fused_completed; 85 86 /** Temporary pointer to zone report buffer */ 87 struct spdk_nvme_zns_zone_report *zone_report_buf; 88 89 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 90 uint64_t handled_zones; 91 92 /** Expiration value in ticks to retry the current I/O. */ 93 uint64_t retry_ticks; 94 95 /* How many times the current I/O was retried. */ 96 int32_t retry_count; 97 98 /* Current tsc at submit time. */ 99 uint64_t submit_tsc; 100 101 /* Used to put nvme_bdev_io into the list */ 102 TAILQ_ENTRY(nvme_bdev_io) retry_link; 103 }; 104 105 struct nvme_probe_skip_entry { 106 struct spdk_nvme_transport_id trid; 107 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 108 }; 109 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 110 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 111 g_skipped_nvme_ctrlrs); 112 113 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 114 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 116 117 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 120 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 121 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 122 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 123 124 static struct spdk_bdev_nvme_opts g_opts = { 125 
.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 126 .timeout_us = 0, 127 .timeout_admin_us = 0, 128 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 129 .transport_retry_count = 4, 130 .arbitration_burst = 0, 131 .low_priority_weight = 0, 132 .medium_priority_weight = 0, 133 .high_priority_weight = 0, 134 .nvme_adminq_poll_period_us = 10000ULL, 135 .nvme_ioq_poll_period_us = 0, 136 .io_queue_requests = 0, 137 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 138 .bdev_retry_count = 3, 139 .transport_ack_timeout = 0, 140 .ctrlr_loss_timeout_sec = 0, 141 .reconnect_delay_sec = 0, 142 .fast_io_fail_timeout_sec = 0, 143 .disable_auto_failback = false, 144 .generate_uuids = false, 145 .transport_tos = 0, 146 .nvme_error_stat = false, 147 .io_path_stat = false, 148 .allow_accel_sequence = false, 149 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 150 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 151 }; 152 153 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 154 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 155 156 static int g_hot_insert_nvme_controller_index = 0; 157 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 158 static bool g_nvme_hotplug_enabled = false; 159 struct spdk_thread *g_bdev_nvme_init_thread; 160 static struct spdk_poller *g_hotplug_poller; 161 static struct spdk_poller *g_hotplug_probe_poller; 162 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 163 164 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 165 struct nvme_async_probe_ctx *ctx); 166 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 167 struct nvme_async_probe_ctx *ctx); 168 static int bdev_nvme_library_init(void); 169 static void bdev_nvme_library_fini(void); 170 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 171 struct spdk_bdev_io *bdev_io); 172 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 173 struct 
spdk_bdev_io *bdev_io); 174 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 175 void *md, uint64_t lba_count, uint64_t lba, 176 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 177 struct spdk_accel_sequence *seq); 178 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 179 void *md, uint64_t lba_count, uint64_t lba); 180 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 181 void *md, uint64_t lba_count, uint64_t lba, 182 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 183 struct spdk_accel_sequence *seq); 184 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 185 void *md, uint64_t lba_count, 186 uint64_t zslba, uint32_t flags); 187 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 188 void *md, uint64_t lba_count, uint64_t lba, 189 uint32_t flags); 190 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 191 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 192 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 193 uint32_t flags); 194 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 195 uint32_t num_zones, struct spdk_bdev_zone_info *info); 196 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 197 enum spdk_bdev_zone_action action); 198 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 199 struct nvme_bdev_io *bio, 200 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 201 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 202 void *buf, size_t nbytes); 203 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 204 void *buf, size_t nbytes, void *md_buf, size_t md_len); 205 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct 
spdk_nvme_cmd *cmd, 206 struct iovec *iov, int iovcnt, size_t nbytes, 207 void *md_buf, size_t md_len); 208 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 209 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 210 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 211 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 212 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 213 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 214 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 215 216 static struct nvme_ns *nvme_ns_alloc(void); 217 static void nvme_ns_free(struct nvme_ns *ns); 218 219 static int 220 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 221 { 222 return ns1->id < ns2->id ? -1 : ns1->id > ns2->id; 223 } 224 225 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 226 227 struct spdk_nvme_qpair * 228 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 229 { 230 struct nvme_ctrlr_channel *ctrlr_ch; 231 232 assert(ctrlr_io_ch != NULL); 233 234 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 235 236 return ctrlr_ch->qpair->qpair; 237 } 238 239 static int 240 bdev_nvme_get_ctx_size(void) 241 { 242 return sizeof(struct nvme_bdev_io); 243 } 244 245 static struct spdk_bdev_module nvme_if = { 246 .name = "nvme", 247 .async_fini = true, 248 .module_init = bdev_nvme_library_init, 249 .module_fini = bdev_nvme_library_fini, 250 .config_json = bdev_nvme_config_json, 251 .get_ctx_size = bdev_nvme_get_ctx_size, 252 253 }; 254 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 255 256 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 257 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 258 bool g_bdev_nvme_module_finish; 259 260 struct nvme_bdev_ctrlr * 261 nvme_bdev_ctrlr_get_by_name(const char *name) 262 { 263 struct nvme_bdev_ctrlr *nbdev_ctrlr; 264 265 
TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 266 if (strcmp(name, nbdev_ctrlr->name) == 0) { 267 break; 268 } 269 } 270 271 return nbdev_ctrlr; 272 } 273 274 static struct nvme_ctrlr * 275 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 276 const struct spdk_nvme_transport_id *trid) 277 { 278 struct nvme_ctrlr *nvme_ctrlr; 279 280 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 281 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 282 break; 283 } 284 } 285 286 return nvme_ctrlr; 287 } 288 289 struct nvme_ctrlr * 290 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 291 uint16_t cntlid) 292 { 293 struct nvme_ctrlr *nvme_ctrlr; 294 const struct spdk_nvme_ctrlr_data *cdata; 295 296 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 297 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 298 if (cdata->cntlid == cntlid) { 299 break; 300 } 301 } 302 303 return nvme_ctrlr; 304 } 305 306 static struct nvme_bdev * 307 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 308 { 309 struct nvme_bdev *bdev; 310 311 pthread_mutex_lock(&g_bdev_nvme_mutex); 312 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 313 if (bdev->nsid == nsid) { 314 break; 315 } 316 } 317 pthread_mutex_unlock(&g_bdev_nvme_mutex); 318 319 return bdev; 320 } 321 322 struct nvme_ns * 323 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 324 { 325 struct nvme_ns ns; 326 327 assert(nsid > 0); 328 329 ns.id = nsid; 330 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 331 } 332 333 struct nvme_ns * 334 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 335 { 336 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 337 } 338 339 struct nvme_ns * 340 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 341 { 342 if (ns == NULL) { 343 return NULL; 344 } 345 346 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 347 } 348 
349 static struct nvme_ctrlr * 350 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 351 { 352 struct nvme_bdev_ctrlr *nbdev_ctrlr; 353 struct nvme_ctrlr *nvme_ctrlr = NULL; 354 355 pthread_mutex_lock(&g_bdev_nvme_mutex); 356 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 357 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 358 if (nvme_ctrlr != NULL) { 359 break; 360 } 361 } 362 pthread_mutex_unlock(&g_bdev_nvme_mutex); 363 364 return nvme_ctrlr; 365 } 366 367 struct nvme_ctrlr * 368 nvme_ctrlr_get_by_name(const char *name) 369 { 370 struct nvme_bdev_ctrlr *nbdev_ctrlr; 371 struct nvme_ctrlr *nvme_ctrlr = NULL; 372 373 if (name == NULL) { 374 return NULL; 375 } 376 377 pthread_mutex_lock(&g_bdev_nvme_mutex); 378 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 379 if (nbdev_ctrlr != NULL) { 380 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 381 } 382 pthread_mutex_unlock(&g_bdev_nvme_mutex); 383 384 return nvme_ctrlr; 385 } 386 387 void 388 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 389 { 390 struct nvme_bdev_ctrlr *nbdev_ctrlr; 391 392 pthread_mutex_lock(&g_bdev_nvme_mutex); 393 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 394 fn(nbdev_ctrlr, ctx); 395 } 396 pthread_mutex_unlock(&g_bdev_nvme_mutex); 397 } 398 399 void 400 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 401 { 402 const char *trtype_str; 403 const char *adrfam_str; 404 405 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 406 if (trtype_str) { 407 spdk_json_write_named_string(w, "trtype", trtype_str); 408 } 409 410 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 411 if (adrfam_str) { 412 spdk_json_write_named_string(w, "adrfam", adrfam_str); 413 } 414 415 if (trid->traddr[0] != '\0') { 416 spdk_json_write_named_string(w, "traddr", trid->traddr); 417 } 418 419 if (trid->trsvcid[0] != '\0') { 420 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 421 } 
422 423 if (trid->subnqn[0] != '\0') { 424 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 425 } 426 } 427 428 static void 429 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 430 struct nvme_ctrlr *nvme_ctrlr) 431 { 432 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 433 pthread_mutex_lock(&g_bdev_nvme_mutex); 434 435 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 436 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 437 pthread_mutex_unlock(&g_bdev_nvme_mutex); 438 439 return; 440 } 441 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 442 443 pthread_mutex_unlock(&g_bdev_nvme_mutex); 444 445 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 446 447 free(nbdev_ctrlr->name); 448 free(nbdev_ctrlr); 449 } 450 451 static void 452 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 453 { 454 struct nvme_path_id *path_id, *tmp_path; 455 struct nvme_ns *ns, *tmp_ns; 456 457 free(nvme_ctrlr->copied_ana_desc); 458 spdk_free(nvme_ctrlr->ana_log_page); 459 460 if (nvme_ctrlr->opal_dev) { 461 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 462 nvme_ctrlr->opal_dev = NULL; 463 } 464 465 if (nvme_ctrlr->nbdev_ctrlr) { 466 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 467 } 468 469 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 470 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 471 nvme_ns_free(ns); 472 } 473 474 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 475 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 476 free(path_id); 477 } 478 479 pthread_mutex_destroy(&nvme_ctrlr->mutex); 480 spdk_keyring_put_key(nvme_ctrlr->psk); 481 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 482 free(nvme_ctrlr); 483 484 pthread_mutex_lock(&g_bdev_nvme_mutex); 485 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 486 pthread_mutex_unlock(&g_bdev_nvme_mutex); 487 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 488 spdk_bdev_module_fini_done(); 489 return; 490 } 491 
pthread_mutex_unlock(&g_bdev_nvme_mutex); 492 } 493 494 static int 495 nvme_detach_poller(void *arg) 496 { 497 struct nvme_ctrlr *nvme_ctrlr = arg; 498 int rc; 499 500 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 501 if (rc != -EAGAIN) { 502 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 503 _nvme_ctrlr_delete(nvme_ctrlr); 504 } 505 506 return SPDK_POLLER_BUSY; 507 } 508 509 static void 510 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 511 { 512 int rc; 513 514 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 515 516 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 517 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 518 519 /* If we got here, the reset/detach poller cannot be active */ 520 assert(nvme_ctrlr->reset_detach_poller == NULL); 521 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 522 nvme_ctrlr, 1000); 523 if (nvme_ctrlr->reset_detach_poller == NULL) { 524 SPDK_ERRLOG("Failed to register detach poller\n"); 525 goto error; 526 } 527 528 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 529 if (rc != 0) { 530 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 531 goto error; 532 } 533 534 return; 535 error: 536 /* We don't have a good way to handle errors here, so just do what we can and delete the 537 * controller without detaching the underlying NVMe device. 
538 */ 539 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 540 _nvme_ctrlr_delete(nvme_ctrlr); 541 } 542 543 static void 544 nvme_ctrlr_unregister_cb(void *io_device) 545 { 546 struct nvme_ctrlr *nvme_ctrlr = io_device; 547 548 nvme_ctrlr_delete(nvme_ctrlr); 549 } 550 551 static void 552 nvme_ctrlr_unregister(void *ctx) 553 { 554 struct nvme_ctrlr *nvme_ctrlr = ctx; 555 556 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 557 } 558 559 static bool 560 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 561 { 562 if (!nvme_ctrlr->destruct) { 563 return false; 564 } 565 566 if (nvme_ctrlr->ref > 0) { 567 return false; 568 } 569 570 if (nvme_ctrlr->resetting) { 571 return false; 572 } 573 574 if (nvme_ctrlr->ana_log_page_updating) { 575 return false; 576 } 577 578 if (nvme_ctrlr->io_path_cache_clearing) { 579 return false; 580 } 581 582 return true; 583 } 584 585 static void 586 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 587 { 588 pthread_mutex_lock(&nvme_ctrlr->mutex); 589 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 590 591 assert(nvme_ctrlr->ref > 0); 592 nvme_ctrlr->ref--; 593 594 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 595 pthread_mutex_unlock(&nvme_ctrlr->mutex); 596 return; 597 } 598 599 pthread_mutex_unlock(&nvme_ctrlr->mutex); 600 601 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 602 } 603 604 static void 605 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 606 { 607 nbdev_ch->current_io_path = NULL; 608 nbdev_ch->rr_counter = 0; 609 } 610 611 static struct nvme_io_path * 612 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 613 { 614 struct nvme_io_path *io_path; 615 616 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 617 if (io_path->nvme_ns == nvme_ns) { 618 break; 619 } 620 } 621 622 return io_path; 623 } 624 625 static struct nvme_io_path * 626 
nvme_io_path_alloc(void) 627 { 628 struct nvme_io_path *io_path; 629 630 io_path = calloc(1, sizeof(*io_path)); 631 if (io_path == NULL) { 632 SPDK_ERRLOG("Failed to alloc io_path.\n"); 633 return NULL; 634 } 635 636 if (g_opts.io_path_stat) { 637 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 638 if (io_path->stat == NULL) { 639 free(io_path); 640 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 641 return NULL; 642 } 643 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 644 } 645 646 return io_path; 647 } 648 649 static void 650 nvme_io_path_free(struct nvme_io_path *io_path) 651 { 652 free(io_path->stat); 653 free(io_path); 654 } 655 656 static int 657 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 658 { 659 struct nvme_io_path *io_path; 660 struct spdk_io_channel *ch; 661 struct nvme_ctrlr_channel *ctrlr_ch; 662 struct nvme_qpair *nvme_qpair; 663 664 io_path = nvme_io_path_alloc(); 665 if (io_path == NULL) { 666 return -ENOMEM; 667 } 668 669 io_path->nvme_ns = nvme_ns; 670 671 ch = spdk_get_io_channel(nvme_ns->ctrlr); 672 if (ch == NULL) { 673 nvme_io_path_free(io_path); 674 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 675 return -ENOMEM; 676 } 677 678 ctrlr_ch = spdk_io_channel_get_ctx(ch); 679 680 nvme_qpair = ctrlr_ch->qpair; 681 assert(nvme_qpair != NULL); 682 683 io_path->qpair = nvme_qpair; 684 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 685 686 io_path->nbdev_ch = nbdev_ch; 687 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 688 689 bdev_nvme_clear_current_io_path(nbdev_ch); 690 691 return 0; 692 } 693 694 static void 695 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 696 struct nvme_io_path *io_path) 697 { 698 struct nvme_bdev_io *bio; 699 700 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 701 if (bio->io_path == io_path) { 702 bio->io_path = NULL; 703 } 704 } 705 } 706 707 static void 708 _bdev_nvme_delete_io_path(struct 
nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 709 { 710 struct spdk_io_channel *ch; 711 struct nvme_qpair *nvme_qpair; 712 struct nvme_ctrlr_channel *ctrlr_ch; 713 struct nvme_bdev *nbdev; 714 715 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 716 717 /* Add the statistics to nvme_ns before this path is destroyed. */ 718 pthread_mutex_lock(&nbdev->mutex); 719 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 720 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 721 } 722 pthread_mutex_unlock(&nbdev->mutex); 723 724 bdev_nvme_clear_current_io_path(nbdev_ch); 725 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 726 727 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 728 io_path->nbdev_ch = NULL; 729 730 nvme_qpair = io_path->qpair; 731 assert(nvme_qpair != NULL); 732 733 ctrlr_ch = nvme_qpair->ctrlr_ch; 734 assert(ctrlr_ch != NULL); 735 736 ch = spdk_io_channel_from_ctx(ctrlr_ch); 737 spdk_put_io_channel(ch); 738 739 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 740 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 741 * io_path here but free the io_path when the associated qpair is freed. It is ensured 742 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
743 */ 744 } 745 746 static void 747 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 748 { 749 struct nvme_io_path *io_path, *tmp_io_path; 750 751 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 752 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 753 } 754 } 755 756 static int 757 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 758 { 759 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 760 struct nvme_bdev *nbdev = io_device; 761 struct nvme_ns *nvme_ns; 762 int rc; 763 764 STAILQ_INIT(&nbdev_ch->io_path_list); 765 TAILQ_INIT(&nbdev_ch->retry_io_list); 766 767 pthread_mutex_lock(&nbdev->mutex); 768 769 nbdev_ch->mp_policy = nbdev->mp_policy; 770 nbdev_ch->mp_selector = nbdev->mp_selector; 771 nbdev_ch->rr_min_io = nbdev->rr_min_io; 772 773 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 774 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 775 if (rc != 0) { 776 pthread_mutex_unlock(&nbdev->mutex); 777 778 _bdev_nvme_delete_io_paths(nbdev_ch); 779 return rc; 780 } 781 } 782 pthread_mutex_unlock(&nbdev->mutex); 783 784 return 0; 785 } 786 787 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 788 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
789 */ 790 static inline void 791 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 792 const struct spdk_nvme_cpl *cpl) 793 { 794 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 795 (uintptr_t)bdev_io); 796 if (cpl) { 797 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 798 } else { 799 spdk_bdev_io_complete(bdev_io, status); 800 } 801 } 802 803 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 804 805 static void 806 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 807 { 808 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 809 810 bdev_nvme_abort_retry_ios(nbdev_ch); 811 _bdev_nvme_delete_io_paths(nbdev_ch); 812 } 813 814 static inline bool 815 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 816 { 817 switch (io_type) { 818 case SPDK_BDEV_IO_TYPE_RESET: 819 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 820 case SPDK_BDEV_IO_TYPE_ABORT: 821 return true; 822 default: 823 break; 824 } 825 826 return false; 827 } 828 829 static inline bool 830 nvme_ns_is_active(struct nvme_ns *nvme_ns) 831 { 832 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 833 return false; 834 } 835 836 if (spdk_unlikely(nvme_ns->ns == NULL)) { 837 return false; 838 } 839 840 return true; 841 } 842 843 static inline bool 844 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 845 { 846 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 847 return false; 848 } 849 850 switch (nvme_ns->ana_state) { 851 case SPDK_NVME_ANA_OPTIMIZED_STATE: 852 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 853 return true; 854 default: 855 break; 856 } 857 858 return false; 859 } 860 861 static inline bool 862 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 863 { 864 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 865 return false; 866 } 867 868 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 869 SPDK_NVME_QPAIR_FAILURE_NONE)) { 870 return false; 
871 } 872 873 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 874 return false; 875 } 876 877 return true; 878 } 879 880 static inline bool 881 nvme_io_path_is_available(struct nvme_io_path *io_path) 882 { 883 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 884 return false; 885 } 886 887 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 888 return false; 889 } 890 891 return true; 892 } 893 894 static inline bool 895 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 896 { 897 if (nvme_ctrlr->destruct) { 898 return true; 899 } 900 901 if (nvme_ctrlr->fast_io_fail_timedout) { 902 return true; 903 } 904 905 if (nvme_ctrlr->resetting) { 906 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 907 return false; 908 } else { 909 return true; 910 } 911 } 912 913 if (nvme_ctrlr->reconnect_is_delayed) { 914 return false; 915 } 916 917 if (nvme_ctrlr->disabled) { 918 return true; 919 } 920 921 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 922 return true; 923 } else { 924 return false; 925 } 926 } 927 928 static bool 929 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 930 { 931 if (nvme_ctrlr->destruct) { 932 return false; 933 } 934 935 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 936 return false; 937 } 938 939 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 940 return false; 941 } 942 943 if (nvme_ctrlr->disabled) { 944 return false; 945 } 946 947 return true; 948 } 949 950 /* Simulate circular linked list. 
*/ 951 static inline struct nvme_io_path * 952 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 953 { 954 struct nvme_io_path *next_path; 955 956 if (prev_path != NULL) { 957 next_path = STAILQ_NEXT(prev_path, stailq); 958 if (next_path != NULL) { 959 return next_path; 960 } 961 } 962 963 return STAILQ_FIRST(&nbdev_ch->io_path_list); 964 } 965 966 static struct nvme_io_path * 967 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 968 { 969 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 970 971 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 972 973 io_path = start; 974 do { 975 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 976 nvme_ns_is_active(io_path->nvme_ns))) { 977 switch (io_path->nvme_ns->ana_state) { 978 case SPDK_NVME_ANA_OPTIMIZED_STATE: 979 nbdev_ch->current_io_path = io_path; 980 return io_path; 981 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 982 if (non_optimized == NULL) { 983 non_optimized = io_path; 984 } 985 break; 986 default: 987 break; 988 } 989 } 990 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 991 } while (io_path != start); 992 993 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 994 /* We come here only if there is no optimized path. Cache even non_optimized 995 * path for load balance across multiple non_optimized paths. 996 */ 997 nbdev_ch->current_io_path = non_optimized; 998 } 999 1000 return non_optimized; 1001 } 1002 1003 static struct nvme_io_path * 1004 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1005 { 1006 struct nvme_io_path *io_path; 1007 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1008 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1009 uint32_t num_outstanding_reqs; 1010 1011 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1012 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1013 /* The device is currently resetting. 
*/ 1014 continue; 1015 } 1016 1017 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1018 continue; 1019 } 1020 1021 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1022 switch (io_path->nvme_ns->ana_state) { 1023 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1024 if (num_outstanding_reqs < opt_min_qd) { 1025 opt_min_qd = num_outstanding_reqs; 1026 optimized = io_path; 1027 } 1028 break; 1029 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1030 if (num_outstanding_reqs < non_opt_min_qd) { 1031 non_opt_min_qd = num_outstanding_reqs; 1032 non_optimized = io_path; 1033 } 1034 break; 1035 default: 1036 break; 1037 } 1038 } 1039 1040 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1041 if (optimized != NULL) { 1042 return optimized; 1043 } 1044 1045 return non_optimized; 1046 } 1047 1048 static inline struct nvme_io_path * 1049 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1050 { 1051 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1052 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1053 return nbdev_ch->current_io_path; 1054 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1055 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1056 return nbdev_ch->current_io_path; 1057 } 1058 nbdev_ch->rr_counter = 0; 1059 } 1060 } 1061 1062 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1063 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1064 return _bdev_nvme_find_io_path(nbdev_ch); 1065 } else { 1066 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1067 } 1068 } 1069 1070 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1071 * or false otherwise. 1072 * 1073 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1074 * is likely to be non-accessible now but may become accessible. 
1075 * 1076 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1077 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1078 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1079 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 1080 */ 1081 static bool 1082 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1083 { 1084 struct nvme_io_path *io_path; 1085 1086 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1087 if (io_path->nvme_ns->ana_transition_timedout) { 1088 continue; 1089 } 1090 1091 if (nvme_qpair_is_connected(io_path->qpair) || 1092 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1093 return true; 1094 } 1095 } 1096 1097 return false; 1098 } 1099 1100 static void 1101 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1102 { 1103 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1104 struct spdk_io_channel *ch; 1105 1106 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1107 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1108 } else { 1109 ch = spdk_io_channel_from_ctx(nbdev_ch); 1110 bdev_nvme_submit_request(ch, bdev_io); 1111 } 1112 } 1113 1114 static int 1115 bdev_nvme_retry_ios(void *arg) 1116 { 1117 struct nvme_bdev_channel *nbdev_ch = arg; 1118 struct nvme_bdev_io *bio, *tmp_bio; 1119 uint64_t now, delay_us; 1120 1121 now = spdk_get_ticks(); 1122 1123 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1124 if (bio->retry_ticks > now) { 1125 break; 1126 } 1127 1128 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1129 1130 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1131 } 1132 1133 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1134 1135 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1136 if (bio != NULL) { 1137 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC 
/ spdk_get_ticks_hz(); 1138 1139 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1140 delay_us); 1141 } 1142 1143 return SPDK_POLLER_BUSY; 1144 } 1145 1146 static void 1147 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1148 struct nvme_bdev_io *bio, uint64_t delay_ms) 1149 { 1150 struct nvme_bdev_io *tmp_bio; 1151 1152 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1153 1154 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1155 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1156 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1157 retry_link); 1158 return; 1159 } 1160 } 1161 1162 /* No earlier I/Os were found. This I/O must be the new head. */ 1163 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1164 1165 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1166 1167 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1168 delay_ms * 1000ULL); 1169 } 1170 1171 static void 1172 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1173 { 1174 struct nvme_bdev_io *bio, *tmp_bio; 1175 1176 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1177 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1178 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1179 } 1180 1181 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1182 } 1183 1184 static int 1185 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1186 struct nvme_bdev_io *bio_to_abort) 1187 { 1188 struct nvme_bdev_io *bio; 1189 1190 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1191 if (bio == bio_to_abort) { 1192 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1193 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1194 return 0; 1195 } 1196 } 1197 1198 return -ENOENT; 1199 } 1200 1201 static void 1202 
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1203 { 1204 struct nvme_bdev *nbdev; 1205 uint16_t sct, sc; 1206 1207 assert(spdk_nvme_cpl_is_error(cpl)); 1208 1209 nbdev = bdev_io->bdev->ctxt; 1210 1211 if (nbdev->err_stat == NULL) { 1212 return; 1213 } 1214 1215 sct = cpl->status.sct; 1216 sc = cpl->status.sc; 1217 1218 pthread_mutex_lock(&nbdev->mutex); 1219 1220 nbdev->err_stat->status_type[sct]++; 1221 switch (sct) { 1222 case SPDK_NVME_SCT_GENERIC: 1223 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1224 case SPDK_NVME_SCT_MEDIA_ERROR: 1225 case SPDK_NVME_SCT_PATH: 1226 nbdev->err_stat->status[sct][sc]++; 1227 break; 1228 default: 1229 break; 1230 } 1231 1232 pthread_mutex_unlock(&nbdev->mutex); 1233 } 1234 1235 static inline void 1236 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1237 { 1238 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1239 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1240 uint32_t blocklen = bdev_io->bdev->blocklen; 1241 struct spdk_bdev_io_stat *stat; 1242 uint64_t tsc_diff; 1243 1244 if (bio->io_path->stat == NULL) { 1245 return; 1246 } 1247 1248 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1249 stat = bio->io_path->stat; 1250 1251 switch (bdev_io->type) { 1252 case SPDK_BDEV_IO_TYPE_READ: 1253 stat->bytes_read += num_blocks * blocklen; 1254 stat->num_read_ops++; 1255 stat->read_latency_ticks += tsc_diff; 1256 if (stat->max_read_latency_ticks < tsc_diff) { 1257 stat->max_read_latency_ticks = tsc_diff; 1258 } 1259 if (stat->min_read_latency_ticks > tsc_diff) { 1260 stat->min_read_latency_ticks = tsc_diff; 1261 } 1262 break; 1263 case SPDK_BDEV_IO_TYPE_WRITE: 1264 stat->bytes_written += num_blocks * blocklen; 1265 stat->num_write_ops++; 1266 stat->write_latency_ticks += tsc_diff; 1267 if (stat->max_write_latency_ticks < tsc_diff) { 1268 stat->max_write_latency_ticks = tsc_diff; 1269 } 1270 if (stat->min_write_latency_ticks > tsc_diff) { 1271 stat->min_write_latency_ticks = 
tsc_diff; 1272 } 1273 break; 1274 case SPDK_BDEV_IO_TYPE_UNMAP: 1275 stat->bytes_unmapped += num_blocks * blocklen; 1276 stat->num_unmap_ops++; 1277 stat->unmap_latency_ticks += tsc_diff; 1278 if (stat->max_unmap_latency_ticks < tsc_diff) { 1279 stat->max_unmap_latency_ticks = tsc_diff; 1280 } 1281 if (stat->min_unmap_latency_ticks > tsc_diff) { 1282 stat->min_unmap_latency_ticks = tsc_diff; 1283 } 1284 break; 1285 case SPDK_BDEV_IO_TYPE_ZCOPY: 1286 /* Track the data in the start phase only */ 1287 if (!bdev_io->u.bdev.zcopy.start) { 1288 break; 1289 } 1290 if (bdev_io->u.bdev.zcopy.populate) { 1291 stat->bytes_read += num_blocks * blocklen; 1292 stat->num_read_ops++; 1293 stat->read_latency_ticks += tsc_diff; 1294 if (stat->max_read_latency_ticks < tsc_diff) { 1295 stat->max_read_latency_ticks = tsc_diff; 1296 } 1297 if (stat->min_read_latency_ticks > tsc_diff) { 1298 stat->min_read_latency_ticks = tsc_diff; 1299 } 1300 } else { 1301 stat->bytes_written += num_blocks * blocklen; 1302 stat->num_write_ops++; 1303 stat->write_latency_ticks += tsc_diff; 1304 if (stat->max_write_latency_ticks < tsc_diff) { 1305 stat->max_write_latency_ticks = tsc_diff; 1306 } 1307 if (stat->min_write_latency_ticks > tsc_diff) { 1308 stat->min_write_latency_ticks = tsc_diff; 1309 } 1310 } 1311 break; 1312 case SPDK_BDEV_IO_TYPE_COPY: 1313 stat->bytes_copied += num_blocks * blocklen; 1314 stat->num_copy_ops++; 1315 stat->copy_latency_ticks += tsc_diff; 1316 if (stat->max_copy_latency_ticks < tsc_diff) { 1317 stat->max_copy_latency_ticks = tsc_diff; 1318 } 1319 if (stat->min_copy_latency_ticks > tsc_diff) { 1320 stat->min_copy_latency_ticks = tsc_diff; 1321 } 1322 break; 1323 default: 1324 break; 1325 } 1326 } 1327 1328 static bool 1329 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1330 const struct spdk_nvme_cpl *cpl, 1331 struct nvme_bdev_channel *nbdev_ch, 1332 uint64_t *_delay_ms) 1333 { 1334 struct nvme_io_path *io_path = bio->io_path; 1335 struct nvme_ctrlr *nvme_ctrlr = 
io_path->qpair->ctrlr; 1336 const struct spdk_nvme_ctrlr_data *cdata; 1337 1338 if (spdk_nvme_cpl_is_path_error(cpl) || 1339 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1340 !nvme_io_path_is_available(io_path) || 1341 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1342 bdev_nvme_clear_current_io_path(nbdev_ch); 1343 bio->io_path = NULL; 1344 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1345 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1346 io_path->nvme_ns->ana_state_updating = true; 1347 } 1348 } 1349 if (!any_io_path_may_become_available(nbdev_ch)) { 1350 return false; 1351 } 1352 *_delay_ms = 0; 1353 } else { 1354 bio->retry_count++; 1355 1356 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1357 1358 if (cpl->status.crd != 0) { 1359 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1360 } else { 1361 *_delay_ms = 0; 1362 } 1363 } 1364 1365 return true; 1366 } 1367 1368 static inline void 1369 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1370 const struct spdk_nvme_cpl *cpl) 1371 { 1372 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1373 struct nvme_bdev_channel *nbdev_ch; 1374 uint64_t delay_ms; 1375 1376 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1377 1378 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1379 bdev_nvme_update_io_path_stat(bio); 1380 goto complete; 1381 } 1382 1383 /* Update error counts before deciding if retry is needed. 1384 * Hence, error counts may be more than the number of I/O errors. 
1385 */ 1386 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1387 1388 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1389 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1390 goto complete; 1391 } 1392 1393 /* At this point we don't know whether the sequence was successfully executed or not, so we 1394 * cannot retry the IO */ 1395 if (bdev_io->u.bdev.accel_sequence != NULL) { 1396 goto complete; 1397 } 1398 1399 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1400 1401 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1402 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1403 return; 1404 } 1405 1406 complete: 1407 bio->retry_count = 0; 1408 bio->submit_tsc = 0; 1409 bdev_io->u.bdev.accel_sequence = NULL; 1410 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1411 } 1412 1413 static inline void 1414 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1415 { 1416 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1417 struct nvme_bdev_channel *nbdev_ch; 1418 enum spdk_bdev_io_status io_status; 1419 1420 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1421 1422 switch (rc) { 1423 case 0: 1424 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1425 break; 1426 case -ENOMEM: 1427 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1428 break; 1429 case -ENXIO: 1430 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1431 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1432 1433 bdev_nvme_clear_current_io_path(nbdev_ch); 1434 bio->io_path = NULL; 1435 1436 if (any_io_path_may_become_available(nbdev_ch)) { 1437 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1438 return; 1439 } 1440 } 1441 1442 /* fallthrough */ 1443 default: 1444 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1445 bdev_io->u.bdev.accel_sequence = NULL; 1446 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1447 break; 1448 } 1449 1450 bio->retry_count = 0; 
1451 bio->submit_tsc = 0; 1452 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1453 } 1454 1455 static inline void 1456 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1457 { 1458 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1459 enum spdk_bdev_io_status io_status; 1460 1461 switch (rc) { 1462 case 0: 1463 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1464 break; 1465 case -ENOMEM: 1466 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1467 break; 1468 case -ENXIO: 1469 /* fallthrough */ 1470 default: 1471 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1472 break; 1473 } 1474 1475 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1476 } 1477 1478 static void 1479 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1480 { 1481 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1482 1483 pthread_mutex_lock(&nvme_ctrlr->mutex); 1484 1485 assert(nvme_ctrlr->io_path_cache_clearing == true); 1486 nvme_ctrlr->io_path_cache_clearing = false; 1487 1488 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1489 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1490 return; 1491 } 1492 1493 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1494 1495 nvme_ctrlr_unregister(nvme_ctrlr); 1496 } 1497 1498 static void 1499 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1500 { 1501 struct nvme_io_path *io_path; 1502 1503 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1504 if (io_path->nbdev_ch == NULL) { 1505 continue; 1506 } 1507 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1508 } 1509 } 1510 1511 static void 1512 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1513 { 1514 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1515 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1516 1517 assert(ctrlr_ch->qpair != NULL); 1518 1519 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1520 1521 spdk_for_each_channel_continue(i, 0); 1522 } 1523 1524 static void 1525 
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1526 { 1527 pthread_mutex_lock(&nvme_ctrlr->mutex); 1528 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1529 nvme_ctrlr->io_path_cache_clearing) { 1530 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1531 return; 1532 } 1533 1534 nvme_ctrlr->io_path_cache_clearing = true; 1535 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1536 1537 spdk_for_each_channel(nvme_ctrlr, 1538 bdev_nvme_clear_io_path_cache, 1539 NULL, 1540 bdev_nvme_clear_io_path_caches_done); 1541 } 1542 1543 static struct nvme_qpair * 1544 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1545 { 1546 struct nvme_qpair *nvme_qpair; 1547 1548 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1549 if (nvme_qpair->qpair == qpair) { 1550 break; 1551 } 1552 } 1553 1554 return nvme_qpair; 1555 } 1556 1557 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1558 1559 static void 1560 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1561 { 1562 struct nvme_poll_group *group = poll_group_ctx; 1563 struct nvme_qpair *nvme_qpair; 1564 struct nvme_ctrlr_channel *ctrlr_ch; 1565 int status; 1566 1567 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1568 if (nvme_qpair == NULL) { 1569 return; 1570 } 1571 1572 if (nvme_qpair->qpair != NULL) { 1573 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1574 nvme_qpair->qpair = NULL; 1575 } 1576 1577 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1578 1579 ctrlr_ch = nvme_qpair->ctrlr_ch; 1580 1581 if (ctrlr_ch != NULL) { 1582 if (ctrlr_ch->reset_iter != NULL) { 1583 /* We are in a full reset sequence. */ 1584 if (ctrlr_ch->connect_poller != NULL) { 1585 /* qpair was failed to connect. Abort the reset sequence. */ 1586 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. 
abort the reset ctrlr sequence.\n", 1587 qpair); 1588 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1589 status = -1; 1590 } else { 1591 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1592 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1593 qpair); 1594 status = 0; 1595 } 1596 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1597 ctrlr_ch->reset_iter = NULL; 1598 } else { 1599 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1600 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1601 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1602 } 1603 } else { 1604 /* In this case, ctrlr_channel is already deleted. */ 1605 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. delete nvme_qpair.\n", qpair); 1606 nvme_qpair_delete(nvme_qpair); 1607 } 1608 } 1609 1610 static void 1611 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1612 { 1613 struct nvme_qpair *nvme_qpair; 1614 1615 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1616 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1617 continue; 1618 } 1619 1620 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1621 SPDK_NVME_QPAIR_FAILURE_NONE) { 1622 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1623 } 1624 } 1625 } 1626 1627 static int 1628 bdev_nvme_poll(void *arg) 1629 { 1630 struct nvme_poll_group *group = arg; 1631 int64_t num_completions; 1632 1633 if (group->collect_spin_stat && group->start_ticks == 0) { 1634 group->start_ticks = spdk_get_ticks(); 1635 } 1636 1637 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1638 bdev_nvme_disconnected_qpair_cb); 1639 if (group->collect_spin_stat) { 1640 if (num_completions > 0) { 1641 if (group->end_ticks != 0) { 1642 group->spin_ticks += (group->end_ticks - group->start_ticks); 1643 group->end_ticks = 0; 1644 } 1645 group->start_ticks = 0; 1646 } else 
{ 1647 group->end_ticks = spdk_get_ticks(); 1648 } 1649 } 1650 1651 if (spdk_unlikely(num_completions < 0)) { 1652 bdev_nvme_check_io_qpairs(group); 1653 } 1654 1655 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1656 } 1657 1658 static int bdev_nvme_poll_adminq(void *arg); 1659 1660 static void 1661 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1662 { 1663 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1664 1665 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1666 nvme_ctrlr, new_period_us); 1667 } 1668 1669 static int 1670 bdev_nvme_poll_adminq(void *arg) 1671 { 1672 int32_t rc; 1673 struct nvme_ctrlr *nvme_ctrlr = arg; 1674 nvme_ctrlr_disconnected_cb disconnected_cb; 1675 1676 assert(nvme_ctrlr != NULL); 1677 1678 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1679 if (rc < 0) { 1680 disconnected_cb = nvme_ctrlr->disconnected_cb; 1681 nvme_ctrlr->disconnected_cb = NULL; 1682 1683 if (disconnected_cb != NULL) { 1684 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1685 g_opts.nvme_adminq_poll_period_us); 1686 disconnected_cb(nvme_ctrlr); 1687 } else { 1688 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1689 } 1690 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1691 SPDK_NVME_QPAIR_FAILURE_NONE) { 1692 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1693 } 1694 1695 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1696 } 1697 1698 static void 1699 nvme_bdev_free(void *io_device) 1700 { 1701 struct nvme_bdev *nvme_disk = io_device; 1702 1703 pthread_mutex_destroy(&nvme_disk->mutex); 1704 free(nvme_disk->disk.name); 1705 free(nvme_disk->err_stat); 1706 free(nvme_disk); 1707 } 1708 1709 static int 1710 bdev_nvme_destruct(void *ctx) 1711 { 1712 struct nvme_bdev *nvme_disk = ctx; 1713 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1714 1715 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1716 1717 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1718 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1719 1720 nvme_ns->bdev = NULL; 1721 1722 assert(nvme_ns->id > 0); 1723 1724 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1725 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1726 1727 nvme_ctrlr_release(nvme_ns->ctrlr); 1728 nvme_ns_free(nvme_ns); 1729 } else { 1730 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1731 } 1732 } 1733 1734 pthread_mutex_lock(&g_bdev_nvme_mutex); 1735 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1736 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1737 1738 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1739 1740 return 0; 1741 } 1742 1743 static int 1744 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1745 { 1746 struct nvme_ctrlr *nvme_ctrlr; 1747 struct spdk_nvme_io_qpair_opts opts; 1748 struct spdk_nvme_qpair *qpair; 1749 int rc; 1750 1751 nvme_ctrlr = nvme_qpair->ctrlr; 1752 1753 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1754 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1755 opts.create_only = true; 1756 opts.async_mode = true; 1757 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1758 g_opts.io_queue_requests = opts.io_queue_requests; 1759 1760 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1761 if (qpair == 
NULL) { 1762 return -1; 1763 } 1764 1765 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1766 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1767 1768 assert(nvme_qpair->group != NULL); 1769 1770 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1771 if (rc != 0) { 1772 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1773 goto err; 1774 } 1775 1776 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1777 if (rc != 0) { 1778 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1779 goto err; 1780 } 1781 1782 nvme_qpair->qpair = qpair; 1783 1784 if (!g_opts.disable_auto_failback) { 1785 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1786 } 1787 1788 return 0; 1789 1790 err: 1791 spdk_nvme_ctrlr_free_io_qpair(qpair); 1792 1793 return rc; 1794 } 1795 1796 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1797 1798 static void 1799 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1800 { 1801 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1802 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1803 int rc = 0; 1804 struct nvme_bdev_io *bio; 1805 1806 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1807 rc = -1; 1808 } 1809 1810 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1811 bio = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1812 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link); 1813 1814 bdev_nvme_reset_io_continue(bio, rc); 1815 } 1816 1817 spdk_for_each_channel_continue(i, 0); 1818 } 1819 1820 /* This function marks the current trid as failed by storing the current ticks 1821 * and then sets the next trid to the active trid within a controller if exists. 1822 * 1823 * The purpose of the boolean return value is to request the caller to disconnect 1824 * the current trid now to try connecting the next trid. 
1825 */ 1826 static bool 1827 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1828 { 1829 struct nvme_path_id *path_id, *next_path; 1830 int rc __attribute__((unused)); 1831 1832 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1833 assert(path_id); 1834 assert(path_id == nvme_ctrlr->active_path_id); 1835 next_path = TAILQ_NEXT(path_id, link); 1836 1837 /* Update the last failed time. It means the trid is failed if its last 1838 * failed time is non-zero. 1839 */ 1840 path_id->last_failed_tsc = spdk_get_ticks(); 1841 1842 if (next_path == NULL) { 1843 /* There is no alternate trid within a controller. */ 1844 return false; 1845 } 1846 1847 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1848 /* Connect is not retried in a controller reset sequence. Connecting 1849 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1850 */ 1851 return false; 1852 } 1853 1854 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1855 1856 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1857 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1858 1859 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1860 nvme_ctrlr->active_path_id = next_path; 1861 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1862 assert(rc == 0); 1863 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1864 if (!remove) { 1865 /** Shuffle the old trid to the end of the list and use the new one. 1866 * Allows for round robin through multiple connections. 1867 */ 1868 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1869 } else { 1870 free(path_id); 1871 } 1872 1873 if (start || next_path->last_failed_tsc == 0) { 1874 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1875 * or used yet. Try the next trid now. 
1876 */ 1877 return true; 1878 } 1879 1880 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1881 nvme_ctrlr->opts.reconnect_delay_sec) { 1882 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1883 return true; 1884 } 1885 1886 /* The next trid will be tried after reconnect_delay_sec seconds. */ 1887 return false; 1888 } 1889 1890 static bool 1891 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1892 { 1893 int32_t elapsed; 1894 1895 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1896 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1897 return false; 1898 } 1899 1900 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1901 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1902 return true; 1903 } else { 1904 return false; 1905 } 1906 } 1907 1908 static bool 1909 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1910 { 1911 uint32_t elapsed; 1912 1913 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1914 return false; 1915 } 1916 1917 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1918 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1919 return true; 1920 } else { 1921 return false; 1922 } 1923 } 1924 1925 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1926 1927 static void 1928 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1929 { 1930 int rc; 1931 1932 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1933 if (rc != 0) { 1934 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1935 * fail the reset sequence immediately. 1936 */ 1937 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1938 return; 1939 } 1940 1941 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 
1942 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1943 */ 1944 assert(nvme_ctrlr->disconnected_cb == NULL); 1945 nvme_ctrlr->disconnected_cb = cb_fn; 1946 1947 /* During disconnection, reduce the period to poll adminq more often. */ 1948 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1949 } 1950 1951 enum bdev_nvme_op_after_reset { 1952 OP_NONE, 1953 OP_COMPLETE_PENDING_DESTRUCT, 1954 OP_DESTRUCT, 1955 OP_DELAYED_RECONNECT, 1956 OP_FAILOVER, 1957 }; 1958 1959 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1960 1961 static _bdev_nvme_op_after_reset 1962 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1963 { 1964 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1965 /* Complete pending destruct after reset completes. */ 1966 return OP_COMPLETE_PENDING_DESTRUCT; 1967 } else if (nvme_ctrlr->pending_failover) { 1968 nvme_ctrlr->pending_failover = false; 1969 nvme_ctrlr->reset_start_tsc = 0; 1970 return OP_FAILOVER; 1971 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1972 nvme_ctrlr->reset_start_tsc = 0; 1973 return OP_NONE; 1974 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1975 return OP_DESTRUCT; 1976 } else { 1977 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1978 nvme_ctrlr->fast_io_fail_timedout = true; 1979 } 1980 return OP_DELAYED_RECONNECT; 1981 } 1982 } 1983 1984 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1985 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1986 1987 static int 1988 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1989 { 1990 struct nvme_ctrlr *nvme_ctrlr = ctx; 1991 1992 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1993 pthread_mutex_lock(&nvme_ctrlr->mutex); 1994 1995 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1996 1997 if (!nvme_ctrlr->reconnect_is_delayed) { 1998 
pthread_mutex_unlock(&nvme_ctrlr->mutex); 1999 return SPDK_POLLER_BUSY; 2000 } 2001 2002 nvme_ctrlr->reconnect_is_delayed = false; 2003 2004 if (nvme_ctrlr->destruct) { 2005 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2006 return SPDK_POLLER_BUSY; 2007 } 2008 2009 assert(nvme_ctrlr->resetting == false); 2010 nvme_ctrlr->resetting = true; 2011 2012 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2013 2014 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2015 2016 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2017 return SPDK_POLLER_BUSY; 2018 } 2019 2020 static void 2021 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2022 { 2023 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2024 2025 assert(nvme_ctrlr->reconnect_is_delayed == false); 2026 nvme_ctrlr->reconnect_is_delayed = true; 2027 2028 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2029 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2030 nvme_ctrlr, 2031 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2032 } 2033 2034 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2035 2036 static void 2037 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2038 { 2039 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2040 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2041 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2042 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2043 enum bdev_nvme_op_after_reset op_after_reset; 2044 2045 assert(nvme_ctrlr->thread == spdk_get_thread()); 2046 2047 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2048 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2049 2050 if (!success) { 2051 SPDK_ERRLOG("Resetting controller failed.\n"); 2052 } else { 2053 SPDK_NOTICELOG("Resetting controller successful.\n"); 2054 } 2055 2056 pthread_mutex_lock(&nvme_ctrlr->mutex); 2057 nvme_ctrlr->resetting = false; 2058 nvme_ctrlr->dont_retry = false; 2059 
nvme_ctrlr->in_failover = false; 2060 2061 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2062 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2063 2064 /* Delay callbacks when the next operation is a failover. */ 2065 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2066 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1); 2067 } 2068 2069 switch (op_after_reset) { 2070 case OP_COMPLETE_PENDING_DESTRUCT: 2071 nvme_ctrlr_unregister(nvme_ctrlr); 2072 break; 2073 case OP_DESTRUCT: 2074 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2075 remove_discovery_entry(nvme_ctrlr); 2076 break; 2077 case OP_DELAYED_RECONNECT: 2078 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2079 break; 2080 case OP_FAILOVER: 2081 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2082 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2083 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2084 break; 2085 default: 2086 break; 2087 } 2088 } 2089 2090 static void 2091 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2092 { 2093 pthread_mutex_lock(&nvme_ctrlr->mutex); 2094 if (!success) { 2095 /* Connecting the active trid failed. Set the next alternate trid to the 2096 * active trid if it exists. 2097 */ 2098 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2099 /* The next alternate trid exists and is ready to try. Try it now. */ 2100 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2101 2102 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2103 return; 2104 } 2105 2106 /* We came here if there is no alternate trid or if the next trid exists but 2107 * is not ready to try. We will try the active trid after reconnect_delay_sec 2108 * seconds if it is non-zero or at the next reset call otherwise. 2109 */ 2110 } else { 2111 /* Connecting the active trid succeeded. Clear the last failed time because it 2112 * means the trid is failed if its last failed time is non-zero. 
2113 */ 2114 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2115 } 2116 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2117 2118 /* Make sure we clear any pending resets before returning. */ 2119 spdk_for_each_channel(nvme_ctrlr, 2120 bdev_nvme_complete_pending_resets, 2121 success ? NULL : (void *)0x1, 2122 _bdev_nvme_reset_ctrlr_complete); 2123 } 2124 2125 static void 2126 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2127 { 2128 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2129 2130 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2131 } 2132 2133 static void 2134 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2135 { 2136 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2137 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2138 struct nvme_qpair *nvme_qpair; 2139 2140 nvme_qpair = ctrlr_ch->qpair; 2141 assert(nvme_qpair != NULL); 2142 2143 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2144 2145 if (nvme_qpair->qpair != NULL) { 2146 if (nvme_qpair->ctrlr->dont_retry) { 2147 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2148 } 2149 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2150 2151 /* The current full reset sequence will move to the next 2152 * ctrlr_channel after the qpair is actually disconnected. 2153 */ 2154 assert(ctrlr_ch->reset_iter == NULL); 2155 ctrlr_ch->reset_iter = i; 2156 } else { 2157 spdk_for_each_channel_continue(i, 0); 2158 } 2159 } 2160 2161 static void 2162 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2163 { 2164 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2165 2166 if (status == 0) { 2167 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2168 } else { 2169 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
*/ 2170 spdk_for_each_channel(nvme_ctrlr, 2171 bdev_nvme_reset_destroy_qpair, 2172 NULL, 2173 bdev_nvme_reset_create_qpairs_failed); 2174 } 2175 } 2176 2177 static int 2178 bdev_nvme_reset_check_qpair_connected(void *ctx) 2179 { 2180 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2181 2182 if (ctrlr_ch->reset_iter == NULL) { 2183 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2184 assert(ctrlr_ch->connect_poller == NULL); 2185 assert(ctrlr_ch->qpair->qpair == NULL); 2186 return SPDK_POLLER_BUSY; 2187 } 2188 2189 assert(ctrlr_ch->qpair->qpair != NULL); 2190 2191 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2192 return SPDK_POLLER_BUSY; 2193 } 2194 2195 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2196 2197 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2198 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2199 ctrlr_ch->reset_iter = NULL; 2200 2201 if (!g_opts.disable_auto_failback) { 2202 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2203 } 2204 2205 return SPDK_POLLER_BUSY; 2206 } 2207 2208 static void 2209 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2210 { 2211 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2212 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2213 int rc; 2214 2215 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2216 if (rc == 0) { 2217 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2218 ctrlr_ch, 0); 2219 2220 /* The current full reset sequence will move to the next 2221 * ctrlr_channel after the qpair is actually connected. 
2222 */ 2223 assert(ctrlr_ch->reset_iter == NULL); 2224 ctrlr_ch->reset_iter = i; 2225 } else { 2226 spdk_for_each_channel_continue(i, rc); 2227 } 2228 } 2229 2230 static void 2231 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2232 { 2233 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2234 struct nvme_ns *nvme_ns; 2235 2236 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2237 nvme_ns != NULL; 2238 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2239 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2240 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2241 /* NS can be added again. Just nullify nvme_ns->ns. */ 2242 nvme_ns->ns = NULL; 2243 } 2244 } 2245 } 2246 2247 2248 static int 2249 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2250 { 2251 struct nvme_ctrlr *nvme_ctrlr = arg; 2252 int rc = -ETIMEDOUT; 2253 2254 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2255 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2256 if (rc == -EAGAIN) { 2257 return SPDK_POLLER_BUSY; 2258 } 2259 } 2260 2261 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2262 if (rc == 0) { 2263 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2264 2265 /* Recreate all of the I/O queue pairs */ 2266 spdk_for_each_channel(nvme_ctrlr, 2267 bdev_nvme_reset_create_qpair, 2268 NULL, 2269 bdev_nvme_reset_create_qpairs_done); 2270 } else { 2271 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2272 } 2273 return SPDK_POLLER_BUSY; 2274 } 2275 2276 static void 2277 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2278 { 2279 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2280 2281 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2282 assert(nvme_ctrlr->reset_detach_poller == NULL); 2283 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2284 nvme_ctrlr, 0); 2285 } 2286 2287 static void 2288 bdev_nvme_reset_destroy_qpair_done(struct 
spdk_io_channel_iter *i, int status) 2289 { 2290 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2291 2292 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2293 assert(status == 0); 2294 2295 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2296 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2297 } else { 2298 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2299 } 2300 } 2301 2302 static void 2303 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2304 { 2305 spdk_for_each_channel(nvme_ctrlr, 2306 bdev_nvme_reset_destroy_qpair, 2307 NULL, 2308 bdev_nvme_reset_destroy_qpair_done); 2309 } 2310 2311 static void 2312 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2313 { 2314 struct nvme_ctrlr *nvme_ctrlr = ctx; 2315 2316 assert(nvme_ctrlr->resetting == true); 2317 assert(nvme_ctrlr->thread == spdk_get_thread()); 2318 2319 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2320 2321 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2322 2323 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2324 } 2325 2326 static void 2327 _bdev_nvme_reset_ctrlr(void *ctx) 2328 { 2329 struct nvme_ctrlr *nvme_ctrlr = ctx; 2330 2331 assert(nvme_ctrlr->resetting == true); 2332 assert(nvme_ctrlr->thread == spdk_get_thread()); 2333 2334 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2335 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2336 } else { 2337 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2338 } 2339 } 2340 2341 static int 2342 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2343 { 2344 spdk_msg_fn msg_fn; 2345 2346 pthread_mutex_lock(&nvme_ctrlr->mutex); 2347 if (nvme_ctrlr->destruct) { 2348 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2349 return -ENXIO; 2350 } 2351 2352 if (nvme_ctrlr->resetting) { 2353 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2354 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2355 return -EBUSY; 2356 } 2357 2358 if (nvme_ctrlr->disabled) { 2359 
pthread_mutex_unlock(&nvme_ctrlr->mutex); 2360 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2361 return -EALREADY; 2362 } 2363 2364 nvme_ctrlr->resetting = true; 2365 nvme_ctrlr->dont_retry = true; 2366 2367 if (nvme_ctrlr->reconnect_is_delayed) { 2368 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2369 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2370 nvme_ctrlr->reconnect_is_delayed = false; 2371 } else { 2372 msg_fn = _bdev_nvme_reset_ctrlr; 2373 assert(nvme_ctrlr->reset_start_tsc == 0); 2374 } 2375 2376 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2377 2378 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2379 2380 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2381 return 0; 2382 } 2383 2384 static int 2385 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2386 { 2387 pthread_mutex_lock(&nvme_ctrlr->mutex); 2388 if (nvme_ctrlr->destruct) { 2389 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2390 return -ENXIO; 2391 } 2392 2393 if (nvme_ctrlr->resetting) { 2394 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2395 return -EBUSY; 2396 } 2397 2398 if (!nvme_ctrlr->disabled) { 2399 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2400 return -EALREADY; 2401 } 2402 2403 nvme_ctrlr->disabled = false; 2404 nvme_ctrlr->resetting = true; 2405 2406 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2407 2408 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2409 2410 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2411 return 0; 2412 } 2413 2414 static void 2415 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2416 { 2417 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2418 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2419 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2420 enum bdev_nvme_op_after_reset op_after_disable; 2421 2422 assert(nvme_ctrlr->thread == spdk_get_thread()); 2423 2424 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2425 
nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2426 2427 pthread_mutex_lock(&nvme_ctrlr->mutex); 2428 2429 nvme_ctrlr->resetting = false; 2430 nvme_ctrlr->dont_retry = false; 2431 2432 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2433 2434 nvme_ctrlr->disabled = true; 2435 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2436 2437 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2438 2439 if (ctrlr_op_cb_fn) { 2440 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2441 } 2442 2443 switch (op_after_disable) { 2444 case OP_COMPLETE_PENDING_DESTRUCT: 2445 nvme_ctrlr_unregister(nvme_ctrlr); 2446 break; 2447 default: 2448 break; 2449 } 2450 2451 } 2452 2453 static void 2454 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2455 { 2456 /* Make sure we clear any pending resets before returning. */ 2457 spdk_for_each_channel(nvme_ctrlr, 2458 bdev_nvme_complete_pending_resets, 2459 NULL, 2460 _bdev_nvme_disable_ctrlr_complete); 2461 } 2462 2463 static void 2464 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2465 { 2466 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2467 2468 assert(status == 0); 2469 2470 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2471 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2472 } else { 2473 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2474 } 2475 } 2476 2477 static void 2478 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2479 { 2480 spdk_for_each_channel(nvme_ctrlr, 2481 bdev_nvme_reset_destroy_qpair, 2482 NULL, 2483 bdev_nvme_disable_destroy_qpairs_done); 2484 } 2485 2486 static void 2487 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2488 { 2489 struct nvme_ctrlr *nvme_ctrlr = ctx; 2490 2491 assert(nvme_ctrlr->resetting == true); 2492 assert(nvme_ctrlr->thread == spdk_get_thread()); 2493 2494 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2495 2496 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2497 } 2498 
2499 static void 2500 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2501 { 2502 struct nvme_ctrlr *nvme_ctrlr = ctx; 2503 2504 assert(nvme_ctrlr->resetting == true); 2505 assert(nvme_ctrlr->thread == spdk_get_thread()); 2506 2507 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2508 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2509 } else { 2510 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2511 } 2512 } 2513 2514 static int 2515 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2516 { 2517 spdk_msg_fn msg_fn; 2518 2519 pthread_mutex_lock(&nvme_ctrlr->mutex); 2520 if (nvme_ctrlr->destruct) { 2521 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2522 return -ENXIO; 2523 } 2524 2525 if (nvme_ctrlr->resetting) { 2526 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2527 return -EBUSY; 2528 } 2529 2530 if (nvme_ctrlr->disabled) { 2531 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2532 return -EALREADY; 2533 } 2534 2535 nvme_ctrlr->resetting = true; 2536 nvme_ctrlr->dont_retry = true; 2537 2538 if (nvme_ctrlr->reconnect_is_delayed) { 2539 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2540 nvme_ctrlr->reconnect_is_delayed = false; 2541 } else { 2542 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2543 } 2544 2545 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2546 2547 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2548 2549 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2550 return 0; 2551 } 2552 2553 static int 2554 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2555 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2556 { 2557 int rc; 2558 2559 switch (op) { 2560 case NVME_CTRLR_OP_RESET: 2561 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2562 break; 2563 case NVME_CTRLR_OP_ENABLE: 2564 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2565 break; 2566 case NVME_CTRLR_OP_DISABLE: 2567 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2568 break; 2569 default: 2570 rc = -EINVAL; 2571 break; 2572 } 2573 2574 if (rc == 0) { 2575 
assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2576 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2577 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2578 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2579 } 2580 return rc; 2581 } 2582 2583 struct nvme_ctrlr_op_rpc_ctx { 2584 struct nvme_ctrlr *nvme_ctrlr; 2585 struct spdk_thread *orig_thread; 2586 enum nvme_ctrlr_op op; 2587 int rc; 2588 bdev_nvme_ctrlr_op_cb cb_fn; 2589 void *cb_arg; 2590 }; 2591 2592 static void 2593 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2594 { 2595 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2596 2597 assert(ctx != NULL); 2598 assert(ctx->cb_fn != NULL); 2599 2600 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2601 2602 free(ctx); 2603 } 2604 2605 static void 2606 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2607 { 2608 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2609 2610 ctx->rc = rc; 2611 2612 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2613 } 2614 2615 void 2616 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2617 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2618 { 2619 struct nvme_ctrlr_op_rpc_ctx *ctx; 2620 int rc; 2621 2622 assert(cb_fn != NULL); 2623 2624 ctx = calloc(1, sizeof(*ctx)); 2625 if (ctx == NULL) { 2626 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2627 cb_fn(cb_arg, -ENOMEM); 2628 return; 2629 } 2630 2631 ctx->orig_thread = spdk_get_thread(); 2632 ctx->cb_fn = cb_fn; 2633 ctx->cb_arg = cb_arg; 2634 2635 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2636 if (rc == 0) { 2637 return; 2638 } else if (rc == -EALREADY) { 2639 rc = 0; 2640 } 2641 2642 nvme_ctrlr_op_rpc_complete(ctx, rc); 2643 } 2644 2645 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2646 2647 static void 2648 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2649 { 2650 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2651 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2652 int rc; 2653 2654 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2655 
ctx->nvme_ctrlr = NULL; 2656 2657 if (ctx->rc != 0) { 2658 goto complete; 2659 } 2660 2661 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2662 if (next_nvme_ctrlr == NULL) { 2663 goto complete; 2664 } 2665 2666 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2667 if (rc == 0) { 2668 ctx->nvme_ctrlr = next_nvme_ctrlr; 2669 return; 2670 } else if (rc == -EALREADY) { 2671 ctx->nvme_ctrlr = next_nvme_ctrlr; 2672 rc = 0; 2673 } 2674 2675 ctx->rc = rc; 2676 2677 complete: 2678 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2679 free(ctx); 2680 } 2681 2682 static void 2683 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2684 { 2685 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2686 2687 ctx->rc = rc; 2688 2689 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2690 } 2691 2692 void 2693 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2694 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2695 { 2696 struct nvme_ctrlr_op_rpc_ctx *ctx; 2697 struct nvme_ctrlr *nvme_ctrlr; 2698 int rc; 2699 2700 assert(cb_fn != NULL); 2701 2702 ctx = calloc(1, sizeof(*ctx)); 2703 if (ctx == NULL) { 2704 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2705 cb_fn(cb_arg, -ENOMEM); 2706 return; 2707 } 2708 2709 ctx->orig_thread = spdk_get_thread(); 2710 ctx->op = op; 2711 ctx->cb_fn = cb_fn; 2712 ctx->cb_arg = cb_arg; 2713 2714 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2715 assert(nvme_ctrlr != NULL); 2716 2717 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2718 if (rc == 0) { 2719 ctx->nvme_ctrlr = nvme_ctrlr; 2720 return; 2721 } else if (rc == -EALREADY) { 2722 ctx->nvme_ctrlr = nvme_ctrlr; 2723 rc = 0; 2724 } 2725 2726 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2727 } 2728 2729 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2730 2731 static void 2732 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int 
status) 2733 { 2734 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2735 enum spdk_bdev_io_status io_status; 2736 2737 if (bio->cpl.cdw0 == 0) { 2738 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2739 } else { 2740 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2741 } 2742 2743 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2744 } 2745 2746 static void 2747 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2748 { 2749 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2750 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2751 2752 bdev_nvme_abort_retry_ios(nbdev_ch); 2753 2754 spdk_for_each_channel_continue(i, 0); 2755 } 2756 2757 static void 2758 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2759 { 2760 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2761 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2762 2763 /* Abort all queued I/Os for retry. */ 2764 spdk_for_each_channel(nbdev, 2765 bdev_nvme_abort_bdev_channel, 2766 bio, 2767 _bdev_nvme_reset_io_complete); 2768 } 2769 2770 static void 2771 _bdev_nvme_reset_io_continue(void *ctx) 2772 { 2773 struct nvme_bdev_io *bio = ctx; 2774 struct nvme_io_path *prev_io_path, *next_io_path; 2775 int rc; 2776 2777 prev_io_path = bio->io_path; 2778 bio->io_path = NULL; 2779 2780 if (bio->cpl.cdw0 != 0) { 2781 goto complete; 2782 } 2783 2784 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2785 if (next_io_path == NULL) { 2786 goto complete; 2787 } 2788 2789 rc = _bdev_nvme_reset_io(next_io_path, bio); 2790 if (rc == 0) { 2791 return; 2792 } 2793 2794 bio->cpl.cdw0 = 1; 2795 2796 complete: 2797 bdev_nvme_reset_io_complete(bio); 2798 } 2799 2800 static void 2801 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2802 { 2803 struct nvme_bdev_io *bio = cb_arg; 2804 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2805 2806 bio->cpl.cdw0 = (rc == 0) ? 
0 : 1; 2807 2808 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2809 } 2810 2811 static int 2812 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2813 { 2814 struct nvme_ctrlr_channel *ctrlr_ch; 2815 int rc; 2816 2817 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2818 bdev_nvme_reset_io_continue, bio); 2819 if (rc != 0 && rc != -EBUSY) { 2820 return rc; 2821 } 2822 2823 assert(bio->io_path == NULL); 2824 bio->io_path = io_path; 2825 2826 if (rc == -EBUSY) { 2827 ctrlr_ch = io_path->qpair->ctrlr_ch; 2828 assert(ctrlr_ch != NULL); 2829 /* 2830 * Reset call is queued only if it is from the app framework. This is on purpose so that 2831 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2832 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2833 */ 2834 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2835 } 2836 2837 return 0; 2838 } 2839 2840 static void 2841 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2842 { 2843 struct nvme_io_path *io_path; 2844 int rc; 2845 2846 bio->cpl.cdw0 = 0; 2847 2848 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2849 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2850 assert(io_path != NULL); 2851 2852 rc = _bdev_nvme_reset_io(io_path, bio); 2853 if (rc != 0) { 2854 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2855 rc = (rc == -EALREADY) ? 0 : rc; 2856 2857 bdev_nvme_reset_io_continue(bio, rc); 2858 } 2859 } 2860 2861 static int 2862 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2863 { 2864 if (nvme_ctrlr->destruct) { 2865 /* Don't bother resetting if the controller is in the process of being destructed. 
*/ 2866 return -ENXIO; 2867 } 2868 2869 if (nvme_ctrlr->resetting) { 2870 if (!nvme_ctrlr->in_failover) { 2871 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2872 2873 /* Defer failover until reset completes. */ 2874 nvme_ctrlr->pending_failover = true; 2875 return -EINPROGRESS; 2876 } else { 2877 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2878 return -EBUSY; 2879 } 2880 } 2881 2882 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2883 2884 if (nvme_ctrlr->reconnect_is_delayed) { 2885 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2886 2887 /* We rely on the next reconnect for the failover. */ 2888 return -EALREADY; 2889 } 2890 2891 if (nvme_ctrlr->disabled) { 2892 SPDK_NOTICELOG("Controller is disabled.\n"); 2893 2894 /* We rely on the enablement for the failover. */ 2895 return -EALREADY; 2896 } 2897 2898 nvme_ctrlr->resetting = true; 2899 nvme_ctrlr->in_failover = true; 2900 2901 assert(nvme_ctrlr->reset_start_tsc == 0); 2902 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2903 2904 return 0; 2905 } 2906 2907 static int 2908 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2909 { 2910 int rc; 2911 2912 pthread_mutex_lock(&nvme_ctrlr->mutex); 2913 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2914 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2915 2916 if (rc == 0) { 2917 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2918 } else if (rc == -EALREADY) { 2919 rc = 0; 2920 } 2921 2922 return rc; 2923 } 2924 2925 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2926 uint64_t num_blocks); 2927 2928 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2929 uint64_t num_blocks); 2930 2931 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2932 uint64_t src_offset_blocks, 2933 uint64_t num_blocks); 2934 2935 static void 2936 bdev_nvme_get_buf_cb(struct 
spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2937 bool success) 2938 { 2939 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2940 int ret; 2941 2942 if (!success) { 2943 ret = -EINVAL; 2944 goto exit; 2945 } 2946 2947 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2948 ret = -ENXIO; 2949 goto exit; 2950 } 2951 2952 ret = bdev_nvme_readv(bio, 2953 bdev_io->u.bdev.iovs, 2954 bdev_io->u.bdev.iovcnt, 2955 bdev_io->u.bdev.md_buf, 2956 bdev_io->u.bdev.num_blocks, 2957 bdev_io->u.bdev.offset_blocks, 2958 bdev_io->u.bdev.dif_check_flags, 2959 bdev_io->u.bdev.memory_domain, 2960 bdev_io->u.bdev.memory_domain_ctx, 2961 bdev_io->u.bdev.accel_sequence); 2962 2963 exit: 2964 if (spdk_unlikely(ret != 0)) { 2965 bdev_nvme_io_complete(bio, ret); 2966 } 2967 } 2968 2969 static inline void 2970 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2971 { 2972 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2973 struct spdk_bdev *bdev = bdev_io->bdev; 2974 struct nvme_bdev_io *nbdev_io_to_abort; 2975 int rc = 0; 2976 2977 switch (bdev_io->type) { 2978 case SPDK_BDEV_IO_TYPE_READ: 2979 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2980 2981 rc = bdev_nvme_readv(nbdev_io, 2982 bdev_io->u.bdev.iovs, 2983 bdev_io->u.bdev.iovcnt, 2984 bdev_io->u.bdev.md_buf, 2985 bdev_io->u.bdev.num_blocks, 2986 bdev_io->u.bdev.offset_blocks, 2987 bdev_io->u.bdev.dif_check_flags, 2988 bdev_io->u.bdev.memory_domain, 2989 bdev_io->u.bdev.memory_domain_ctx, 2990 bdev_io->u.bdev.accel_sequence); 2991 } else { 2992 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2993 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2994 rc = 0; 2995 } 2996 break; 2997 case SPDK_BDEV_IO_TYPE_WRITE: 2998 rc = bdev_nvme_writev(nbdev_io, 2999 bdev_io->u.bdev.iovs, 3000 bdev_io->u.bdev.iovcnt, 3001 bdev_io->u.bdev.md_buf, 3002 bdev_io->u.bdev.num_blocks, 3003 bdev_io->u.bdev.offset_blocks, 3004 
bdev_io->u.bdev.dif_check_flags, 3005 bdev_io->u.bdev.memory_domain, 3006 bdev_io->u.bdev.memory_domain_ctx, 3007 bdev_io->u.bdev.accel_sequence); 3008 break; 3009 case SPDK_BDEV_IO_TYPE_COMPARE: 3010 rc = bdev_nvme_comparev(nbdev_io, 3011 bdev_io->u.bdev.iovs, 3012 bdev_io->u.bdev.iovcnt, 3013 bdev_io->u.bdev.md_buf, 3014 bdev_io->u.bdev.num_blocks, 3015 bdev_io->u.bdev.offset_blocks, 3016 bdev_io->u.bdev.dif_check_flags); 3017 break; 3018 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3019 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3020 bdev_io->u.bdev.iovs, 3021 bdev_io->u.bdev.iovcnt, 3022 bdev_io->u.bdev.fused_iovs, 3023 bdev_io->u.bdev.fused_iovcnt, 3024 bdev_io->u.bdev.md_buf, 3025 bdev_io->u.bdev.num_blocks, 3026 bdev_io->u.bdev.offset_blocks, 3027 bdev_io->u.bdev.dif_check_flags); 3028 break; 3029 case SPDK_BDEV_IO_TYPE_UNMAP: 3030 rc = bdev_nvme_unmap(nbdev_io, 3031 bdev_io->u.bdev.offset_blocks, 3032 bdev_io->u.bdev.num_blocks); 3033 break; 3034 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3035 rc = bdev_nvme_write_zeroes(nbdev_io, 3036 bdev_io->u.bdev.offset_blocks, 3037 bdev_io->u.bdev.num_blocks); 3038 break; 3039 case SPDK_BDEV_IO_TYPE_RESET: 3040 nbdev_io->io_path = NULL; 3041 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3042 return; 3043 3044 case SPDK_BDEV_IO_TYPE_FLUSH: 3045 bdev_nvme_io_complete(nbdev_io, 0); 3046 return; 3047 3048 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3049 rc = bdev_nvme_zone_appendv(nbdev_io, 3050 bdev_io->u.bdev.iovs, 3051 bdev_io->u.bdev.iovcnt, 3052 bdev_io->u.bdev.md_buf, 3053 bdev_io->u.bdev.num_blocks, 3054 bdev_io->u.bdev.offset_blocks, 3055 bdev_io->u.bdev.dif_check_flags); 3056 break; 3057 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3058 rc = bdev_nvme_get_zone_info(nbdev_io, 3059 bdev_io->u.zone_mgmt.zone_id, 3060 bdev_io->u.zone_mgmt.num_zones, 3061 bdev_io->u.zone_mgmt.buf); 3062 break; 3063 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3064 rc = bdev_nvme_zone_management(nbdev_io, 3065 bdev_io->u.zone_mgmt.zone_id, 3066 
bdev_io->u.zone_mgmt.zone_action); 3067 break; 3068 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3069 nbdev_io->io_path = NULL; 3070 bdev_nvme_admin_passthru(nbdev_ch, 3071 nbdev_io, 3072 &bdev_io->u.nvme_passthru.cmd, 3073 bdev_io->u.nvme_passthru.buf, 3074 bdev_io->u.nvme_passthru.nbytes); 3075 return; 3076 3077 case SPDK_BDEV_IO_TYPE_NVME_IO: 3078 rc = bdev_nvme_io_passthru(nbdev_io, 3079 &bdev_io->u.nvme_passthru.cmd, 3080 bdev_io->u.nvme_passthru.buf, 3081 bdev_io->u.nvme_passthru.nbytes); 3082 break; 3083 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3084 rc = bdev_nvme_io_passthru_md(nbdev_io, 3085 &bdev_io->u.nvme_passthru.cmd, 3086 bdev_io->u.nvme_passthru.buf, 3087 bdev_io->u.nvme_passthru.nbytes, 3088 bdev_io->u.nvme_passthru.md_buf, 3089 bdev_io->u.nvme_passthru.md_len); 3090 break; 3091 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3092 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3093 &bdev_io->u.nvme_passthru.cmd, 3094 bdev_io->u.nvme_passthru.iovs, 3095 bdev_io->u.nvme_passthru.iovcnt, 3096 bdev_io->u.nvme_passthru.nbytes, 3097 bdev_io->u.nvme_passthru.md_buf, 3098 bdev_io->u.nvme_passthru.md_len); 3099 break; 3100 case SPDK_BDEV_IO_TYPE_ABORT: 3101 nbdev_io->io_path = NULL; 3102 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3103 bdev_nvme_abort(nbdev_ch, 3104 nbdev_io, 3105 nbdev_io_to_abort); 3106 return; 3107 3108 case SPDK_BDEV_IO_TYPE_COPY: 3109 rc = bdev_nvme_copy(nbdev_io, 3110 bdev_io->u.bdev.offset_blocks, 3111 bdev_io->u.bdev.copy.src_offset_blocks, 3112 bdev_io->u.bdev.num_blocks); 3113 break; 3114 default: 3115 rc = -EINVAL; 3116 break; 3117 } 3118 3119 if (spdk_unlikely(rc != 0)) { 3120 bdev_nvme_io_complete(nbdev_io, rc); 3121 } 3122 } 3123 3124 static void 3125 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3126 { 3127 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3128 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3129 3130 if 
(spdk_likely(nbdev_io->submit_tsc == 0)) { 3131 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3132 } else { 3133 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3134 * We need to update submit_tsc here. 3135 */ 3136 nbdev_io->submit_tsc = spdk_get_ticks(); 3137 } 3138 3139 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3140 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3141 if (spdk_unlikely(!nbdev_io->io_path)) { 3142 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3143 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3144 return; 3145 } 3146 3147 /* Admin commands do not use the optimal I/O path. 3148 * Simply fall through even if it is not found. 3149 */ 3150 } 3151 3152 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3153 } 3154 3155 static bool 3156 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3157 { 3158 struct nvme_bdev *nbdev = ctx; 3159 struct nvme_ns *nvme_ns; 3160 struct spdk_nvme_ns *ns; 3161 struct spdk_nvme_ctrlr *ctrlr; 3162 const struct spdk_nvme_ctrlr_data *cdata; 3163 3164 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3165 assert(nvme_ns != NULL); 3166 ns = nvme_ns->ns; 3167 if (ns == NULL) { 3168 return false; 3169 } 3170 3171 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3172 3173 switch (io_type) { 3174 case SPDK_BDEV_IO_TYPE_READ: 3175 case SPDK_BDEV_IO_TYPE_WRITE: 3176 case SPDK_BDEV_IO_TYPE_RESET: 3177 case SPDK_BDEV_IO_TYPE_FLUSH: 3178 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3179 case SPDK_BDEV_IO_TYPE_NVME_IO: 3180 case SPDK_BDEV_IO_TYPE_ABORT: 3181 return true; 3182 3183 case SPDK_BDEV_IO_TYPE_COMPARE: 3184 return spdk_nvme_ns_supports_compare(ns); 3185 3186 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3187 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3188 3189 case SPDK_BDEV_IO_TYPE_UNMAP: 3190 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3191 return cdata->oncs.dsm; 3192 3193 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3194 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3195 return cdata->oncs.write_zeroes; 3196 3197 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3198 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3199 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3200 return true; 3201 } 3202 return false; 3203 3204 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3205 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3206 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3207 3208 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3209 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3210 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3211 3212 case SPDK_BDEV_IO_TYPE_COPY: 3213 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3214 return cdata->oncs.copy; 3215 3216 default: 3217 return false; 3218 } 3219 } 3220 3221 static int 3222 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3223 { 3224 struct nvme_qpair *nvme_qpair; 3225 struct spdk_io_channel *pg_ch; 3226 int rc; 3227 3228 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3229 if (!nvme_qpair) { 3230 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3231 return -1; 3232 } 3233 3234 TAILQ_INIT(&nvme_qpair->io_path_list); 3235 3236 nvme_qpair->ctrlr = nvme_ctrlr; 3237 nvme_qpair->ctrlr_ch = ctrlr_ch; 3238 3239 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3240 if (!pg_ch) { 3241 free(nvme_qpair); 3242 return -1; 3243 } 3244 3245 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3246 3247 #ifdef SPDK_CONFIG_VTUNE 3248 nvme_qpair->group->collect_spin_stat = true; 3249 #else 3250 nvme_qpair->group->collect_spin_stat = false; 3251 #endif 3252 3253 if (!nvme_ctrlr->disabled) { 3254 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3255 * be created when it's enabled. 
3256 */ 3257 rc = bdev_nvme_create_qpair(nvme_qpair); 3258 if (rc != 0) { 3259 /* nvme_ctrlr can't create IO qpair if connection is down. 3260 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3261 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3262 * submitted IO will be queued until IO qpair is successfully created. 3263 * 3264 * Hence, if both are satisfied, ignore the failure. 3265 */ 3266 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3267 spdk_put_io_channel(pg_ch); 3268 free(nvme_qpair); 3269 return rc; 3270 } 3271 } 3272 } 3273 3274 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3275 3276 ctrlr_ch->qpair = nvme_qpair; 3277 3278 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3279 nvme_qpair->ctrlr->ref++; 3280 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3281 3282 return 0; 3283 } 3284 3285 static int 3286 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3287 { 3288 struct nvme_ctrlr *nvme_ctrlr = io_device; 3289 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3290 3291 TAILQ_INIT(&ctrlr_ch->pending_resets); 3292 3293 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3294 } 3295 3296 static void 3297 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3298 { 3299 struct nvme_io_path *io_path, *next; 3300 3301 assert(nvme_qpair->group != NULL); 3302 3303 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3304 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3305 nvme_io_path_free(io_path); 3306 } 3307 3308 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3309 3310 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3311 3312 nvme_ctrlr_release(nvme_qpair->ctrlr); 3313 3314 free(nvme_qpair); 3315 } 3316 3317 static void 3318 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3319 { 3320 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3321 struct nvme_qpair *nvme_qpair; 3322 3323 
nvme_qpair = ctrlr_ch->qpair; 3324 assert(nvme_qpair != NULL); 3325 3326 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3327 3328 if (nvme_qpair->qpair != NULL) { 3329 if (ctrlr_ch->reset_iter == NULL) { 3330 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3331 } else { 3332 /* Skip current ctrlr_channel in a full reset sequence because 3333 * it is being deleted now. The qpair is already being disconnected. 3334 * We do not have to restart disconnecting it. 3335 */ 3336 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3337 } 3338 3339 /* We cannot release a reference to the poll group now. 3340 * The qpair may be disconnected asynchronously later. 3341 * We need to poll it until it is actually disconnected. 3342 * Just detach the qpair from the deleting ctrlr_channel. 3343 */ 3344 nvme_qpair->ctrlr_ch = NULL; 3345 } else { 3346 assert(ctrlr_ch->reset_iter == NULL); 3347 3348 nvme_qpair_delete(nvme_qpair); 3349 } 3350 } 3351 3352 static inline struct spdk_io_channel * 3353 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3354 { 3355 if (spdk_unlikely(!group->accel_channel)) { 3356 group->accel_channel = spdk_accel_get_io_channel(); 3357 if (!group->accel_channel) { 3358 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3359 group); 3360 return NULL; 3361 } 3362 } 3363 3364 return group->accel_channel; 3365 } 3366 3367 static void 3368 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3369 uint32_t iov_cnt, uint32_t seed, 3370 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3371 { 3372 struct spdk_io_channel *accel_ch; 3373 struct nvme_poll_group *group = ctx; 3374 int rc; 3375 3376 assert(cb_fn != NULL); 3377 3378 accel_ch = bdev_nvme_get_accel_channel(group); 3379 if (spdk_unlikely(accel_ch == NULL)) { 3380 cb_fn(cb_arg, -ENOMEM); 3381 return; 3382 } 3383 3384 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3385 if (rc) { 3386 /* For the two cases, 
spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3387 if (rc == -ENOMEM || rc == -EINVAL) { 3388 cb_fn(cb_arg, rc); 3389 } 3390 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3391 } 3392 } 3393 3394 static void 3395 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3396 { 3397 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3398 } 3399 3400 static void 3401 bdev_nvme_abort_sequence(void *seq) 3402 { 3403 spdk_accel_sequence_abort(seq); 3404 } 3405 3406 static void 3407 bdev_nvme_reverse_sequence(void *seq) 3408 { 3409 spdk_accel_sequence_reverse(seq); 3410 } 3411 3412 static int 3413 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3414 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3415 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3416 { 3417 struct spdk_io_channel *ch; 3418 struct nvme_poll_group *group = ctx; 3419 3420 ch = bdev_nvme_get_accel_channel(group); 3421 if (spdk_unlikely(ch == NULL)) { 3422 return -ENOMEM; 3423 } 3424 3425 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3426 domain, domain_ctx, seed, cb_fn, cb_arg); 3427 } 3428 3429 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3430 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3431 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3432 .append_crc32c = bdev_nvme_append_crc32c, 3433 .finish_sequence = bdev_nvme_finish_sequence, 3434 .reverse_sequence = bdev_nvme_reverse_sequence, 3435 .abort_sequence = bdev_nvme_abort_sequence, 3436 }; 3437 3438 static int 3439 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3440 { 3441 struct nvme_poll_group *group = ctx_buf; 3442 3443 TAILQ_INIT(&group->qpair_list); 3444 3445 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3446 if (group->group == NULL) { 3447 return -1; 3448 } 3449 3450 
group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3451 3452 if (group->poller == NULL) { 3453 spdk_nvme_poll_group_destroy(group->group); 3454 return -1; 3455 } 3456 3457 return 0; 3458 } 3459 3460 static void 3461 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3462 { 3463 struct nvme_poll_group *group = ctx_buf; 3464 3465 assert(TAILQ_EMPTY(&group->qpair_list)); 3466 3467 if (group->accel_channel) { 3468 spdk_put_io_channel(group->accel_channel); 3469 } 3470 3471 spdk_poller_unregister(&group->poller); 3472 if (spdk_nvme_poll_group_destroy(group->group)) { 3473 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3474 assert(false); 3475 } 3476 } 3477 3478 static struct spdk_io_channel * 3479 bdev_nvme_get_io_channel(void *ctx) 3480 { 3481 struct nvme_bdev *nvme_bdev = ctx; 3482 3483 return spdk_get_io_channel(nvme_bdev); 3484 } 3485 3486 static void * 3487 bdev_nvme_get_module_ctx(void *ctx) 3488 { 3489 struct nvme_bdev *nvme_bdev = ctx; 3490 struct nvme_ns *nvme_ns; 3491 3492 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3493 return NULL; 3494 } 3495 3496 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3497 if (!nvme_ns) { 3498 return NULL; 3499 } 3500 3501 return nvme_ns->ns; 3502 } 3503 3504 static const char * 3505 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3506 { 3507 switch (ana_state) { 3508 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3509 return "optimized"; 3510 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3511 return "non_optimized"; 3512 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3513 return "inaccessible"; 3514 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3515 return "persistent_loss"; 3516 case SPDK_NVME_ANA_CHANGE_STATE: 3517 return "change"; 3518 default: 3519 return NULL; 3520 } 3521 } 3522 3523 static int 3524 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3525 { 3526 struct spdk_memory_domain **_domains = NULL; 3527 
struct nvme_bdev *nbdev = ctx; 3528 struct nvme_ns *nvme_ns; 3529 int i = 0, _array_size = array_size; 3530 int rc = 0; 3531 3532 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3533 if (domains && array_size >= i) { 3534 _domains = &domains[i]; 3535 } else { 3536 _domains = NULL; 3537 } 3538 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3539 if (rc > 0) { 3540 i += rc; 3541 if (_array_size >= rc) { 3542 _array_size -= rc; 3543 } else { 3544 _array_size = 0; 3545 } 3546 } else if (rc < 0) { 3547 return rc; 3548 } 3549 } 3550 3551 return i; 3552 } 3553 3554 static const char * 3555 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3556 { 3557 if (nvme_ctrlr->destruct) { 3558 return "deleting"; 3559 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3560 return "failed"; 3561 } else if (nvme_ctrlr->resetting) { 3562 return "resetting"; 3563 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3564 return "reconnect_is_delayed"; 3565 } else if (nvme_ctrlr->disabled) { 3566 return "disabled"; 3567 } else { 3568 return "enabled"; 3569 } 3570 } 3571 3572 void 3573 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3574 { 3575 struct spdk_nvme_transport_id *trid; 3576 const struct spdk_nvme_ctrlr_opts *opts; 3577 const struct spdk_nvme_ctrlr_data *cdata; 3578 struct nvme_path_id *path_id; 3579 3580 spdk_json_write_object_begin(w); 3581 3582 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3583 3584 #ifdef SPDK_CONFIG_NVME_CUSE 3585 size_t cuse_name_size = 128; 3586 char cuse_name[cuse_name_size]; 3587 3588 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3589 if (rc == 0) { 3590 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3591 } 3592 #endif 3593 trid = &nvme_ctrlr->active_path_id->trid; 3594 spdk_json_write_named_object_begin(w, "trid"); 3595 nvme_bdev_dump_trid_json(trid, w); 3596 
spdk_json_write_object_end(w); 3597 3598 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3599 if (path_id != NULL) { 3600 spdk_json_write_named_array_begin(w, "alternate_trids"); 3601 do { 3602 trid = &path_id->trid; 3603 spdk_json_write_object_begin(w); 3604 nvme_bdev_dump_trid_json(trid, w); 3605 spdk_json_write_object_end(w); 3606 3607 path_id = TAILQ_NEXT(path_id, link); 3608 } while (path_id != NULL); 3609 spdk_json_write_array_end(w); 3610 } 3611 3612 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3613 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3614 3615 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3616 spdk_json_write_named_object_begin(w, "host"); 3617 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3618 spdk_json_write_named_string(w, "addr", opts->src_addr); 3619 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3620 spdk_json_write_object_end(w); 3621 3622 spdk_json_write_object_end(w); 3623 } 3624 3625 static void 3626 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3627 struct nvme_ns *nvme_ns) 3628 { 3629 struct spdk_nvme_ns *ns; 3630 struct spdk_nvme_ctrlr *ctrlr; 3631 const struct spdk_nvme_ctrlr_data *cdata; 3632 const struct spdk_nvme_transport_id *trid; 3633 union spdk_nvme_vs_register vs; 3634 const struct spdk_nvme_ns_data *nsdata; 3635 char buf[128]; 3636 3637 ns = nvme_ns->ns; 3638 if (ns == NULL) { 3639 return; 3640 } 3641 3642 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3643 3644 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3645 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3646 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3647 3648 spdk_json_write_object_begin(w); 3649 3650 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3651 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3652 } 3653 3654 spdk_json_write_named_object_begin(w, "trid"); 3655 3656 nvme_bdev_dump_trid_json(trid, w); 3657 3658 spdk_json_write_object_end(w); 3659 3660 #ifdef SPDK_CONFIG_NVME_CUSE 3661 size_t 
cuse_name_size = 128; 3662 char cuse_name[cuse_name_size]; 3663 3664 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3665 cuse_name, &cuse_name_size); 3666 if (rc == 0) { 3667 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3668 } 3669 #endif 3670 3671 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3672 3673 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3674 3675 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3676 3677 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3678 spdk_str_trim(buf); 3679 spdk_json_write_named_string(w, "model_number", buf); 3680 3681 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3682 spdk_str_trim(buf); 3683 spdk_json_write_named_string(w, "serial_number", buf); 3684 3685 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3686 spdk_str_trim(buf); 3687 spdk_json_write_named_string(w, "firmware_revision", buf); 3688 3689 if (cdata->subnqn[0] != '\0') { 3690 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3691 } 3692 3693 spdk_json_write_named_object_begin(w, "oacs"); 3694 3695 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3696 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3697 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3698 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3699 3700 spdk_json_write_object_end(w); 3701 3702 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3703 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3704 3705 spdk_json_write_object_end(w); 3706 3707 spdk_json_write_named_object_begin(w, "vs"); 3708 3709 spdk_json_write_name(w, "nvme_version"); 3710 if (vs.bits.ter) { 3711 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3712 } else { 3713 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3714 } 3715 3716 spdk_json_write_object_end(w); 3717 
3718 nsdata = spdk_nvme_ns_get_data(ns); 3719 3720 spdk_json_write_named_object_begin(w, "ns_data"); 3721 3722 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3723 3724 if (cdata->cmic.ana_reporting) { 3725 spdk_json_write_named_string(w, "ana_state", 3726 _nvme_ana_state_str(nvme_ns->ana_state)); 3727 } 3728 3729 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3730 3731 spdk_json_write_object_end(w); 3732 3733 if (cdata->oacs.security) { 3734 spdk_json_write_named_object_begin(w, "security"); 3735 3736 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3737 3738 spdk_json_write_object_end(w); 3739 } 3740 3741 spdk_json_write_object_end(w); 3742 } 3743 3744 static const char * 3745 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3746 { 3747 switch (nbdev->mp_policy) { 3748 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3749 return "active_passive"; 3750 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3751 return "active_active"; 3752 default: 3753 assert(false); 3754 return "invalid"; 3755 } 3756 } 3757 3758 static int 3759 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3760 { 3761 struct nvme_bdev *nvme_bdev = ctx; 3762 struct nvme_ns *nvme_ns; 3763 3764 pthread_mutex_lock(&nvme_bdev->mutex); 3765 spdk_json_write_named_array_begin(w, "nvme"); 3766 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3767 nvme_namespace_info_json(w, nvme_ns); 3768 } 3769 spdk_json_write_array_end(w); 3770 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3771 pthread_mutex_unlock(&nvme_bdev->mutex); 3772 3773 return 0; 3774 } 3775 3776 static void 3777 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3778 { 3779 /* No config per bdev needed */ 3780 } 3781 3782 static uint64_t 3783 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3784 { 3785 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3786 struct nvme_io_path *io_path; 3787 
struct nvme_poll_group *group; 3788 uint64_t spin_time = 0; 3789 3790 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3791 group = io_path->qpair->group; 3792 3793 if (!group || !group->collect_spin_stat) { 3794 continue; 3795 } 3796 3797 if (group->end_ticks != 0) { 3798 group->spin_ticks += (group->end_ticks - group->start_ticks); 3799 group->end_ticks = 0; 3800 } 3801 3802 spin_time += group->spin_ticks; 3803 group->start_ticks = 0; 3804 group->spin_ticks = 0; 3805 } 3806 3807 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3808 } 3809 3810 static void 3811 bdev_nvme_reset_device_stat(void *ctx) 3812 { 3813 struct nvme_bdev *nbdev = ctx; 3814 3815 if (nbdev->err_stat != NULL) { 3816 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3817 } 3818 } 3819 3820 /* JSON string should be lowercases and underscore delimited string. */ 3821 static void 3822 bdev_nvme_format_nvme_status(char *dst, const char *src) 3823 { 3824 char tmp[256]; 3825 3826 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3827 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3828 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3829 spdk_strlwr(dst); 3830 } 3831 3832 static void 3833 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3834 { 3835 struct nvme_bdev *nbdev = ctx; 3836 struct spdk_nvme_status status = {}; 3837 uint16_t sct, sc; 3838 char status_json[256]; 3839 const char *status_str; 3840 3841 if (nbdev->err_stat == NULL) { 3842 return; 3843 } 3844 3845 spdk_json_write_named_object_begin(w, "nvme_error"); 3846 3847 spdk_json_write_named_object_begin(w, "status_type"); 3848 for (sct = 0; sct < 8; sct++) { 3849 if (nbdev->err_stat->status_type[sct] == 0) { 3850 continue; 3851 } 3852 status.sct = sct; 3853 3854 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3855 assert(status_str != NULL); 3856 bdev_nvme_format_nvme_status(status_json, status_str); 3857 3858 spdk_json_write_named_uint32(w, status_json, 
nbdev->err_stat->status_type[sct]); 3859 } 3860 spdk_json_write_object_end(w); 3861 3862 spdk_json_write_named_object_begin(w, "status_code"); 3863 for (sct = 0; sct < 4; sct++) { 3864 status.sct = sct; 3865 for (sc = 0; sc < 256; sc++) { 3866 if (nbdev->err_stat->status[sct][sc] == 0) { 3867 continue; 3868 } 3869 status.sc = sc; 3870 3871 status_str = spdk_nvme_cpl_get_status_string(&status); 3872 assert(status_str != NULL); 3873 bdev_nvme_format_nvme_status(status_json, status_str); 3874 3875 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3876 } 3877 } 3878 spdk_json_write_object_end(w); 3879 3880 spdk_json_write_object_end(w); 3881 } 3882 3883 static bool 3884 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3885 { 3886 struct nvme_bdev *nbdev = ctx; 3887 struct spdk_nvme_ctrlr *ctrlr; 3888 3889 if (!g_opts.allow_accel_sequence) { 3890 return false; 3891 } 3892 3893 switch (type) { 3894 case SPDK_BDEV_IO_TYPE_WRITE: 3895 case SPDK_BDEV_IO_TYPE_READ: 3896 break; 3897 default: 3898 return false; 3899 } 3900 3901 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3902 assert(ctrlr != NULL); 3903 3904 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3905 } 3906 3907 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3908 .destruct = bdev_nvme_destruct, 3909 .submit_request = bdev_nvme_submit_request, 3910 .io_type_supported = bdev_nvme_io_type_supported, 3911 .get_io_channel = bdev_nvme_get_io_channel, 3912 .dump_info_json = bdev_nvme_dump_info_json, 3913 .write_config_json = bdev_nvme_write_config_json, 3914 .get_spin_time = bdev_nvme_get_spin_time, 3915 .get_module_ctx = bdev_nvme_get_module_ctx, 3916 .get_memory_domains = bdev_nvme_get_memory_domains, 3917 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3918 .reset_device_stat = bdev_nvme_reset_device_stat, 3919 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3920 }; 3921 3922 typedef int 
(*bdev_nvme_parse_ana_log_page_cb)( 3923 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3924 3925 static int 3926 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3927 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3928 { 3929 struct spdk_nvme_ana_group_descriptor *copied_desc; 3930 uint8_t *orig_desc; 3931 uint32_t i, desc_size, copy_len; 3932 int rc = 0; 3933 3934 if (nvme_ctrlr->ana_log_page == NULL) { 3935 return -EINVAL; 3936 } 3937 3938 copied_desc = nvme_ctrlr->copied_ana_desc; 3939 3940 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3941 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3942 3943 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3944 memcpy(copied_desc, orig_desc, copy_len); 3945 3946 rc = cb_fn(copied_desc, cb_arg); 3947 if (rc != 0) { 3948 break; 3949 } 3950 3951 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3952 copied_desc->num_of_nsid * sizeof(uint32_t); 3953 orig_desc += desc_size; 3954 copy_len -= desc_size; 3955 } 3956 3957 return rc; 3958 } 3959 3960 static int 3961 nvme_ns_ana_transition_timedout(void *ctx) 3962 { 3963 struct nvme_ns *nvme_ns = ctx; 3964 3965 spdk_poller_unregister(&nvme_ns->anatt_timer); 3966 nvme_ns->ana_transition_timedout = true; 3967 3968 return SPDK_POLLER_BUSY; 3969 } 3970 3971 static void 3972 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3973 const struct spdk_nvme_ana_group_descriptor *desc) 3974 { 3975 const struct spdk_nvme_ctrlr_data *cdata; 3976 3977 nvme_ns->ana_group_id = desc->ana_group_id; 3978 nvme_ns->ana_state = desc->ana_state; 3979 nvme_ns->ana_state_updating = false; 3980 3981 switch (nvme_ns->ana_state) { 3982 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3983 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3984 nvme_ns->ana_transition_timedout = false; 3985 spdk_poller_unregister(&nvme_ns->anatt_timer); 3986 break; 3987 3988 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3989 
case SPDK_NVME_ANA_CHANGE_STATE: 3990 if (nvme_ns->anatt_timer != NULL) { 3991 break; 3992 } 3993 3994 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3995 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3996 nvme_ns, 3997 cdata->anatt * SPDK_SEC_TO_USEC); 3998 break; 3999 default: 4000 break; 4001 } 4002 } 4003 4004 static int 4005 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4006 { 4007 struct nvme_ns *nvme_ns = cb_arg; 4008 uint32_t i; 4009 4010 assert(nvme_ns->ns != NULL); 4011 4012 for (i = 0; i < desc->num_of_nsid; i++) { 4013 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4014 continue; 4015 } 4016 4017 _nvme_ns_set_ana_state(nvme_ns, desc); 4018 return 1; 4019 } 4020 4021 return 0; 4022 } 4023 4024 static int 4025 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4026 { 4027 int rc = 0; 4028 struct spdk_uuid new_uuid, namespace_uuid; 4029 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4030 /* This namespace UUID was generated using uuid_generate() method. 
*/ 4031 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4032 int size; 4033 4034 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4035 4036 spdk_uuid_set_null(&new_uuid); 4037 spdk_uuid_set_null(&namespace_uuid); 4038 4039 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4040 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4041 return -EINVAL; 4042 } 4043 4044 spdk_uuid_parse(&namespace_uuid, namespace_str); 4045 4046 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4047 if (rc == 0) { 4048 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4049 } 4050 4051 return rc; 4052 } 4053 4054 static int 4055 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4056 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4057 uint32_t prchk_flags, void *ctx) 4058 { 4059 const struct spdk_uuid *uuid; 4060 const uint8_t *nguid; 4061 const struct spdk_nvme_ctrlr_data *cdata; 4062 const struct spdk_nvme_ns_data *nsdata; 4063 const struct spdk_nvme_ctrlr_opts *opts; 4064 enum spdk_nvme_csi csi; 4065 uint32_t atomic_bs, phys_bs, bs; 4066 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4067 int rc; 4068 4069 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4070 csi = spdk_nvme_ns_get_csi(ns); 4071 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4072 4073 switch (csi) { 4074 case SPDK_NVME_CSI_NVM: 4075 disk->product_name = "NVMe disk"; 4076 break; 4077 case SPDK_NVME_CSI_ZNS: 4078 disk->product_name = "NVMe ZNS disk"; 4079 disk->zoned = true; 4080 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4081 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4082 spdk_nvme_ns_get_extended_sector_size(ns); 4083 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4084 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4085 break; 4086 default: 4087 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4088 return -ENOTSUP; 4089 } 4090 4091 nguid = 
spdk_nvme_ns_get_nguid(ns); 4092 if (!nguid) { 4093 uuid = spdk_nvme_ns_get_uuid(ns); 4094 if (uuid) { 4095 disk->uuid = *uuid; 4096 } else if (g_opts.generate_uuids) { 4097 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4098 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4099 if (rc < 0) { 4100 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4101 return rc; 4102 } 4103 } 4104 } else { 4105 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4106 } 4107 4108 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4109 if (!disk->name) { 4110 return -ENOMEM; 4111 } 4112 4113 disk->write_cache = 0; 4114 if (cdata->vwc.present) { 4115 /* Enable if the Volatile Write Cache exists */ 4116 disk->write_cache = 1; 4117 } 4118 if (cdata->oncs.write_zeroes) { 4119 disk->max_write_zeroes = UINT16_MAX + 1; 4120 } 4121 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4122 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4123 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4124 /* NVMe driver will split one request into multiple requests 4125 * based on MDTS and stripe boundary, the bdev layer will use 4126 * max_segment_size and max_num_segments to split one big IO 4127 * into multiple requests, then small request can't run out 4128 * of NVMe internal requests data structure. 4129 */ 4130 if (opts && opts->io_queue_requests) { 4131 disk->max_num_segments = opts->io_queue_requests / 2; 4132 } 4133 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4134 /* The nvme driver will try to split I/O that have too many 4135 * SGEs, but it doesn't work if that last SGE doesn't end on 4136 * an aggregate total that is block aligned. The bdev layer has 4137 * a more robust splitting framework, so use that instead for 4138 * this case. (See issue #3269.) 
4139 */ 4140 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4141 4142 if (disk->max_num_segments == 0) { 4143 disk->max_num_segments = max_sges; 4144 } else { 4145 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4146 } 4147 } 4148 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4149 4150 nsdata = spdk_nvme_ns_get_data(ns); 4151 bs = spdk_nvme_ns_get_sector_size(ns); 4152 atomic_bs = bs; 4153 phys_bs = bs; 4154 if (nsdata->nabo == 0) { 4155 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4156 atomic_bs = bs * (1 + nsdata->nawupf); 4157 } else { 4158 atomic_bs = bs * (1 + cdata->awupf); 4159 } 4160 } 4161 if (nsdata->nsfeat.optperf) { 4162 phys_bs = bs * (1 + nsdata->npwg); 4163 } 4164 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4165 4166 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4167 if (disk->md_len != 0) { 4168 disk->md_interleave = nsdata->flbas.extended; 4169 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4170 if (disk->dif_type != SPDK_DIF_DISABLE) { 4171 disk->dif_is_head_of_md = nsdata->dps.md_start; 4172 disk->dif_check_flags = prchk_flags; 4173 } 4174 } 4175 4176 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4177 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4178 disk->acwu = 0; 4179 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4180 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4181 } else { 4182 disk->acwu = cdata->acwu + 1; /* 0-based */ 4183 } 4184 4185 if (cdata->oncs.copy) { 4186 /* For now bdev interface allows only single segment copy */ 4187 disk->max_copy = nsdata->mssrl; 4188 } 4189 4190 disk->ctxt = ctx; 4191 disk->fn_table = &nvmelib_fn_table; 4192 disk->module = &nvme_if; 4193 4194 return 0; 4195 } 4196 4197 static struct nvme_bdev * 4198 nvme_bdev_alloc(void) 4199 { 4200 struct nvme_bdev *bdev; 4201 int rc; 4202 4203 bdev = calloc(1, sizeof(*bdev)); 4204 if (!bdev) { 4205 SPDK_ERRLOG("bdev calloc() failed\n"); 4206 return NULL; 4207 } 4208 4209 
if (g_opts.nvme_error_stat) { 4210 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4211 if (!bdev->err_stat) { 4212 SPDK_ERRLOG("err_stat calloc() failed\n"); 4213 free(bdev); 4214 return NULL; 4215 } 4216 } 4217 4218 rc = pthread_mutex_init(&bdev->mutex, NULL); 4219 if (rc != 0) { 4220 free(bdev->err_stat); 4221 free(bdev); 4222 return NULL; 4223 } 4224 4225 bdev->ref = 1; 4226 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4227 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4228 bdev->rr_min_io = UINT32_MAX; 4229 TAILQ_INIT(&bdev->nvme_ns_list); 4230 4231 return bdev; 4232 } 4233 4234 static int 4235 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4236 { 4237 struct nvme_bdev *bdev; 4238 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4239 int rc; 4240 4241 bdev = nvme_bdev_alloc(); 4242 if (bdev == NULL) { 4243 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4244 return -ENOMEM; 4245 } 4246 4247 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4248 4249 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4250 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4251 if (rc != 0) { 4252 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4253 nvme_bdev_free(bdev); 4254 return rc; 4255 } 4256 4257 spdk_io_device_register(bdev, 4258 bdev_nvme_create_bdev_channel_cb, 4259 bdev_nvme_destroy_bdev_channel_cb, 4260 sizeof(struct nvme_bdev_channel), 4261 bdev->disk.name); 4262 4263 nvme_ns->bdev = bdev; 4264 bdev->nsid = nvme_ns->id; 4265 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4266 4267 bdev->nbdev_ctrlr = nbdev_ctrlr; 4268 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4269 4270 rc = spdk_bdev_register(&bdev->disk); 4271 if (rc != 0) { 4272 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4273 spdk_io_device_unregister(bdev, NULL); 4274 nvme_ns->bdev = NULL; 4275 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4276 nvme_bdev_free(bdev); 4277 return rc; 4278 } 4279 4280 return 0; 
4281 } 4282 4283 static bool 4284 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4285 { 4286 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4287 const struct spdk_uuid *uuid1, *uuid2; 4288 4289 nsdata1 = spdk_nvme_ns_get_data(ns1); 4290 nsdata2 = spdk_nvme_ns_get_data(ns2); 4291 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4292 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4293 4294 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4295 nsdata1->eui64 == nsdata2->eui64 && 4296 ((uuid1 == NULL && uuid2 == NULL) || 4297 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4298 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4299 } 4300 4301 static bool 4302 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4303 struct spdk_nvme_ctrlr_opts *opts) 4304 { 4305 struct nvme_probe_skip_entry *entry; 4306 4307 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4308 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4309 return false; 4310 } 4311 } 4312 4313 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4314 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4315 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4316 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4317 opts->disable_read_ana_log_page = true; 4318 4319 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4320 4321 return true; 4322 } 4323 4324 static void 4325 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4326 { 4327 struct nvme_ctrlr *nvme_ctrlr = ctx; 4328 4329 if (spdk_nvme_cpl_is_error(cpl)) { 4330 SPDK_WARNLOG("Abort failed. Resetting controller. 
sc is %u, sct is %u.\n", cpl->status.sc, 4331 cpl->status.sct); 4332 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4333 } else if (cpl->cdw0 & 0x1) { 4334 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4335 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4336 } 4337 } 4338 4339 static void 4340 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4341 struct spdk_nvme_qpair *qpair, uint16_t cid) 4342 { 4343 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4344 union spdk_nvme_csts_register csts; 4345 int rc; 4346 4347 assert(nvme_ctrlr->ctrlr == ctrlr); 4348 4349 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4350 4351 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4352 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4353 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4354 * completion recursively. 4355 */ 4356 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4357 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4358 if (csts.bits.cfs) { 4359 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4360 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4361 return; 4362 } 4363 } 4364 4365 switch (g_opts.action_on_timeout) { 4366 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4367 if (qpair) { 4368 /* Don't send abort to ctrlr when ctrlr is not available. */ 4369 pthread_mutex_lock(&nvme_ctrlr->mutex); 4370 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4371 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4372 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4373 return; 4374 } 4375 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4376 4377 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4378 nvme_abort_cpl, nvme_ctrlr); 4379 if (rc == 0) { 4380 return; 4381 } 4382 4383 SPDK_ERRLOG("Unable to send abort. 
Resetting, rc is %d.\n", rc); 4384 } 4385 4386 /* FALLTHROUGH */ 4387 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4388 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4389 break; 4390 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4391 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4392 break; 4393 default: 4394 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4395 break; 4396 } 4397 } 4398 4399 static struct nvme_ns * 4400 nvme_ns_alloc(void) 4401 { 4402 struct nvme_ns *nvme_ns; 4403 4404 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4405 if (nvme_ns == NULL) { 4406 return NULL; 4407 } 4408 4409 if (g_opts.io_path_stat) { 4410 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4411 if (nvme_ns->stat == NULL) { 4412 free(nvme_ns); 4413 return NULL; 4414 } 4415 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4416 } 4417 4418 return nvme_ns; 4419 } 4420 4421 static void 4422 nvme_ns_free(struct nvme_ns *nvme_ns) 4423 { 4424 free(nvme_ns->stat); 4425 free(nvme_ns); 4426 } 4427 4428 static void 4429 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4430 { 4431 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4432 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4433 4434 if (rc == 0) { 4435 nvme_ns->probe_ctx = NULL; 4436 pthread_mutex_lock(&nvme_ctrlr->mutex); 4437 nvme_ctrlr->ref++; 4438 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4439 } else { 4440 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4441 nvme_ns_free(nvme_ns); 4442 } 4443 4444 if (ctx) { 4445 ctx->populates_in_progress--; 4446 if (ctx->populates_in_progress == 0) { 4447 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4448 } 4449 } 4450 } 4451 4452 static void 4453 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4454 { 4455 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4456 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4457 struct nvme_ns *nvme_ns = 
spdk_io_channel_iter_get_ctx(i); 4458 int rc; 4459 4460 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4461 if (rc != 0) { 4462 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4463 } 4464 4465 spdk_for_each_channel_continue(i, rc); 4466 } 4467 4468 static void 4469 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4470 { 4471 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4472 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4473 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4474 struct nvme_io_path *io_path; 4475 4476 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4477 if (io_path != NULL) { 4478 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4479 } 4480 4481 spdk_for_each_channel_continue(i, 0); 4482 } 4483 4484 static void 4485 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4486 { 4487 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4488 4489 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4490 } 4491 4492 static void 4493 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4494 { 4495 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4496 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4497 4498 if (status == 0) { 4499 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4500 } else { 4501 /* Delete the added io_paths and fail populating the namespace. 
*/ 4502 spdk_for_each_channel(bdev, 4503 bdev_nvme_delete_io_path, 4504 nvme_ns, 4505 bdev_nvme_add_io_path_failed); 4506 } 4507 } 4508 4509 static int 4510 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4511 { 4512 struct nvme_ns *tmp_ns; 4513 const struct spdk_nvme_ns_data *nsdata; 4514 4515 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4516 if (!nsdata->nmic.can_share) { 4517 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4518 return -EINVAL; 4519 } 4520 4521 pthread_mutex_lock(&bdev->mutex); 4522 4523 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4524 assert(tmp_ns != NULL); 4525 4526 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4527 pthread_mutex_unlock(&bdev->mutex); 4528 SPDK_ERRLOG("Namespaces are not identical.\n"); 4529 return -EINVAL; 4530 } 4531 4532 bdev->ref++; 4533 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4534 nvme_ns->bdev = bdev; 4535 4536 pthread_mutex_unlock(&bdev->mutex); 4537 4538 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
*/ 4539 spdk_for_each_channel(bdev, 4540 bdev_nvme_add_io_path, 4541 nvme_ns, 4542 bdev_nvme_add_io_path_done); 4543 4544 return 0; 4545 } 4546 4547 static void 4548 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4549 { 4550 struct spdk_nvme_ns *ns; 4551 struct nvme_bdev *bdev; 4552 int rc = 0; 4553 4554 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4555 if (!ns) { 4556 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4557 rc = -EINVAL; 4558 goto done; 4559 } 4560 4561 nvme_ns->ns = ns; 4562 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4563 4564 if (nvme_ctrlr->ana_log_page != NULL) { 4565 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4566 } 4567 4568 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4569 if (bdev == NULL) { 4570 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4571 } else { 4572 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4573 if (rc == 0) { 4574 return; 4575 } 4576 } 4577 done: 4578 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4579 } 4580 4581 static void 4582 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4583 { 4584 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4585 4586 assert(nvme_ctrlr != NULL); 4587 4588 pthread_mutex_lock(&nvme_ctrlr->mutex); 4589 4590 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4591 4592 if (nvme_ns->bdev != NULL) { 4593 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4594 return; 4595 } 4596 4597 nvme_ns_free(nvme_ns); 4598 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4599 4600 nvme_ctrlr_release(nvme_ctrlr); 4601 } 4602 4603 static void 4604 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4605 { 4606 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4607 4608 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4609 } 4610 4611 static void 4612 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4613 { 4614 struct nvme_bdev 
*bdev; 4615 4616 spdk_poller_unregister(&nvme_ns->anatt_timer); 4617 4618 bdev = nvme_ns->bdev; 4619 if (bdev != NULL) { 4620 pthread_mutex_lock(&bdev->mutex); 4621 4622 assert(bdev->ref > 0); 4623 bdev->ref--; 4624 if (bdev->ref == 0) { 4625 pthread_mutex_unlock(&bdev->mutex); 4626 4627 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4628 } else { 4629 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4630 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4631 * and clear nvme_ns->bdev here. 4632 */ 4633 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4634 nvme_ns->bdev = NULL; 4635 4636 pthread_mutex_unlock(&bdev->mutex); 4637 4638 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4639 * we call depopulate_namespace_done() to avoid use-after-free. 4640 */ 4641 spdk_for_each_channel(bdev, 4642 bdev_nvme_delete_io_path, 4643 nvme_ns, 4644 bdev_nvme_delete_io_path_done); 4645 return; 4646 } 4647 } 4648 4649 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4650 } 4651 4652 static void 4653 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4654 struct nvme_async_probe_ctx *ctx) 4655 { 4656 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4657 struct nvme_ns *nvme_ns, *next; 4658 struct spdk_nvme_ns *ns; 4659 struct nvme_bdev *bdev; 4660 uint32_t nsid; 4661 int rc; 4662 uint64_t num_sectors; 4663 4664 if (ctx) { 4665 /* Initialize this count to 1 to handle the populate functions 4666 * calling nvme_ctrlr_populate_namespace_done() immediately. 4667 */ 4668 ctx->populates_in_progress = 1; 4669 } 4670 4671 /* First loop over our existing namespaces and see if they have been 4672 * removed. */ 4673 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4674 while (nvme_ns != NULL) { 4675 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4676 4677 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4678 /* NS is still there or added again. Its attributes may have changed. 
*/ 4679 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4680 if (nvme_ns->ns != ns) { 4681 assert(nvme_ns->ns == NULL); 4682 nvme_ns->ns = ns; 4683 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4684 } 4685 4686 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4687 bdev = nvme_ns->bdev; 4688 assert(bdev != NULL); 4689 if (bdev->disk.blockcnt != num_sectors) { 4690 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4691 nvme_ns->id, 4692 bdev->disk.name, 4693 bdev->disk.blockcnt, 4694 num_sectors); 4695 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4696 if (rc != 0) { 4697 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4698 bdev->disk.name, rc); 4699 } 4700 } 4701 } else { 4702 /* Namespace was removed */ 4703 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4704 } 4705 4706 nvme_ns = next; 4707 } 4708 4709 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4710 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4711 while (nsid != 0) { 4712 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4713 4714 if (nvme_ns == NULL) { 4715 /* Found a new one */ 4716 nvme_ns = nvme_ns_alloc(); 4717 if (nvme_ns == NULL) { 4718 SPDK_ERRLOG("Failed to allocate namespace\n"); 4719 /* This just fails to attach the namespace. It may work on a future attempt. */ 4720 continue; 4721 } 4722 4723 nvme_ns->id = nsid; 4724 nvme_ns->ctrlr = nvme_ctrlr; 4725 4726 nvme_ns->bdev = NULL; 4727 4728 if (ctx) { 4729 ctx->populates_in_progress++; 4730 } 4731 nvme_ns->probe_ctx = ctx; 4732 4733 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4734 4735 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4736 } 4737 4738 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4739 } 4740 4741 if (ctx) { 4742 /* Decrement this count now that the loop is over to account 4743 * for the one we started with. 
If the count is then 0, we 4744 * know any populate_namespace functions completed immediately, 4745 * so we'll kick the callback here. 4746 */ 4747 ctx->populates_in_progress--; 4748 if (ctx->populates_in_progress == 0) { 4749 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4750 } 4751 } 4752 4753 } 4754 4755 static void 4756 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4757 { 4758 struct nvme_ns *nvme_ns, *tmp; 4759 4760 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4761 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4762 } 4763 } 4764 4765 static uint32_t 4766 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4767 { 4768 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4769 const struct spdk_nvme_ctrlr_data *cdata; 4770 uint32_t nsid, ns_count = 0; 4771 4772 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4773 4774 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4775 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4776 ns_count++; 4777 } 4778 4779 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4780 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4781 sizeof(uint32_t); 4782 } 4783 4784 static int 4785 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4786 void *cb_arg) 4787 { 4788 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4789 struct nvme_ns *nvme_ns; 4790 uint32_t i, nsid; 4791 4792 for (i = 0; i < desc->num_of_nsid; i++) { 4793 nsid = desc->nsid[i]; 4794 if (nsid == 0) { 4795 continue; 4796 } 4797 4798 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4799 4800 assert(nvme_ns != NULL); 4801 if (nvme_ns == NULL) { 4802 /* Target told us that an inactive namespace had an ANA change */ 4803 continue; 4804 } 4805 4806 _nvme_ns_set_ana_state(nvme_ns, desc); 4807 } 4808 4809 return 0; 4810 } 4811 4812 static void 4813 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4814 { 4815 struct nvme_ns *nvme_ns; 4816 
4817 spdk_free(nvme_ctrlr->ana_log_page); 4818 nvme_ctrlr->ana_log_page = NULL; 4819 4820 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4821 nvme_ns != NULL; 4822 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4823 nvme_ns->ana_state_updating = false; 4824 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4825 } 4826 } 4827 4828 static void 4829 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4830 { 4831 struct nvme_ctrlr *nvme_ctrlr = ctx; 4832 4833 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4834 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4835 nvme_ctrlr); 4836 } else { 4837 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4838 } 4839 4840 pthread_mutex_lock(&nvme_ctrlr->mutex); 4841 4842 assert(nvme_ctrlr->ana_log_page_updating == true); 4843 nvme_ctrlr->ana_log_page_updating = false; 4844 4845 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4846 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4847 4848 nvme_ctrlr_unregister(nvme_ctrlr); 4849 } else { 4850 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4851 4852 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4853 } 4854 } 4855 4856 static int 4857 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4858 { 4859 uint32_t ana_log_page_size; 4860 int rc; 4861 4862 if (nvme_ctrlr->ana_log_page == NULL) { 4863 return -EINVAL; 4864 } 4865 4866 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4867 4868 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4869 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4870 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4871 return -EINVAL; 4872 } 4873 4874 pthread_mutex_lock(&nvme_ctrlr->mutex); 4875 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4876 nvme_ctrlr->ana_log_page_updating) { 4877 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4878 return -EBUSY; 4879 } 4880 4881 nvme_ctrlr->ana_log_page_updating = true; 4882 
pthread_mutex_unlock(&nvme_ctrlr->mutex); 4883 4884 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4885 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4886 SPDK_NVME_GLOBAL_NS_TAG, 4887 nvme_ctrlr->ana_log_page, 4888 ana_log_page_size, 0, 4889 nvme_ctrlr_read_ana_log_page_done, 4890 nvme_ctrlr); 4891 if (rc != 0) { 4892 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4893 } 4894 4895 return rc; 4896 } 4897 4898 static void 4899 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4900 { 4901 } 4902 4903 struct bdev_nvme_set_preferred_path_ctx { 4904 struct spdk_bdev_desc *desc; 4905 struct nvme_ns *nvme_ns; 4906 bdev_nvme_set_preferred_path_cb cb_fn; 4907 void *cb_arg; 4908 }; 4909 4910 static void 4911 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4912 { 4913 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4914 4915 assert(ctx != NULL); 4916 assert(ctx->desc != NULL); 4917 assert(ctx->cb_fn != NULL); 4918 4919 spdk_bdev_close(ctx->desc); 4920 4921 ctx->cb_fn(ctx->cb_arg, status); 4922 4923 free(ctx); 4924 } 4925 4926 static void 4927 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4928 { 4929 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4930 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4931 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4932 struct nvme_io_path *io_path, *prev; 4933 4934 prev = NULL; 4935 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4936 if (io_path->nvme_ns == ctx->nvme_ns) { 4937 break; 4938 } 4939 prev = io_path; 4940 } 4941 4942 if (io_path != NULL) { 4943 if (prev != NULL) { 4944 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4945 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4946 } 4947 4948 /* We can set io_path to nbdev_ch->current_io_path directly here. 4949 * However, it needs to be conditional. 
To simplify the code, 4950 * just clear nbdev_ch->current_io_path and let find_io_path() 4951 * fill it. 4952 * 4953 * Automatic failback may be disabled. Hence even if the io_path is 4954 * already at the head, clear nbdev_ch->current_io_path. 4955 */ 4956 bdev_nvme_clear_current_io_path(nbdev_ch); 4957 } 4958 4959 spdk_for_each_channel_continue(i, 0); 4960 } 4961 4962 static struct nvme_ns * 4963 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4964 { 4965 struct nvme_ns *nvme_ns, *prev; 4966 const struct spdk_nvme_ctrlr_data *cdata; 4967 4968 prev = NULL; 4969 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4970 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4971 4972 if (cdata->cntlid == cntlid) { 4973 break; 4974 } 4975 prev = nvme_ns; 4976 } 4977 4978 if (nvme_ns != NULL && prev != NULL) { 4979 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4980 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4981 } 4982 4983 return nvme_ns; 4984 } 4985 4986 /* This function supports only multipath mode. There is only a single I/O path 4987 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4988 * head of the I/O path list for each NVMe bdev channel. 4989 * 4990 * NVMe bdev channel may be acquired after completing this function. move the 4991 * matched namespace to the head of the namespace list for the NVMe bdev too. 
4992 */ 4993 void 4994 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4995 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4996 { 4997 struct bdev_nvme_set_preferred_path_ctx *ctx; 4998 struct spdk_bdev *bdev; 4999 struct nvme_bdev *nbdev; 5000 int rc = 0; 5001 5002 assert(cb_fn != NULL); 5003 5004 ctx = calloc(1, sizeof(*ctx)); 5005 if (ctx == NULL) { 5006 SPDK_ERRLOG("Failed to alloc context.\n"); 5007 rc = -ENOMEM; 5008 goto err_alloc; 5009 } 5010 5011 ctx->cb_fn = cb_fn; 5012 ctx->cb_arg = cb_arg; 5013 5014 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5015 if (rc != 0) { 5016 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5017 goto err_open; 5018 } 5019 5020 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5021 5022 if (bdev->module != &nvme_if) { 5023 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5024 rc = -ENODEV; 5025 goto err_bdev; 5026 } 5027 5028 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5029 5030 pthread_mutex_lock(&nbdev->mutex); 5031 5032 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5033 if (ctx->nvme_ns == NULL) { 5034 pthread_mutex_unlock(&nbdev->mutex); 5035 5036 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5037 rc = -ENODEV; 5038 goto err_bdev; 5039 } 5040 5041 pthread_mutex_unlock(&nbdev->mutex); 5042 5043 spdk_for_each_channel(nbdev, 5044 _bdev_nvme_set_preferred_path, 5045 ctx, 5046 bdev_nvme_set_preferred_path_done); 5047 return; 5048 5049 err_bdev: 5050 spdk_bdev_close(ctx->desc); 5051 err_open: 5052 free(ctx); 5053 err_alloc: 5054 cb_fn(cb_arg, rc); 5055 } 5056 5057 struct bdev_nvme_set_multipath_policy_ctx { 5058 struct spdk_bdev_desc *desc; 5059 bdev_nvme_set_multipath_policy_cb cb_fn; 5060 void *cb_arg; 5061 }; 5062 5063 static void 5064 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5065 { 5066 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 
5067 5068 assert(ctx != NULL); 5069 assert(ctx->desc != NULL); 5070 assert(ctx->cb_fn != NULL); 5071 5072 spdk_bdev_close(ctx->desc); 5073 5074 ctx->cb_fn(ctx->cb_arg, status); 5075 5076 free(ctx); 5077 } 5078 5079 static void 5080 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5081 { 5082 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5083 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5084 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5085 5086 nbdev_ch->mp_policy = nbdev->mp_policy; 5087 nbdev_ch->mp_selector = nbdev->mp_selector; 5088 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5089 bdev_nvme_clear_current_io_path(nbdev_ch); 5090 5091 spdk_for_each_channel_continue(i, 0); 5092 } 5093 5094 void 5095 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5096 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5097 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5098 { 5099 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5100 struct spdk_bdev *bdev; 5101 struct nvme_bdev *nbdev; 5102 int rc; 5103 5104 assert(cb_fn != NULL); 5105 5106 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5107 if (rr_min_io == UINT32_MAX) { 5108 rr_min_io = 1; 5109 } else if (rr_min_io == 0) { 5110 rc = -EINVAL; 5111 goto exit; 5112 } 5113 } else if (rr_min_io != UINT32_MAX) { 5114 rc = -EINVAL; 5115 goto exit; 5116 } 5117 5118 ctx = calloc(1, sizeof(*ctx)); 5119 if (ctx == NULL) { 5120 SPDK_ERRLOG("Failed to alloc context.\n"); 5121 rc = -ENOMEM; 5122 goto exit; 5123 } 5124 5125 ctx->cb_fn = cb_fn; 5126 ctx->cb_arg = cb_arg; 5127 5128 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5129 if (rc != 0) { 5130 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5131 rc = -ENODEV; 5132 goto err_open; 5133 } 5134 5135 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5136 if (bdev->module != &nvme_if) { 
5137 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5138 rc = -ENODEV; 5139 goto err_module; 5140 } 5141 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5142 5143 pthread_mutex_lock(&nbdev->mutex); 5144 nbdev->mp_policy = policy; 5145 nbdev->mp_selector = selector; 5146 nbdev->rr_min_io = rr_min_io; 5147 pthread_mutex_unlock(&nbdev->mutex); 5148 5149 spdk_for_each_channel(nbdev, 5150 _bdev_nvme_set_multipath_policy, 5151 ctx, 5152 bdev_nvme_set_multipath_policy_done); 5153 return; 5154 5155 err_module: 5156 spdk_bdev_close(ctx->desc); 5157 err_open: 5158 free(ctx); 5159 exit: 5160 cb_fn(cb_arg, rc); 5161 } 5162 5163 static void 5164 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5165 { 5166 struct nvme_ctrlr *nvme_ctrlr = arg; 5167 union spdk_nvme_async_event_completion event; 5168 5169 if (spdk_nvme_cpl_is_error(cpl)) { 5170 SPDK_WARNLOG("AER request execute failed\n"); 5171 return; 5172 } 5173 5174 event.raw = cpl->cdw0; 5175 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5176 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5177 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5178 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5179 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5180 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5181 } 5182 } 5183 5184 static void 5185 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5186 { 5187 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5188 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5189 free(ctx); 5190 } 5191 5192 static void 5193 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5194 { 5195 if (ctx->cb_fn) { 5196 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5197 } 5198 5199 ctx->namespaces_populated = true; 5200 if (ctx->probe_done) { 5201 /* The probe was already completed, so we need to free the context 5202 * here. 
This can happen for cases like OCSSD, where we need to 5203 * send additional commands to the SSD after attach. 5204 */ 5205 free_nvme_async_probe_ctx(ctx); 5206 } 5207 } 5208 5209 static void 5210 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5211 struct nvme_async_probe_ctx *ctx) 5212 { 5213 spdk_io_device_register(nvme_ctrlr, 5214 bdev_nvme_create_ctrlr_channel_cb, 5215 bdev_nvme_destroy_ctrlr_channel_cb, 5216 sizeof(struct nvme_ctrlr_channel), 5217 nvme_ctrlr->nbdev_ctrlr->name); 5218 5219 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5220 } 5221 5222 static void 5223 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5224 { 5225 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5226 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5227 5228 nvme_ctrlr->probe_ctx = NULL; 5229 5230 if (spdk_nvme_cpl_is_error(cpl)) { 5231 nvme_ctrlr_delete(nvme_ctrlr); 5232 5233 if (ctx != NULL) { 5234 ctx->reported_bdevs = 0; 5235 populate_namespaces_cb(ctx, -1); 5236 } 5237 return; 5238 } 5239 5240 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5241 } 5242 5243 static int 5244 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5245 struct nvme_async_probe_ctx *ctx) 5246 { 5247 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5248 const struct spdk_nvme_ctrlr_data *cdata; 5249 uint32_t ana_log_page_size; 5250 5251 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5252 5253 /* Set buffer size enough to include maximum number of allowed namespaces. 
*/ 5254 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5255 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5256 sizeof(uint32_t); 5257 5258 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5259 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5260 if (nvme_ctrlr->ana_log_page == NULL) { 5261 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5262 return -ENXIO; 5263 } 5264 5265 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5266 * Hence copy each descriptor to a temporary area when parsing it. 5267 * 5268 * Allocate a buffer whose size is as large as ANA log page buffer because 5269 * we do not know the size of a descriptor until actually reading it. 5270 */ 5271 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5272 if (nvme_ctrlr->copied_ana_desc == NULL) { 5273 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5274 return -ENOMEM; 5275 } 5276 5277 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5278 5279 nvme_ctrlr->probe_ctx = ctx; 5280 5281 /* Then, set the read size only to include the current active namespaces. */ 5282 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5283 5284 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5285 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5286 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5287 return -EINVAL; 5288 } 5289 5290 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5291 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5292 SPDK_NVME_GLOBAL_NS_TAG, 5293 nvme_ctrlr->ana_log_page, 5294 ana_log_page_size, 0, 5295 nvme_ctrlr_init_ana_log_page_done, 5296 nvme_ctrlr); 5297 } 5298 5299 /* hostnqn and subnqn were already verified before attaching a controller. 5300 * Hence check only the multipath capability and cntlid here. 
5301 */ 5302 static bool 5303 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5304 { 5305 struct nvme_ctrlr *tmp; 5306 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5307 5308 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5309 5310 if (!cdata->cmic.multi_ctrlr) { 5311 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5312 return false; 5313 } 5314 5315 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5316 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5317 5318 if (!tmp_cdata->cmic.multi_ctrlr) { 5319 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5320 return false; 5321 } 5322 if (cdata->cntlid == tmp_cdata->cntlid) { 5323 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5324 return false; 5325 } 5326 } 5327 5328 return true; 5329 } 5330 5331 static int 5332 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5333 { 5334 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5335 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5336 int rc = 0; 5337 5338 pthread_mutex_lock(&g_bdev_nvme_mutex); 5339 5340 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5341 if (nbdev_ctrlr != NULL) { 5342 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5343 rc = -EINVAL; 5344 goto exit; 5345 } 5346 } else { 5347 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5348 if (nbdev_ctrlr == NULL) { 5349 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5350 rc = -ENOMEM; 5351 goto exit; 5352 } 5353 nbdev_ctrlr->name = strdup(name); 5354 if (nbdev_ctrlr->name == NULL) { 5355 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5356 free(nbdev_ctrlr); 5357 goto exit; 5358 } 5359 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5360 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5361 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5362 } 5363 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5364 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5365 exit: 5366 
pthread_mutex_unlock(&g_bdev_nvme_mutex); 5367 return rc; 5368 } 5369 5370 static int 5371 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5372 const char *name, 5373 const struct spdk_nvme_transport_id *trid, 5374 struct nvme_async_probe_ctx *ctx) 5375 { 5376 struct nvme_ctrlr *nvme_ctrlr; 5377 struct nvme_path_id *path_id; 5378 const struct spdk_nvme_ctrlr_data *cdata; 5379 int rc; 5380 5381 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5382 if (nvme_ctrlr == NULL) { 5383 SPDK_ERRLOG("Failed to allocate device struct\n"); 5384 return -ENOMEM; 5385 } 5386 5387 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5388 if (rc != 0) { 5389 free(nvme_ctrlr); 5390 return rc; 5391 } 5392 5393 TAILQ_INIT(&nvme_ctrlr->trids); 5394 RB_INIT(&nvme_ctrlr->namespaces); 5395 5396 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5397 if (ctx != NULL) { 5398 if (ctx->drv_opts.tls_psk != NULL) { 5399 nvme_ctrlr->psk = spdk_keyring_get_key( 5400 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5401 if (nvme_ctrlr->psk == NULL) { 5402 /* Could only happen if the key was removed in the meantime */ 5403 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5404 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5405 rc = -ENOKEY; 5406 goto err; 5407 } 5408 } 5409 5410 if (ctx->drv_opts.dhchap_key != NULL) { 5411 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5412 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5413 if (nvme_ctrlr->dhchap_key == NULL) { 5414 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5415 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5416 rc = -ENOKEY; 5417 goto err; 5418 } 5419 } 5420 } 5421 5422 path_id = calloc(1, sizeof(*path_id)); 5423 if (path_id == NULL) { 5424 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5425 rc = -ENOMEM; 5426 goto err; 5427 } 5428 5429 path_id->trid = *trid; 5430 if (ctx != NULL) { 5431 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5432 
memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5433 } 5434 nvme_ctrlr->active_path_id = path_id; 5435 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5436 5437 nvme_ctrlr->thread = spdk_get_thread(); 5438 nvme_ctrlr->ctrlr = ctrlr; 5439 nvme_ctrlr->ref = 1; 5440 5441 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5442 SPDK_ERRLOG("OCSSDs are not supported"); 5443 rc = -ENOTSUP; 5444 goto err; 5445 } 5446 5447 if (ctx != NULL) { 5448 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5449 } else { 5450 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5451 } 5452 5453 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5454 g_opts.nvme_adminq_poll_period_us); 5455 5456 if (g_opts.timeout_us > 0) { 5457 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5458 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5459 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5460 g_opts.timeout_us : g_opts.timeout_admin_us; 5461 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5462 adm_timeout_us, timeout_cb, nvme_ctrlr); 5463 } 5464 5465 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5466 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5467 5468 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5469 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5470 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5471 } 5472 5473 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5474 if (rc != 0) { 5475 goto err; 5476 } 5477 5478 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5479 5480 if (cdata->cmic.ana_reporting) { 5481 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5482 if (rc == 0) { 5483 return 0; 5484 } 5485 } else { 5486 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5487 return 0; 5488 } 5489 5490 err: 5491 nvme_ctrlr_delete(nvme_ctrlr); 5492 return rc; 5493 } 5494 5495 void 5496 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5497 { 5498 opts->prchk_flags = 0; 5499 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5500 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5501 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5502 } 5503 5504 static void 5505 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5506 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5507 { 5508 char *name; 5509 5510 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5511 if (!name) { 5512 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5513 return; 5514 } 5515 5516 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5517 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5518 } else { 5519 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5520 } 5521 5522 free(name); 5523 } 5524 5525 static void 5526 _nvme_ctrlr_destruct(void *ctx) 5527 { 5528 struct nvme_ctrlr 
*nvme_ctrlr = ctx; 5529 5530 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5531 nvme_ctrlr_release(nvme_ctrlr); 5532 } 5533 5534 static int 5535 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5536 { 5537 struct nvme_probe_skip_entry *entry; 5538 5539 /* The controller's destruction was already started */ 5540 if (nvme_ctrlr->destruct) { 5541 return -EALREADY; 5542 } 5543 5544 if (!hotplug && 5545 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5546 entry = calloc(1, sizeof(*entry)); 5547 if (!entry) { 5548 return -ENOMEM; 5549 } 5550 entry->trid = nvme_ctrlr->active_path_id->trid; 5551 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5552 } 5553 5554 nvme_ctrlr->destruct = true; 5555 return 0; 5556 } 5557 5558 static int 5559 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5560 { 5561 int rc; 5562 5563 pthread_mutex_lock(&nvme_ctrlr->mutex); 5564 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5565 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5566 5567 if (rc == 0) { 5568 _nvme_ctrlr_destruct(nvme_ctrlr); 5569 } else if (rc == -EALREADY) { 5570 rc = 0; 5571 } 5572 5573 return rc; 5574 } 5575 5576 static void 5577 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5578 { 5579 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5580 5581 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5582 } 5583 5584 static int 5585 bdev_nvme_hotplug_probe(void *arg) 5586 { 5587 if (g_hotplug_probe_ctx == NULL) { 5588 spdk_poller_unregister(&g_hotplug_probe_poller); 5589 return SPDK_POLLER_IDLE; 5590 } 5591 5592 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5593 g_hotplug_probe_ctx = NULL; 5594 spdk_poller_unregister(&g_hotplug_probe_poller); 5595 } 5596 5597 return SPDK_POLLER_BUSY; 5598 } 5599 5600 static int 5601 bdev_nvme_hotplug(void *arg) 5602 { 5603 struct spdk_nvme_transport_id trid_pcie; 5604 5605 if (g_hotplug_probe_ctx) { 5606 return SPDK_POLLER_BUSY; 5607 } 5608 5609 
memset(&trid_pcie, 0, sizeof(trid_pcie)); 5610 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5611 5612 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5613 hotplug_probe_cb, attach_cb, NULL); 5614 5615 if (g_hotplug_probe_ctx) { 5616 assert(g_hotplug_probe_poller == NULL); 5617 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5618 } 5619 5620 return SPDK_POLLER_BUSY; 5621 } 5622 5623 void 5624 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5625 { 5626 *opts = g_opts; 5627 } 5628 5629 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5630 uint32_t reconnect_delay_sec, 5631 uint32_t fast_io_fail_timeout_sec); 5632 5633 static int 5634 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5635 { 5636 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5637 /* Can't set timeout_admin_us without also setting timeout_us */ 5638 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5639 return -EINVAL; 5640 } 5641 5642 if (opts->bdev_retry_count < -1) { 5643 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5644 return -EINVAL; 5645 } 5646 5647 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5648 opts->reconnect_delay_sec, 5649 opts->fast_io_fail_timeout_sec)) { 5650 return -EINVAL; 5651 } 5652 5653 return 0; 5654 } 5655 5656 int 5657 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5658 { 5659 int ret; 5660 5661 ret = bdev_nvme_validate_opts(opts); 5662 if (ret) { 5663 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5664 return ret; 5665 } 5666 5667 if (g_bdev_nvme_init_thread != NULL) { 5668 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5669 return -EPERM; 5670 } 5671 } 5672 5673 if (opts->rdma_srq_size != 0 || 5674 opts->rdma_max_cq_size != 0 || 5675 opts->rdma_cm_event_timeout_ms != 0) { 5676 struct spdk_nvme_transport_opts drv_opts; 
5677 5678 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5679 if (opts->rdma_srq_size != 0) { 5680 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5681 } 5682 if (opts->rdma_max_cq_size != 0) { 5683 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5684 } 5685 if (opts->rdma_cm_event_timeout_ms != 0) { 5686 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5687 } 5688 5689 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5690 if (ret) { 5691 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5692 return ret; 5693 } 5694 } 5695 5696 g_opts = *opts; 5697 5698 return 0; 5699 } 5700 5701 struct set_nvme_hotplug_ctx { 5702 uint64_t period_us; 5703 bool enabled; 5704 spdk_msg_fn fn; 5705 void *fn_ctx; 5706 }; 5707 5708 static void 5709 set_nvme_hotplug_period_cb(void *_ctx) 5710 { 5711 struct set_nvme_hotplug_ctx *ctx = _ctx; 5712 5713 spdk_poller_unregister(&g_hotplug_poller); 5714 if (ctx->enabled) { 5715 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5716 } 5717 5718 g_nvme_hotplug_poll_period_us = ctx->period_us; 5719 g_nvme_hotplug_enabled = ctx->enabled; 5720 if (ctx->fn) { 5721 ctx->fn(ctx->fn_ctx); 5722 } 5723 5724 free(ctx); 5725 } 5726 5727 int 5728 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5729 { 5730 struct set_nvme_hotplug_ctx *ctx; 5731 5732 if (enabled == true && !spdk_process_is_primary()) { 5733 return -EPERM; 5734 } 5735 5736 ctx = calloc(1, sizeof(*ctx)); 5737 if (ctx == NULL) { 5738 return -ENOMEM; 5739 } 5740 5741 period_us = period_us == 0 ? 
NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5742 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5743 ctx->enabled = enabled; 5744 ctx->fn = cb; 5745 ctx->fn_ctx = cb_ctx; 5746 5747 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5748 return 0; 5749 } 5750 5751 static void 5752 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5753 struct nvme_async_probe_ctx *ctx) 5754 { 5755 struct nvme_ns *nvme_ns; 5756 struct nvme_bdev *nvme_bdev; 5757 size_t j; 5758 5759 assert(nvme_ctrlr != NULL); 5760 5761 if (ctx->names == NULL) { 5762 ctx->reported_bdevs = 0; 5763 populate_namespaces_cb(ctx, 0); 5764 return; 5765 } 5766 5767 /* 5768 * Report the new bdevs that were created in this call. 5769 * There can be more than one bdev per NVMe controller. 5770 */ 5771 j = 0; 5772 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5773 while (nvme_ns != NULL) { 5774 nvme_bdev = nvme_ns->bdev; 5775 if (j < ctx->max_bdevs) { 5776 ctx->names[j] = nvme_bdev->disk.name; 5777 j++; 5778 } else { 5779 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 5780 ctx->max_bdevs); 5781 ctx->reported_bdevs = 0; 5782 populate_namespaces_cb(ctx, -ERANGE); 5783 return; 5784 } 5785 5786 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5787 } 5788 5789 ctx->reported_bdevs = j; 5790 populate_namespaces_cb(ctx, 0); 5791 } 5792 5793 static int 5794 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5795 struct spdk_nvme_ctrlr *new_ctrlr, 5796 struct spdk_nvme_transport_id *trid) 5797 { 5798 struct nvme_path_id *tmp_trid; 5799 5800 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5801 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5802 return -ENOTSUP; 5803 } 5804 5805 /* Currently we only support failover to the same transport type. 
*/ 5806 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5807 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5808 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5809 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5810 return -EINVAL; 5811 } 5812 5813 5814 /* Currently we only support failover to the same NQN. */ 5815 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5816 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5817 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5818 return -EINVAL; 5819 } 5820 5821 /* Skip all the other checks if we've already registered this path. */ 5822 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5823 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5824 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5825 trid->subnqn); 5826 return -EEXIST; 5827 } 5828 } 5829 5830 return 0; 5831 } 5832 5833 static int 5834 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5835 struct spdk_nvme_ctrlr *new_ctrlr) 5836 { 5837 struct nvme_ns *nvme_ns; 5838 struct spdk_nvme_ns *new_ns; 5839 5840 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5841 while (nvme_ns != NULL) { 5842 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5843 assert(new_ns != NULL); 5844 5845 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5846 return -EINVAL; 5847 } 5848 5849 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5850 } 5851 5852 return 0; 5853 } 5854 5855 static int 5856 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5857 struct spdk_nvme_transport_id *trid) 5858 { 5859 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5860 5861 new_trid = calloc(1, sizeof(*new_trid)); 5862 if (new_trid == NULL) { 5863 return -ENOMEM; 5864 } 5865 new_trid->trid = *trid; 5866 
5867 active_id = nvme_ctrlr->active_path_id; 5868 assert(active_id != NULL); 5869 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5870 5871 /* Skip the active trid not to replace it until it is failed. */ 5872 tmp_trid = TAILQ_NEXT(active_id, link); 5873 if (tmp_trid == NULL) { 5874 goto add_tail; 5875 } 5876 5877 /* It means the trid is faled if its last failed time is non-zero. 5878 * Insert the new alternate trid before any failed trid. 5879 */ 5880 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5881 if (tmp_trid->last_failed_tsc != 0) { 5882 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5883 return 0; 5884 } 5885 } 5886 5887 add_tail: 5888 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5889 return 0; 5890 } 5891 5892 /* This is the case that a secondary path is added to an existing 5893 * nvme_ctrlr for failover. After checking if it can access the same 5894 * namespaces as the primary path, it is disconnected until failover occurs. 5895 */ 5896 static int 5897 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5898 struct spdk_nvme_ctrlr *new_ctrlr, 5899 struct spdk_nvme_transport_id *trid) 5900 { 5901 int rc; 5902 5903 assert(nvme_ctrlr != NULL); 5904 5905 pthread_mutex_lock(&nvme_ctrlr->mutex); 5906 5907 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5908 if (rc != 0) { 5909 goto exit; 5910 } 5911 5912 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5913 if (rc != 0) { 5914 goto exit; 5915 } 5916 5917 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5918 5919 exit: 5920 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5921 5922 spdk_nvme_detach(new_ctrlr); 5923 5924 return rc; 5925 } 5926 5927 static void 5928 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5929 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5930 { 5931 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5932 struct nvme_async_probe_ctx *ctx; 5933 int rc; 5934 5935 ctx = 
SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5936 ctx->ctrlr_attached = true; 5937 5938 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5939 if (rc != 0) { 5940 ctx->reported_bdevs = 0; 5941 populate_namespaces_cb(ctx, rc); 5942 } 5943 } 5944 5945 static void 5946 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5947 struct spdk_nvme_ctrlr *ctrlr, 5948 const struct spdk_nvme_ctrlr_opts *opts) 5949 { 5950 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5951 struct nvme_ctrlr *nvme_ctrlr; 5952 struct nvme_async_probe_ctx *ctx; 5953 int rc; 5954 5955 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5956 ctx->ctrlr_attached = true; 5957 5958 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5959 if (nvme_ctrlr) { 5960 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5961 } else { 5962 rc = -ENODEV; 5963 } 5964 5965 ctx->reported_bdevs = 0; 5966 populate_namespaces_cb(ctx, rc); 5967 } 5968 5969 static int 5970 bdev_nvme_async_poll(void *arg) 5971 { 5972 struct nvme_async_probe_ctx *ctx = arg; 5973 int rc; 5974 5975 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5976 if (spdk_unlikely(rc != -EAGAIN)) { 5977 ctx->probe_done = true; 5978 spdk_poller_unregister(&ctx->poller); 5979 if (!ctx->ctrlr_attached) { 5980 /* The probe is done, but no controller was attached. 5981 * That means we had a failure, so report -EIO back to 5982 * the caller (usually the RPC). populate_namespaces_cb() 5983 * will take care of freeing the nvme_async_probe_ctx. 5984 */ 5985 ctx->reported_bdevs = 0; 5986 populate_namespaces_cb(ctx, -EIO); 5987 } else if (ctx->namespaces_populated) { 5988 /* The namespaces for the attached controller were all 5989 * populated and the response was already sent to the 5990 * caller (usually the RPC). So free the context here. 
5991 */ 5992 free_nvme_async_probe_ctx(ctx); 5993 } 5994 } 5995 5996 return SPDK_POLLER_BUSY; 5997 } 5998 5999 static bool 6000 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6001 uint32_t reconnect_delay_sec, 6002 uint32_t fast_io_fail_timeout_sec) 6003 { 6004 if (ctrlr_loss_timeout_sec < -1) { 6005 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6006 return false; 6007 } else if (ctrlr_loss_timeout_sec == -1) { 6008 if (reconnect_delay_sec == 0) { 6009 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6010 return false; 6011 } else if (fast_io_fail_timeout_sec != 0 && 6012 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6013 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io-fail_timeout_sec.\n"); 6014 return false; 6015 } 6016 } else if (ctrlr_loss_timeout_sec != 0) { 6017 if (reconnect_delay_sec == 0) { 6018 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6019 return false; 6020 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6021 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6022 return false; 6023 } else if (fast_io_fail_timeout_sec != 0) { 6024 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6025 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6026 return false; 6027 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6028 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6029 return false; 6030 } 6031 } 6032 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6033 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6034 return false; 6035 } 6036 6037 return true; 6038 } 6039 6040 static int 6041 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 6042 { 6043 FILE *psk_file; 6044 struct 
stat statbuf;
	int rc;
/* Permission bits that must all be clear on a PSK file: any group/other
 * access and owner execute (octal 0177).
 */
#define TCP_PSK_INVALID_PERMISSIONS 0177

	if (stat(fname, &statbuf) != 0) {
		SPDK_ERRLOG("Could not read permissions for PSK file\n");
		return -EACCES;
	}

	if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) {
		SPDK_ERRLOG("Incorrect permissions for PSK file\n");
		return -EPERM;
	}
	/* The file must be strictly smaller than the buffer so that the zero
	 * fill below leaves at least one terminating 0 byte.
	 */
	if ((size_t)statbuf.st_size >= bufsz) {
		SPDK_ERRLOG("Invalid PSK: too long\n");
		return -EINVAL;
	}
	psk_file = fopen(fname, "r");
	if (psk_file == NULL) {
		SPDK_ERRLOG("Could not open PSK file\n");
		return -EINVAL;
	}

	memset(buf, 0, bufsz);
	rc = fread(buf, 1, statbuf.st_size, psk_file);
	if (rc != statbuf.st_size) {
		SPDK_ERRLOG("Failed to read PSK\n");
		fclose(psk_file);
		return -EINVAL;
	}

	fclose(psk_file);
	return 0;
}

/*
 * Start an asynchronous connect to the controller identified by trid.
 *
 * trid      - transport ID to connect to; must not already be in use
 * base_name - controller name, 1 to SPDK_CONTROLLER_NAME_MAX - 1 characters;
 *             the pointer itself is stored in the probe context
 * names     - array stored in the probe context for reporting bdev names
 * count     - capacity of names, stored as max_bdevs
 * cb_fn     - completion callback stored in the probe context
 * cb_ctx    - argument stored alongside cb_fn
 * drv_opts  - NVMe driver options, or NULL for the driver defaults
 * bdev_opts - bdev-level controller options, or NULL for the defaults
 * multipath - when false and a controller named base_name already exists, the
 *             new connection is handled as a failover path
 *             (connect_set_failover_cb); otherwise connect_attach_cb is used
 *
 * Returns 0 once the connect has been started and the poller registered, or a
 * negative errno (-EEXIST, -EINVAL, -ENOMEM, -ENOKEY, -ENODEV, or the error
 * from loading the PSK file).
 */
int
bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 const char *base_name,
		 const char **names,
		 uint32_t count,
		 spdk_bdev_create_nvme_fn cb_fn,
		 void *cb_ctx,
		 struct spdk_nvme_ctrlr_opts *drv_opts,
		 struct nvme_ctrlr_opts *bdev_opts,
		 bool multipath)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	int rc, len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
	 */
	if (nvme_ctrlr_get(trid) != NULL) {
		SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr);
		return -EEXIST;
	}

	/* strnlen() returning the limit means the name is too long. */
	len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX);

	if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) {
		SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1);
		return -EINVAL;
	}

	if (bdev_opts != NULL &&
	    !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec,
			    bdev_opts->reconnect_delay_sec,
			    bdev_opts->fast_io_fail_timeout_sec)) {
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx) {
		return -ENOMEM;
	}
	ctx->base_name = base_name;
	ctx->names = names;
	ctx->max_bdevs = count;
	ctx->cb_fn = cb_fn;
	ctx->cb_ctx = cb_ctx;
	ctx->trid = *trid;

	if (bdev_opts) {
		memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
	} else {
		bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
	}

	/* An explicit create of a PCIe controller removes it from the list of
	 * controllers skipped by the hotplug monitor.
	 */
	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
			if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
				TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
				free(entry);
				break;
			}
		}
	}

	if (drv_opts) {
		memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
	} else {
		spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts));
	}

	/* These driver options always come from the module-wide g_opts,
	 * overriding whatever the caller supplied in drv_opts.
	 */
	ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count;
	ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout;
	ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
	ctx->drv_opts.disable_read_ana_log_page = true;
	ctx->drv_opts.transport_tos = g_opts.transport_tos;

	if (ctx->bdev_opts.psk[0] != '\0') {
		/* Try to use the keyring first */
		ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk);
		if (ctx->drv_opts.tls_psk == NULL) {
			/* Not a keyring name: treat the string as a path to a PSK file. */
			rc = bdev_nvme_load_psk(ctx->bdev_opts.psk,
						ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk));
			if (rc != 0) {
				SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk);
				free_nvme_async_probe_ctx(ctx);
				return rc;
			}
		}
	}

	if (ctx->bdev_opts.dhchap_key != NULL) {
		ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key);
		if (ctx->drv_opts.dhchap_key == NULL) {
			SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n",
				    ctx->bdev_opts.dhchap_key);
			free_nvme_async_probe_ctx(ctx);
			return -ENOKEY;
		}

		ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests;
		ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups;
	}

	if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) {
		attach_cb = connect_attach_cb;
	} else {
		attach_cb = connect_set_failover_cb;
	}

	ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb);
	if (ctx->probe_ctx == NULL) {
		SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
		free_nvme_async_probe_ctx(ctx);
		return -ENODEV;
	}
	ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);

	return 0;
}

/* State kept while bdev_nvme_delete() waits for a path to disappear. */
struct bdev_nvme_delete_ctx {
	/* Heap copy of the controller name; freed by free_bdev_nvme_delete_ctx() */
	char *name;
	/* Copy of the path filter given to bdev_nvme_delete() */
	struct nvme_path_id path_id;
	/* Completion callback and its argument */
	bdev_nvme_delete_done_fn delete_done;
	void *delete_done_ctx;
	/* Tick value after which waiting gives up */
	uint64_t timeout_ticks;
	/* Poller running bdev_nvme_delete_complete_poll() */
	struct spdk_poller *poller;
};

/* Free a delete context and the name it owns; NULL is accepted. */
static void
free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx)
{
	if (ctx != NULL) {
		free(ctx->name);
		free(ctx);
	}
}

/*
 * Return true if path p matches the filter path_id. Only the filter fields
 * that are set (non-zero / not all-zero) take part in the comparison.
 */
static bool
nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id)
{
	if (path_id->trid.trtype != 0) {
		if (path_id->trid.trtype ==
SPDK_NVME_TRANSPORT_CUSTOM) {
			/* Custom transports are told apart by their transport string. */
			if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
				return false;
			}
		} else {
			if (path_id->trid.trtype != p->trid.trtype) {
				return false;
			}
		}
	}

	/* traddr and trsvcid are compared case-insensitively; subnqn and the
	 * host address fields below are compared exactly.
	 */
	if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
		if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
			return false;
		}
	}

	if (path_id->trid.adrfam != 0) {
		if (path_id->trid.adrfam != p->trid.adrfam) {
			return false;
		}
	}

	if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
		if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
			return false;
		}
	}

	if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
		if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
			return false;
		}
	}

	if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
		if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
			return false;
		}
	}

	if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
		if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
			return false;
		}
	}

	return true;
}

/*
 * Return true if any controller registered under name still has a path that
 * matches path_id. Takes g_bdev_nvme_mutex for the lookup and each
 * controller's mutex while walking its path list.
 */
static bool
nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id)
{
	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
	struct nvme_ctrlr	*ctrlr;
	struct nvme_path_id	*p;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (!nbdev_ctrlr) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		return false;
	}

	TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		pthread_mutex_lock(&ctrlr->mutex);
		TAILQ_FOREACH(p, &ctrlr->trids, link) {
			if (nvme_path_id_compare(p, path_id)) {
				pthread_mutex_unlock(&ctrlr->mutex);
6291 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6292 return true; 6293 } 6294 } 6295 pthread_mutex_unlock(&ctrlr->mutex); 6296 } 6297 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6298 6299 return false; 6300 } 6301 6302 static int 6303 bdev_nvme_delete_complete_poll(void *arg) 6304 { 6305 struct bdev_nvme_delete_ctx *ctx = arg; 6306 int rc = 0; 6307 6308 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6309 if (ctx->timeout_ticks > spdk_get_ticks()) { 6310 return SPDK_POLLER_BUSY; 6311 } 6312 6313 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6314 rc = -ETIMEDOUT; 6315 } 6316 6317 spdk_poller_unregister(&ctx->poller); 6318 6319 ctx->delete_done(ctx->delete_done_ctx, rc); 6320 free_bdev_nvme_delete_ctx(ctx); 6321 6322 return SPDK_POLLER_BUSY; 6323 } 6324 6325 static int 6326 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6327 { 6328 struct nvme_path_id *p, *t; 6329 spdk_msg_fn msg_fn; 6330 int rc = -ENXIO; 6331 6332 pthread_mutex_lock(&nvme_ctrlr->mutex); 6333 6334 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6335 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6336 break; 6337 } 6338 6339 if (!nvme_path_id_compare(p, path_id)) { 6340 continue; 6341 } 6342 6343 /* We are not using the specified path. */ 6344 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6345 free(p); 6346 rc = 0; 6347 } 6348 6349 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6350 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6351 return rc; 6352 } 6353 6354 /* If we made it here, then this path is a match! Now we need to remove it. */ 6355 6356 /* This is the active path in use right now. The active path is always the first in the list. */ 6357 assert(p == nvme_ctrlr->active_path_id); 6358 6359 if (!TAILQ_NEXT(p, link)) { 6360 /* The current path is the only path. */ 6361 msg_fn = _nvme_ctrlr_destruct; 6362 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6363 } else { 6364 /* There is an alternative path. 
*/
		msg_fn = _bdev_nvme_reset_ctrlr;
		rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true);
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	/* The follow-up work runs on the controller's own thread; an operation
	 * that was already in progress (-EALREADY) counts as success.
	 */
	if (rc == 0) {
		spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
	} else if (rc == -EALREADY) {
		rc = 0;
	}

	return rc;
}

/*
 * Delete the paths matching path_id from every controller registered under
 * name.
 *
 * name            - controller name to look up; must not be NULL
 * path_id         - path filter handed to _bdev_nvme_delete(); must not be NULL
 * delete_done     - optional callback invoked by a poller once the path is
 *                   gone or the wait times out
 * delete_done_ctx - argument for delete_done
 *
 * Returns 0 if at least one controller removed a matching path, -EINVAL for
 * NULL arguments, -ENODEV if the name is unknown, -ENXIO if nothing matched,
 * or another negative errno propagated from _bdev_nvme_delete() or from
 * setting up the completion poller.
 */
int
bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
		 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx)
{
	struct nvme_bdev_ctrlr	*nbdev_ctrlr;
	struct nvme_ctrlr	*nvme_ctrlr, *tmp_nvme_ctrlr;
	struct bdev_nvme_delete_ctx *ctx = NULL;
	int			rc = -ENXIO, _rc;

	if (name == NULL || path_id == NULL) {
		rc = -EINVAL;
		goto exit;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);

	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr == NULL) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
		rc = -ENODEV;
		goto exit;
	}

	TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
		_rc = _bdev_nvme_delete(nvme_ctrlr, path_id);
		if (_rc < 0 && _rc != -ENXIO) {
			pthread_mutex_unlock(&g_bdev_nvme_mutex);
			rc = _rc;
			goto exit;
		} else if (_rc == 0) {
			/* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr
			 * was deleted successfully. To remember the successful deletion,
			 * overwrite rc only if _rc is zero.
6415 */ 6416 rc = 0; 6417 } 6418 } 6419 6420 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6421 6422 if (rc != 0 || delete_done == NULL) { 6423 goto exit; 6424 } 6425 6426 ctx = calloc(1, sizeof(*ctx)); 6427 if (ctx == NULL) { 6428 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6429 rc = -ENOMEM; 6430 goto exit; 6431 } 6432 6433 ctx->name = strdup(name); 6434 if (ctx->name == NULL) { 6435 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6436 rc = -ENOMEM; 6437 goto exit; 6438 } 6439 6440 ctx->delete_done = delete_done; 6441 ctx->delete_done_ctx = delete_done_ctx; 6442 ctx->path_id = *path_id; 6443 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6444 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6445 if (ctx->poller == NULL) { 6446 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6447 rc = -ENOMEM; 6448 goto exit; 6449 } 6450 6451 exit: 6452 if (rc != 0) { 6453 free_bdev_nvme_delete_ctx(ctx); 6454 } 6455 6456 return rc; 6457 } 6458 6459 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6460 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6461 6462 #define DISCOVERY_ERRLOG(ctx, format, ...) 
\ 6463 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6464 6465 struct discovery_entry_ctx { 6466 char name[128]; 6467 struct spdk_nvme_transport_id trid; 6468 struct spdk_nvme_ctrlr_opts drv_opts; 6469 struct spdk_nvmf_discovery_log_page_entry entry; 6470 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6471 struct discovery_ctx *ctx; 6472 }; 6473 6474 struct discovery_ctx { 6475 char *name; 6476 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6477 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6478 void *cb_ctx; 6479 struct spdk_nvme_probe_ctx *probe_ctx; 6480 struct spdk_nvme_detach_ctx *detach_ctx; 6481 struct spdk_nvme_ctrlr *ctrlr; 6482 struct spdk_nvme_transport_id trid; 6483 struct discovery_entry_ctx *entry_ctx_in_use; 6484 struct spdk_poller *poller; 6485 struct spdk_nvme_ctrlr_opts drv_opts; 6486 struct nvme_ctrlr_opts bdev_opts; 6487 struct spdk_nvmf_discovery_log_page *log_page; 6488 TAILQ_ENTRY(discovery_ctx) tailq; 6489 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6490 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6491 int rc; 6492 bool wait_for_attach; 6493 uint64_t timeout_ticks; 6494 /* Denotes that the discovery service is being started. We're waiting 6495 * for the initial connection to the discovery controller to be 6496 * established and attach discovered NVM ctrlrs. 6497 */ 6498 bool initializing; 6499 /* Denotes if a discovery is currently in progress for this context. 6500 * That includes connecting to newly discovered subsystems. Used to 6501 * ensure we do not start a new discovery until an existing one is 6502 * complete. 6503 */ 6504 bool in_progress; 6505 6506 /* Denotes if another discovery is needed after the one in progress 6507 * completes. Set when we receive an AER completion while a discovery 6508 * is already in progress. 
6509 */ 6510 bool pending; 6511 6512 /* Signal to the discovery context poller that it should stop the 6513 * discovery service, including detaching from the current discovery 6514 * controller. 6515 */ 6516 bool stop; 6517 6518 struct spdk_thread *calling_thread; 6519 uint32_t index; 6520 uint32_t attach_in_progress; 6521 char *hostnqn; 6522 6523 /* Denotes if the discovery service was started by the mdns discovery. 6524 */ 6525 bool from_mdns_discovery_service; 6526 }; 6527 6528 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6529 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6530 6531 static void get_discovery_log_page(struct discovery_ctx *ctx); 6532 6533 static void 6534 free_discovery_ctx(struct discovery_ctx *ctx) 6535 { 6536 free(ctx->log_page); 6537 free(ctx->hostnqn); 6538 free(ctx->name); 6539 free(ctx); 6540 } 6541 6542 static void 6543 discovery_complete(struct discovery_ctx *ctx) 6544 { 6545 ctx->initializing = false; 6546 ctx->in_progress = false; 6547 if (ctx->pending) { 6548 ctx->pending = false; 6549 get_discovery_log_page(ctx); 6550 } 6551 } 6552 6553 static void 6554 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6555 struct spdk_nvmf_discovery_log_page_entry *entry) 6556 { 6557 char *space; 6558 6559 trid->trtype = entry->trtype; 6560 trid->adrfam = entry->adrfam; 6561 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6562 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6563 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6564 * before call to this function trid->subnqn is zeroed out, we need 6565 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6566 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6567 */ 6568 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6569 6570 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 
6571 * But the log page entries typically pad them with spaces, not zeroes. 6572 * So add a NULL terminator to each of these fields at the appropriate 6573 * location. 6574 */ 6575 space = strchr(trid->traddr, ' '); 6576 if (space) { 6577 *space = 0; 6578 } 6579 space = strchr(trid->trsvcid, ' '); 6580 if (space) { 6581 *space = 0; 6582 } 6583 space = strchr(trid->subnqn, ' '); 6584 if (space) { 6585 *space = 0; 6586 } 6587 } 6588 6589 static void 6590 _stop_discovery(void *_ctx) 6591 { 6592 struct discovery_ctx *ctx = _ctx; 6593 6594 if (ctx->attach_in_progress > 0) { 6595 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6596 return; 6597 } 6598 6599 ctx->stop = true; 6600 6601 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6602 struct discovery_entry_ctx *entry_ctx; 6603 struct nvme_path_id path = {}; 6604 6605 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6606 path.trid = entry_ctx->trid; 6607 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6608 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6609 free(entry_ctx); 6610 } 6611 6612 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6613 struct discovery_entry_ctx *entry_ctx; 6614 6615 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6616 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6617 free(entry_ctx); 6618 } 6619 6620 free(ctx->entry_ctx_in_use); 6621 ctx->entry_ctx_in_use = NULL; 6622 } 6623 6624 static void 6625 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6626 { 6627 ctx->stop_cb_fn = cb_fn; 6628 ctx->cb_ctx = cb_ctx; 6629 6630 if (ctx->attach_in_progress > 0) { 6631 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6632 ctx->attach_in_progress); 6633 } 6634 6635 _stop_discovery(ctx); 6636 } 6637 6638 static void 6639 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6640 { 6641 struct discovery_ctx *d_ctx; 6642 struct nvme_path_id *path_id; 6643 struct spdk_nvme_transport_id 
trid = {}; 6644 struct discovery_entry_ctx *entry_ctx, *tmp; 6645 6646 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6647 6648 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6649 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6650 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6651 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6652 continue; 6653 } 6654 6655 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6656 free(entry_ctx); 6657 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6658 trid.subnqn, trid.traddr, trid.trsvcid); 6659 6660 /* Fail discovery ctrlr to force reattach attempt */ 6661 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6662 } 6663 } 6664 } 6665 6666 static void 6667 discovery_remove_controllers(struct discovery_ctx *ctx) 6668 { 6669 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6670 struct discovery_entry_ctx *entry_ctx, *tmp; 6671 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6672 struct spdk_nvme_transport_id old_trid = {}; 6673 uint64_t numrec, i; 6674 bool found; 6675 6676 numrec = from_le64(&log_page->numrec); 6677 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6678 found = false; 6679 old_entry = &entry_ctx->entry; 6680 build_trid_from_log_page_entry(&old_trid, old_entry); 6681 for (i = 0; i < numrec; i++) { 6682 new_entry = &log_page->entries[i]; 6683 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6684 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6685 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6686 found = true; 6687 break; 6688 } 6689 } 6690 if (!found) { 6691 struct nvme_path_id path = {}; 6692 6693 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6694 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6695 6696 path.trid = entry_ctx->trid; 6697 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6698 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6699 free(entry_ctx); 6700 } 6701 
} 6702 free(log_page); 6703 ctx->log_page = NULL; 6704 discovery_complete(ctx); 6705 } 6706 6707 static void 6708 complete_discovery_start(struct discovery_ctx *ctx, int status) 6709 { 6710 ctx->timeout_ticks = 0; 6711 ctx->rc = status; 6712 if (ctx->start_cb_fn) { 6713 ctx->start_cb_fn(ctx->cb_ctx, status); 6714 ctx->start_cb_fn = NULL; 6715 ctx->cb_ctx = NULL; 6716 } 6717 } 6718 6719 static void 6720 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6721 { 6722 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6723 struct discovery_ctx *ctx = entry_ctx->ctx; 6724 6725 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6726 ctx->attach_in_progress--; 6727 if (ctx->attach_in_progress == 0) { 6728 complete_discovery_start(ctx, ctx->rc); 6729 if (ctx->initializing && ctx->rc != 0) { 6730 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6731 stop_discovery(ctx, NULL, ctx->cb_ctx); 6732 } else { 6733 discovery_remove_controllers(ctx); 6734 } 6735 } 6736 } 6737 6738 static struct discovery_entry_ctx * 6739 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6740 { 6741 struct discovery_entry_ctx *new_ctx; 6742 6743 new_ctx = calloc(1, sizeof(*new_ctx)); 6744 if (new_ctx == NULL) { 6745 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6746 return NULL; 6747 } 6748 6749 new_ctx->ctx = ctx; 6750 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6751 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6752 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6753 return new_ctx; 6754 } 6755 6756 static void 6757 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6758 struct spdk_nvmf_discovery_log_page *log_page) 6759 { 6760 struct discovery_ctx *ctx = cb_arg; 6761 struct discovery_entry_ctx *entry_ctx, *tmp; 6762 struct spdk_nvmf_discovery_log_page_entry *new_entry, 
*old_entry; 6763 uint64_t numrec, i; 6764 bool found; 6765 6766 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6767 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6768 return; 6769 } 6770 6771 ctx->log_page = log_page; 6772 assert(ctx->attach_in_progress == 0); 6773 numrec = from_le64(&log_page->numrec); 6774 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6775 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6776 free(entry_ctx); 6777 } 6778 for (i = 0; i < numrec; i++) { 6779 found = false; 6780 new_entry = &log_page->entries[i]; 6781 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6782 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6783 struct discovery_entry_ctx *new_ctx; 6784 struct spdk_nvme_transport_id trid = {}; 6785 6786 build_trid_from_log_page_entry(&trid, new_entry); 6787 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6788 if (new_ctx == NULL) { 6789 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6790 break; 6791 } 6792 6793 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6794 continue; 6795 } 6796 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6797 old_entry = &entry_ctx->entry; 6798 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6799 found = true; 6800 break; 6801 } 6802 } 6803 if (!found) { 6804 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6805 struct discovery_ctx *d_ctx; 6806 6807 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6808 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6809 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6810 sizeof(new_entry->subnqn))) { 6811 break; 6812 } 6813 } 6814 if (subnqn_ctx) { 6815 break; 6816 } 6817 } 6818 6819 new_ctx = calloc(1, sizeof(*new_ctx)); 6820 if (new_ctx == NULL) { 6821 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6822 break; 6823 } 6824 6825 new_ctx->ctx = ctx; 6826 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6827 
build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6828 if (subnqn_ctx) { 6829 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6830 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6831 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6832 new_ctx->name); 6833 } else { 6834 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6835 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6836 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6837 new_ctx->name); 6838 } 6839 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6840 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6841 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6842 discovery_attach_controller_done, new_ctx, 6843 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6844 if (rc == 0) { 6845 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6846 ctx->attach_in_progress++; 6847 } else { 6848 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", spdk_strerror(-rc)); 6849 } 6850 } 6851 } 6852 6853 if (ctx->attach_in_progress == 0) { 6854 discovery_remove_controllers(ctx); 6855 } 6856 } 6857 6858 static void 6859 get_discovery_log_page(struct discovery_ctx *ctx) 6860 { 6861 int rc; 6862 6863 assert(ctx->in_progress == false); 6864 ctx->in_progress = true; 6865 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6866 if (rc != 0) { 6867 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6868 } 6869 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6870 } 6871 6872 static void 6873 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6874 { 6875 struct discovery_ctx *ctx = arg; 6876 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6877 6878 if (spdk_nvme_cpl_is_error(cpl)) { 6879 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6880 return; 6881 } 6882 6883 
/*
 * Poller driving one discovery context.  Each invocation handles exactly one
 * of the following states, checked in this order:
 *
 *  1. A detach of the discovery controller is in flight (ctx->detach_ctx).
 *  2. A stop was requested (ctx->stop): detach the controller if there is
 *     one, then unregister, unlink and free the context.
 *  3. No controller and no connect in flight: start connecting to the next
 *     discovery entry, or give up if the attach timeout expired.
 *  4. A connect is in flight (ctx->probe_ctx): poll it, then fetch the log
 *     page once connected.
 *  5. Steady state: process admin completions on the discovery controller and
 *     fall back to a reconnect if that fails.
 *
 * Always returns SPDK_POLLER_BUSY.
 */
static int
discovery_poller(void *arg)
{
	struct discovery_ctx *ctx = arg;
	struct spdk_nvme_transport_id *trid;
	int rc;

	if (ctx->detach_ctx) {
		/* Anything other than -EAGAIN ends the detach; forget the controller. */
		rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
		if (rc != -EAGAIN) {
			ctx->detach_ctx = NULL;
			ctx->ctrlr = NULL;
		}
	} else if (ctx->stop) {
		if (ctx->ctrlr != NULL) {
			/* On success the detach is polled by state 1 on later invocations. */
			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc == 0) {
				return SPDK_POLLER_BUSY;
			}
			DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
		}
		/* Final teardown: ctx must not be touched after free_discovery_ctx(). */
		spdk_poller_unregister(&ctx->poller);
		TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
		assert(ctx->start_cb_fn == NULL);
		if (ctx->stop_cb_fn != NULL) {
			ctx->stop_cb_fn(ctx->cb_ctx);
		}
		free_discovery_ctx(ctx);
	} else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			assert(ctx->initializing);
			spdk_poller_unregister(&ctx->poller);
			TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
			complete_discovery_start(ctx, -ETIMEDOUT);
			/* NOTE(review): ctx is freed right after stop_discovery(); this
			 * presumably relies on attach_in_progress being 0 here so that
			 * _stop_discovery() does not defer itself - confirm.
			 */
			stop_discovery(ctx, NULL, NULL);
			free_discovery_ctx(ctx);
			return SPDK_POLLER_BUSY;
		}

		/* Take the next discovery entry off the list and try to connect to it. */
		assert(ctx->entry_ctx_in_use == NULL);
		ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
		TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
		trid = &ctx->entry_ctx_in_use->trid;
		ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb);
		if (ctx->probe_ctx) {
			/* Re-register with the shorter period (1000) while connecting. */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
		} else {
			/* Put the entry back at the tail so another one is tried next. */
			DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n");
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;
		}
	} else if (ctx->probe_ctx) {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			/* Records -ETIMEDOUT in ctx->rc; the probe keeps being polled. */
			complete_discovery_start(ctx, -ETIMEDOUT);
			return SPDK_POLLER_BUSY;
		}

		rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
		if (rc != -EAGAIN) {
			if (ctx->rc != 0) {
				/* An error (e.g. the timeout above) was recorded: stop. */
				assert(ctx->initializing);
				stop_discovery(ctx, NULL, ctx->cb_ctx);
			} else {
				assert(rc == 0);
				DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n");
				ctx->rc = rc;
				get_discovery_log_page(ctx);
			}
		}
	} else {
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n");
			complete_discovery_start(ctx, -ETIMEDOUT);
			/* We need to wait until all NVM ctrlrs are attached before we stop the
			 * discovery service to make sure we don't detach a ctrlr that is still
			 * being attached.
			 */
			if (ctx->attach_in_progress == 0) {
				stop_discovery(ctx, NULL, ctx->cb_ctx);
				return SPDK_POLLER_BUSY;
			}
		}

		rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
		if (rc < 0) {
			/* Go back to the longer period (1000 * 1000), requeue the
			 * entry in use and detach so that state 3 reconnects later.
			 */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;

			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc != 0) {
				DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
				ctx->ctrlr = NULL;
			}
		}
	}

	return SPDK_POLLER_BUSY;
}
sizeof(*ctx)); 7064 if (ctx == NULL) { 7065 return -ENOMEM; 7066 } 7067 7068 ctx->name = strdup(base_name); 7069 if (ctx->name == NULL) { 7070 free_discovery_ctx(ctx); 7071 return -ENOMEM; 7072 } 7073 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 7074 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7075 ctx->from_mdns_discovery_service = from_mdns; 7076 ctx->bdev_opts.from_discovery_service = true; 7077 ctx->calling_thread = spdk_get_thread(); 7078 ctx->start_cb_fn = cb_fn; 7079 ctx->cb_ctx = cb_ctx; 7080 ctx->initializing = true; 7081 if (ctx->start_cb_fn) { 7082 /* We can use this when dumping json to denote if this RPC parameter 7083 * was specified or not. 7084 */ 7085 ctx->wait_for_attach = true; 7086 } 7087 if (attach_timeout != 0) { 7088 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7089 spdk_get_ticks_hz() / 1000ull; 7090 } 7091 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7092 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7093 memcpy(&ctx->trid, trid, sizeof(*trid)); 7094 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7095 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7096 if (ctx->hostnqn == NULL) { 7097 free_discovery_ctx(ctx); 7098 return -ENOMEM; 7099 } 7100 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7101 if (discovery_entry_ctx == NULL) { 7102 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7103 free_discovery_ctx(ctx); 7104 return -ENOMEM; 7105 } 7106 7107 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7108 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7109 return 0; 7110 } 7111 7112 int 7113 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7114 { 7115 struct discovery_ctx *ctx; 7116 7117 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7118 if (strcmp(name, ctx->name) == 0) { 7119 if (ctx->stop) { 7120 return -EALREADY; 7121 } 7122 /* If we're still starting the discovery service and 
->rc is non-zero, we're 7123 * going to stop it as soon as we can 7124 */ 7125 if (ctx->initializing && ctx->rc != 0) { 7126 return -EALREADY; 7127 } 7128 stop_discovery(ctx, cb_fn, cb_ctx); 7129 return 0; 7130 } 7131 } 7132 7133 return -ENOENT; 7134 } 7135 7136 static int 7137 bdev_nvme_library_init(void) 7138 { 7139 g_bdev_nvme_init_thread = spdk_get_thread(); 7140 7141 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7142 bdev_nvme_destroy_poll_group_cb, 7143 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7144 7145 return 0; 7146 } 7147 7148 static void 7149 bdev_nvme_fini_destruct_ctrlrs(void) 7150 { 7151 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7152 struct nvme_ctrlr *nvme_ctrlr; 7153 7154 pthread_mutex_lock(&g_bdev_nvme_mutex); 7155 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7156 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7157 pthread_mutex_lock(&nvme_ctrlr->mutex); 7158 if (nvme_ctrlr->destruct) { 7159 /* This controller's destruction was already started 7160 * before the application started shutting down 7161 */ 7162 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7163 continue; 7164 } 7165 nvme_ctrlr->destruct = true; 7166 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7167 7168 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7169 nvme_ctrlr); 7170 } 7171 } 7172 7173 g_bdev_nvme_module_finish = true; 7174 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7175 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7176 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7177 spdk_bdev_module_fini_done(); 7178 return; 7179 } 7180 7181 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7182 } 7183 7184 static void 7185 check_discovery_fini(void *arg) 7186 { 7187 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7188 bdev_nvme_fini_destruct_ctrlrs(); 7189 } 7190 } 7191 7192 static void 7193 bdev_nvme_library_fini(void) 7194 { 7195 struct nvme_probe_skip_entry *entry, *entry_tmp; 7196 struct discovery_ctx *ctx; 7197 7198 
spdk_poller_unregister(&g_hotplug_poller); 7199 free(g_hotplug_probe_ctx); 7200 g_hotplug_probe_ctx = NULL; 7201 7202 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7203 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7204 free(entry); 7205 } 7206 7207 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7208 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7209 bdev_nvme_fini_destruct_ctrlrs(); 7210 } else { 7211 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7212 stop_discovery(ctx, check_discovery_fini, NULL); 7213 } 7214 } 7215 } 7216 7217 static void 7218 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7219 { 7220 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7221 struct spdk_bdev *bdev = bdev_io->bdev; 7222 struct spdk_dif_ctx dif_ctx; 7223 struct spdk_dif_error err_blk = {}; 7224 int rc; 7225 struct spdk_dif_ctx_init_ext_opts dif_opts; 7226 7227 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7228 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7229 rc = spdk_dif_ctx_init(&dif_ctx, 7230 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7231 bdev->dif_is_head_of_md, bdev->dif_type, 7232 bdev_io->u.bdev.dif_check_flags, 7233 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7234 if (rc != 0) { 7235 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7236 return; 7237 } 7238 7239 if (bdev->md_interleave) { 7240 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7241 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7242 } else { 7243 struct iovec md_iov = { 7244 .iov_base = bdev_io->u.bdev.md_buf, 7245 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7246 }; 7247 7248 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7249 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7250 } 7251 7252 if (rc != 0) { 7253 SPDK_ERRLOG("DIF error detected. 
type=%d, offset=%" PRIu32 "\n", 7254 err_blk.err_type, err_blk.err_offset); 7255 } else { 7256 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7257 } 7258 } 7259 7260 static void 7261 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7262 { 7263 struct nvme_bdev_io *bio = ref; 7264 7265 if (spdk_nvme_cpl_is_success(cpl)) { 7266 /* Run PI verification for read data buffer. */ 7267 bdev_nvme_verify_pi_error(bio); 7268 } 7269 7270 /* Return original completion status */ 7271 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7272 } 7273 7274 static void 7275 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7276 { 7277 struct nvme_bdev_io *bio = ref; 7278 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7279 int ret; 7280 7281 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7282 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7283 cpl->status.sct, cpl->status.sc); 7284 7285 /* Save completion status to use after verifying PI error. */ 7286 bio->cpl = *cpl; 7287 7288 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7289 /* Read without PI checking to verify PI error. */ 7290 ret = bdev_nvme_no_pi_readv(bio, 7291 bdev_io->u.bdev.iovs, 7292 bdev_io->u.bdev.iovcnt, 7293 bdev_io->u.bdev.md_buf, 7294 bdev_io->u.bdev.num_blocks, 7295 bdev_io->u.bdev.offset_blocks); 7296 if (ret == 0) { 7297 return; 7298 } 7299 } 7300 } 7301 7302 bdev_nvme_io_complete_nvme_status(bio, cpl); 7303 } 7304 7305 static void 7306 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7307 { 7308 struct nvme_bdev_io *bio = ref; 7309 7310 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7311 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7312 cpl->status.sct, cpl->status.sc); 7313 /* Run PI verification for write data buffer if PI error is detected. 
*/ 7314 bdev_nvme_verify_pi_error(bio); 7315 } 7316 7317 bdev_nvme_io_complete_nvme_status(bio, cpl); 7318 } 7319 7320 static void 7321 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7322 { 7323 struct nvme_bdev_io *bio = ref; 7324 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7325 7326 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7327 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7328 */ 7329 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7330 7331 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7332 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7333 cpl->status.sct, cpl->status.sc); 7334 /* Run PI verification for zone append data buffer if PI error is detected. */ 7335 bdev_nvme_verify_pi_error(bio); 7336 } 7337 7338 bdev_nvme_io_complete_nvme_status(bio, cpl); 7339 } 7340 7341 static void 7342 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7343 { 7344 struct nvme_bdev_io *bio = ref; 7345 7346 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7347 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7348 cpl->status.sct, cpl->status.sc); 7349 /* Run PI verification for compare data buffer if PI error is detected. */ 7350 bdev_nvme_verify_pi_error(bio); 7351 } 7352 7353 bdev_nvme_io_complete_nvme_status(bio, cpl); 7354 } 7355 7356 static void 7357 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7358 { 7359 struct nvme_bdev_io *bio = ref; 7360 7361 /* Compare operation completion */ 7362 if (!bio->first_fused_completed) { 7363 /* Save compare result for write callback */ 7364 bio->cpl = *cpl; 7365 bio->first_fused_completed = true; 7366 return; 7367 } 7368 7369 /* Write operation completion */ 7370 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7371 /* If bio->cpl is already an error, it means the compare operation failed. 
In that case, 7372 * complete the IO with the compare operation's status. 7373 */ 7374 if (!spdk_nvme_cpl_is_error(cpl)) { 7375 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7376 } 7377 7378 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7379 } else { 7380 bdev_nvme_io_complete_nvme_status(bio, cpl); 7381 } 7382 } 7383 7384 static void 7385 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7386 { 7387 struct nvme_bdev_io *bio = ref; 7388 7389 bdev_nvme_io_complete_nvme_status(bio, cpl); 7390 } 7391 7392 static int 7393 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7394 { 7395 switch (desc->zt) { 7396 case SPDK_NVME_ZONE_TYPE_SEQWR: 7397 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7398 break; 7399 default: 7400 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7401 return -EIO; 7402 } 7403 7404 switch (desc->zs) { 7405 case SPDK_NVME_ZONE_STATE_EMPTY: 7406 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7407 break; 7408 case SPDK_NVME_ZONE_STATE_IOPEN: 7409 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7410 break; 7411 case SPDK_NVME_ZONE_STATE_EOPEN: 7412 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7413 break; 7414 case SPDK_NVME_ZONE_STATE_CLOSED: 7415 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7416 break; 7417 case SPDK_NVME_ZONE_STATE_RONLY: 7418 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7419 break; 7420 case SPDK_NVME_ZONE_STATE_FULL: 7421 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7422 break; 7423 case SPDK_NVME_ZONE_STATE_OFFLINE: 7424 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7425 break; 7426 default: 7427 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7428 return -EIO; 7429 } 7430 7431 info->zone_id = desc->zslba; 7432 info->write_pointer = desc->wp; 7433 info->capacity = desc->zcap; 7434 7435 return 0; 7436 } 7437 7438 static void 7439 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7440 { 7441 struct nvme_bdev_io 
*bio = ref; 7442 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7443 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7444 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7445 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7446 uint64_t max_zones_per_buf, i; 7447 uint32_t zone_report_bufsize; 7448 struct spdk_nvme_ns *ns; 7449 struct spdk_nvme_qpair *qpair; 7450 int ret; 7451 7452 if (spdk_nvme_cpl_is_error(cpl)) { 7453 goto out_complete_io_nvme_cpl; 7454 } 7455 7456 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7457 ret = -ENXIO; 7458 goto out_complete_io_ret; 7459 } 7460 7461 ns = bio->io_path->nvme_ns->ns; 7462 qpair = bio->io_path->qpair->qpair; 7463 7464 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7465 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7466 sizeof(bio->zone_report_buf->descs[0]); 7467 7468 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7469 ret = -EINVAL; 7470 goto out_complete_io_ret; 7471 } 7472 7473 if (!bio->zone_report_buf->nr_zones) { 7474 ret = -EINVAL; 7475 goto out_complete_io_ret; 7476 } 7477 7478 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7479 ret = fill_zone_from_report(&info[bio->handled_zones], 7480 &bio->zone_report_buf->descs[i]); 7481 if (ret) { 7482 goto out_complete_io_ret; 7483 } 7484 bio->handled_zones++; 7485 } 7486 7487 if (bio->handled_zones < zones_to_copy) { 7488 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7489 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7490 7491 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7492 ret = spdk_nvme_zns_report_zones(ns, qpair, 7493 bio->zone_report_buf, zone_report_bufsize, 7494 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7495 bdev_nvme_get_zone_info_done, bio); 7496 if (!ret) { 7497 return; 7498 } else { 7499 goto out_complete_io_ret; 7500 } 7501 } 7502 7503 out_complete_io_nvme_cpl: 7504 
free(bio->zone_report_buf); 7505 bio->zone_report_buf = NULL; 7506 bdev_nvme_io_complete_nvme_status(bio, cpl); 7507 return; 7508 7509 out_complete_io_ret: 7510 free(bio->zone_report_buf); 7511 bio->zone_report_buf = NULL; 7512 bdev_nvme_io_complete(bio, ret); 7513 } 7514 7515 static void 7516 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7517 { 7518 struct nvme_bdev_io *bio = ref; 7519 7520 bdev_nvme_io_complete_nvme_status(bio, cpl); 7521 } 7522 7523 static void 7524 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7525 { 7526 struct nvme_bdev_io *bio = ctx; 7527 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7528 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7529 7530 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7531 7532 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7533 } 7534 7535 static void 7536 bdev_nvme_abort_complete(void *ctx) 7537 { 7538 struct nvme_bdev_io *bio = ctx; 7539 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7540 7541 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7542 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7543 } else { 7544 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7545 } 7546 } 7547 7548 static void 7549 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7550 { 7551 struct nvme_bdev_io *bio = ref; 7552 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7553 7554 bio->cpl = *cpl; 7555 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7556 } 7557 7558 static void 7559 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7560 { 7561 struct nvme_bdev_io *bio = ref; 7562 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7563 7564 bio->cpl = *cpl; 7565 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7566 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7567 } 7568 7569 static void 7570 bdev_nvme_queued_reset_sgl(void *ref, 
uint32_t sgl_offset) 7571 { 7572 struct nvme_bdev_io *bio = ref; 7573 struct iovec *iov; 7574 7575 bio->iov_offset = sgl_offset; 7576 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7577 iov = &bio->iovs[bio->iovpos]; 7578 if (bio->iov_offset < iov->iov_len) { 7579 break; 7580 } 7581 7582 bio->iov_offset -= iov->iov_len; 7583 } 7584 } 7585 7586 static int 7587 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7588 { 7589 struct nvme_bdev_io *bio = ref; 7590 struct iovec *iov; 7591 7592 assert(bio->iovpos < bio->iovcnt); 7593 7594 iov = &bio->iovs[bio->iovpos]; 7595 7596 *address = iov->iov_base; 7597 *length = iov->iov_len; 7598 7599 if (bio->iov_offset) { 7600 assert(bio->iov_offset <= iov->iov_len); 7601 *address += bio->iov_offset; 7602 *length -= bio->iov_offset; 7603 } 7604 7605 bio->iov_offset += *length; 7606 if (bio->iov_offset == iov->iov_len) { 7607 bio->iovpos++; 7608 bio->iov_offset = 0; 7609 } 7610 7611 return 0; 7612 } 7613 7614 static void 7615 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7616 { 7617 struct nvme_bdev_io *bio = ref; 7618 struct iovec *iov; 7619 7620 bio->fused_iov_offset = sgl_offset; 7621 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7622 iov = &bio->fused_iovs[bio->fused_iovpos]; 7623 if (bio->fused_iov_offset < iov->iov_len) { 7624 break; 7625 } 7626 7627 bio->fused_iov_offset -= iov->iov_len; 7628 } 7629 } 7630 7631 static int 7632 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7633 { 7634 struct nvme_bdev_io *bio = ref; 7635 struct iovec *iov; 7636 7637 assert(bio->fused_iovpos < bio->fused_iovcnt); 7638 7639 iov = &bio->fused_iovs[bio->fused_iovpos]; 7640 7641 *address = iov->iov_base; 7642 *length = iov->iov_len; 7643 7644 if (bio->fused_iov_offset) { 7645 assert(bio->fused_iov_offset <= iov->iov_len); 7646 *address += bio->fused_iov_offset; 7647 *length -= bio->fused_iov_offset; 7648 } 7649 7650 
bio->fused_iov_offset += *length; 7651 if (bio->fused_iov_offset == iov->iov_len) { 7652 bio->fused_iovpos++; 7653 bio->fused_iov_offset = 0; 7654 } 7655 7656 return 0; 7657 } 7658 7659 static int 7660 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7661 void *md, uint64_t lba_count, uint64_t lba) 7662 { 7663 int rc; 7664 7665 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7666 lba_count, lba); 7667 7668 bio->iovs = iov; 7669 bio->iovcnt = iovcnt; 7670 bio->iovpos = 0; 7671 bio->iov_offset = 0; 7672 7673 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7674 bio->io_path->qpair->qpair, 7675 lba, lba_count, 7676 bdev_nvme_no_pi_readv_done, bio, 0, 7677 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7678 md, 0, 0); 7679 7680 if (rc != 0 && rc != -ENOMEM) { 7681 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7682 } 7683 return rc; 7684 } 7685 7686 static int 7687 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7688 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7689 struct spdk_memory_domain *domain, void *domain_ctx, 7690 struct spdk_accel_sequence *seq) 7691 { 7692 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7693 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7694 int rc; 7695 7696 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7697 lba_count, lba); 7698 7699 bio->iovs = iov; 7700 bio->iovcnt = iovcnt; 7701 bio->iovpos = 0; 7702 bio->iov_offset = 0; 7703 7704 if (domain != NULL || seq != NULL) { 7705 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7706 bio->ext_opts.memory_domain = domain; 7707 bio->ext_opts.memory_domain_ctx = domain_ctx; 7708 bio->ext_opts.io_flags = flags; 7709 bio->ext_opts.metadata = md; 7710 bio->ext_opts.accel_sequence = seq; 7711 7712 if (iovcnt == 1) { 7713 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, 
bdev_nvme_readv_done, 7714 bio, &bio->ext_opts); 7715 } else { 7716 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7717 bdev_nvme_readv_done, bio, 7718 bdev_nvme_queued_reset_sgl, 7719 bdev_nvme_queued_next_sge, 7720 &bio->ext_opts); 7721 } 7722 } else if (iovcnt == 1) { 7723 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7724 md, lba, lba_count, bdev_nvme_readv_done, 7725 bio, flags, 0, 0); 7726 } else { 7727 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7728 bdev_nvme_readv_done, bio, flags, 7729 bdev_nvme_queued_reset_sgl, 7730 bdev_nvme_queued_next_sge, md, 0, 0); 7731 } 7732 7733 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7734 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7735 } 7736 return rc; 7737 } 7738 7739 static int 7740 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7741 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7742 struct spdk_memory_domain *domain, void *domain_ctx, 7743 struct spdk_accel_sequence *seq) 7744 { 7745 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7746 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7747 int rc; 7748 7749 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7750 lba_count, lba); 7751 7752 bio->iovs = iov; 7753 bio->iovcnt = iovcnt; 7754 bio->iovpos = 0; 7755 bio->iov_offset = 0; 7756 7757 if (domain != NULL || seq != NULL) { 7758 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7759 bio->ext_opts.memory_domain = domain; 7760 bio->ext_opts.memory_domain_ctx = domain_ctx; 7761 bio->ext_opts.io_flags = flags; 7762 bio->ext_opts.metadata = md; 7763 bio->ext_opts.accel_sequence = seq; 7764 7765 if (iovcnt == 1) { 7766 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7767 bio, &bio->ext_opts); 7768 } else { 7769 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7770 bdev_nvme_writev_done, bio, 7771 
bdev_nvme_queued_reset_sgl, 7772 bdev_nvme_queued_next_sge, 7773 &bio->ext_opts); 7774 } 7775 } else if (iovcnt == 1) { 7776 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7777 md, lba, lba_count, bdev_nvme_writev_done, 7778 bio, flags, 0, 0); 7779 } else { 7780 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7781 bdev_nvme_writev_done, bio, flags, 7782 bdev_nvme_queued_reset_sgl, 7783 bdev_nvme_queued_next_sge, md, 0, 0); 7784 } 7785 7786 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7787 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7788 } 7789 return rc; 7790 } 7791 7792 static int 7793 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7794 void *md, uint64_t lba_count, uint64_t zslba, 7795 uint32_t flags) 7796 { 7797 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7798 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7799 int rc; 7800 7801 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7802 lba_count, zslba); 7803 7804 bio->iovs = iov; 7805 bio->iovcnt = iovcnt; 7806 bio->iovpos = 0; 7807 bio->iov_offset = 0; 7808 7809 if (iovcnt == 1) { 7810 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7811 lba_count, 7812 bdev_nvme_zone_appendv_done, bio, 7813 flags, 7814 0, 0); 7815 } else { 7816 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7817 bdev_nvme_zone_appendv_done, bio, flags, 7818 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7819 md, 0, 0); 7820 } 7821 7822 if (rc != 0 && rc != -ENOMEM) { 7823 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7824 } 7825 return rc; 7826 } 7827 7828 static int 7829 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7830 void *md, uint64_t lba_count, uint64_t lba, 7831 uint32_t flags) 7832 { 7833 int rc; 7834 7835 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7836 lba_count, lba); 7837 
7838 bio->iovs = iov; 7839 bio->iovcnt = iovcnt; 7840 bio->iovpos = 0; 7841 bio->iov_offset = 0; 7842 7843 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7844 bio->io_path->qpair->qpair, 7845 lba, lba_count, 7846 bdev_nvme_comparev_done, bio, flags, 7847 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7848 md, 0, 0); 7849 7850 if (rc != 0 && rc != -ENOMEM) { 7851 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7852 } 7853 return rc; 7854 } 7855 7856 static int 7857 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7858 struct iovec *write_iov, int write_iovcnt, 7859 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7860 { 7861 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7862 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7863 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7864 int rc; 7865 7866 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7867 lba_count, lba); 7868 7869 bio->iovs = cmp_iov; 7870 bio->iovcnt = cmp_iovcnt; 7871 bio->iovpos = 0; 7872 bio->iov_offset = 0; 7873 bio->fused_iovs = write_iov; 7874 bio->fused_iovcnt = write_iovcnt; 7875 bio->fused_iovpos = 0; 7876 bio->fused_iov_offset = 0; 7877 7878 if (bdev_io->num_retries == 0) { 7879 bio->first_fused_submitted = false; 7880 bio->first_fused_completed = false; 7881 } 7882 7883 if (!bio->first_fused_submitted) { 7884 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7885 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7886 7887 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7888 bdev_nvme_comparev_and_writev_done, bio, flags, 7889 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7890 if (rc == 0) { 7891 bio->first_fused_submitted = true; 7892 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7893 } else { 7894 if (rc != -ENOMEM) { 7895 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7896 } 7897 return rc; 7898 } 7899 } 7900 7901 flags |= 
SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7902 7903 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7904 bdev_nvme_comparev_and_writev_done, bio, flags, 7905 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7906 if (rc != 0 && rc != -ENOMEM) { 7907 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7908 rc = 0; 7909 } 7910 7911 return rc; 7912 } 7913 7914 static int 7915 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7916 { 7917 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7918 struct spdk_nvme_dsm_range *range; 7919 uint64_t offset, remaining; 7920 uint64_t num_ranges_u64; 7921 uint16_t num_ranges; 7922 int rc; 7923 7924 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7925 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7926 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7927 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7928 return -EINVAL; 7929 } 7930 num_ranges = (uint16_t)num_ranges_u64; 7931 7932 offset = offset_blocks; 7933 remaining = num_blocks; 7934 range = &dsm_ranges[0]; 7935 7936 /* Fill max-size ranges until the remaining blocks fit into one range */ 7937 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7938 range->attributes.raw = 0; 7939 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7940 range->starting_lba = offset; 7941 7942 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7943 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7944 range++; 7945 } 7946 7947 /* Final range describes the remaining blocks */ 7948 range->attributes.raw = 0; 7949 range->length = remaining; 7950 range->starting_lba = offset; 7951 7952 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7953 bio->io_path->qpair->qpair, 7954 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7955 dsm_ranges, num_ranges, 7956 bdev_nvme_queued_done, bio); 7957 7958 
return rc; 7959 } 7960 7961 static int 7962 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7963 { 7964 if (num_blocks > UINT16_MAX + 1) { 7965 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7966 return -EINVAL; 7967 } 7968 7969 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7970 bio->io_path->qpair->qpair, 7971 offset_blocks, num_blocks, 7972 bdev_nvme_queued_done, bio, 7973 0); 7974 } 7975 7976 static int 7977 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7978 struct spdk_bdev_zone_info *info) 7979 { 7980 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7981 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7982 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7983 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7984 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7985 7986 if (zone_id % zone_size != 0) { 7987 return -EINVAL; 7988 } 7989 7990 if (num_zones > total_zones || !num_zones) { 7991 return -EINVAL; 7992 } 7993 7994 assert(!bio->zone_report_buf); 7995 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7996 if (!bio->zone_report_buf) { 7997 return -ENOMEM; 7998 } 7999 8000 bio->handled_zones = 0; 8001 8002 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8003 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8004 bdev_nvme_get_zone_info_done, bio); 8005 } 8006 8007 static int 8008 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8009 enum spdk_bdev_zone_action action) 8010 { 8011 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8012 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8013 8014 switch (action) { 8015 case SPDK_BDEV_ZONE_CLOSE: 8016 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8017 bdev_nvme_zone_management_done, bio); 8018 case SPDK_BDEV_ZONE_FINISH: 8019 return 
spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8020 bdev_nvme_zone_management_done, bio); 8021 case SPDK_BDEV_ZONE_OPEN: 8022 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8023 bdev_nvme_zone_management_done, bio); 8024 case SPDK_BDEV_ZONE_RESET: 8025 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8026 bdev_nvme_zone_management_done, bio); 8027 case SPDK_BDEV_ZONE_OFFLINE: 8028 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8029 bdev_nvme_zone_management_done, bio); 8030 default: 8031 return -EINVAL; 8032 } 8033 } 8034 8035 static void 8036 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8037 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8038 { 8039 struct nvme_io_path *io_path; 8040 struct nvme_ctrlr *nvme_ctrlr; 8041 uint32_t max_xfer_size; 8042 int rc = -ENXIO; 8043 8044 /* Choose the first ctrlr which is not failed. */ 8045 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8046 nvme_ctrlr = io_path->qpair->ctrlr; 8047 8048 /* We should skip any unavailable nvme_ctrlr rather than checking 8049 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8050 */ 8051 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8052 continue; 8053 } 8054 8055 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8056 8057 if (nbytes > max_xfer_size) { 8058 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8059 rc = -EINVAL; 8060 goto err; 8061 } 8062 8063 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8064 bdev_nvme_admin_passthru_done, bio); 8065 if (rc == 0) { 8066 return; 8067 } 8068 } 8069 8070 err: 8071 bdev_nvme_admin_complete(bio, rc); 8072 } 8073 8074 static int 8075 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8076 void *buf, size_t nbytes) 8077 { 8078 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8079 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8080 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8081 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8082 8083 if (nbytes > max_xfer_size) { 8084 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8085 return -EINVAL; 8086 } 8087 8088 /* 8089 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8090 * so fill it out automatically. 
8091 */ 8092 cmd->nsid = spdk_nvme_ns_get_id(ns); 8093 8094 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8095 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8096 } 8097 8098 static int 8099 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8100 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8101 { 8102 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8103 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8104 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8105 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8106 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8107 8108 if (nbytes > max_xfer_size) { 8109 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8110 return -EINVAL; 8111 } 8112 8113 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8114 SPDK_ERRLOG("invalid meta data buffer size\n"); 8115 return -EINVAL; 8116 } 8117 8118 /* 8119 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8120 * so fill it out automatically. 
8121 */ 8122 cmd->nsid = spdk_nvme_ns_get_id(ns); 8123 8124 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8125 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8126 } 8127 8128 static int 8129 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8130 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8131 size_t nbytes, void *md_buf, size_t md_len) 8132 { 8133 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8134 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8135 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8136 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8137 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8138 8139 bio->iovs = iov; 8140 bio->iovcnt = iovcnt; 8141 bio->iovpos = 0; 8142 bio->iov_offset = 0; 8143 8144 if (nbytes > max_xfer_size) { 8145 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8146 return -EINVAL; 8147 } 8148 8149 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8150 SPDK_ERRLOG("invalid meta data buffer size\n"); 8151 return -EINVAL; 8152 } 8153 8154 /* 8155 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8156 * require a nsid, so fill it out automatically. 
8157 */ 8158 cmd->nsid = spdk_nvme_ns_get_id(ns); 8159 8160 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8161 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8162 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8163 } 8164 8165 static void 8166 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8167 struct nvme_bdev_io *bio_to_abort) 8168 { 8169 struct nvme_io_path *io_path; 8170 int rc = 0; 8171 8172 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8173 if (rc == 0) { 8174 bdev_nvme_admin_complete(bio, 0); 8175 return; 8176 } 8177 8178 io_path = bio_to_abort->io_path; 8179 if (io_path != NULL) { 8180 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8181 io_path->qpair->qpair, 8182 bio_to_abort, 8183 bdev_nvme_abort_done, bio); 8184 } else { 8185 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8186 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8187 NULL, 8188 bio_to_abort, 8189 bdev_nvme_abort_done, bio); 8190 8191 if (rc != -ENOENT) { 8192 break; 8193 } 8194 } 8195 } 8196 8197 if (rc != 0) { 8198 /* If no command was found or there was any error, complete the abort 8199 * request with failure. 
8200 */ 8201 bdev_nvme_admin_complete(bio, rc); 8202 } 8203 } 8204 8205 static int 8206 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8207 uint64_t num_blocks) 8208 { 8209 struct spdk_nvme_scc_source_range range = { 8210 .slba = src_offset_blocks, 8211 .nlb = num_blocks - 1 8212 }; 8213 8214 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8215 bio->io_path->qpair->qpair, 8216 &range, 1, dst_offset_blocks, 8217 bdev_nvme_queued_done, bio); 8218 } 8219 8220 static void 8221 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8222 { 8223 const char *action; 8224 uint32_t i; 8225 8226 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8227 action = "reset"; 8228 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8229 action = "abort"; 8230 } else { 8231 action = "none"; 8232 } 8233 8234 spdk_json_write_object_begin(w); 8235 8236 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8237 8238 spdk_json_write_named_object_begin(w, "params"); 8239 spdk_json_write_named_string(w, "action_on_timeout", action); 8240 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8241 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8242 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8243 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8244 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8245 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8246 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8247 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8248 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8249 spdk_json_write_named_uint32(w, 
"io_queue_requests", g_opts.io_queue_requests); 8250 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8251 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8252 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8253 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8254 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8255 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8256 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8257 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8258 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8259 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8260 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8261 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8262 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8263 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8264 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8265 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8266 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8267 for (i = 0; i < 32; ++i) { 8268 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8269 spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8270 } 8271 } 8272 spdk_json_write_array_end(w); 8273 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8274 for (i = 0; i < 32; ++i) { 8275 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8276 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8277 } 8278 } 8279 8280 spdk_json_write_array_end(w); 8281 spdk_json_write_object_end(w); 8282 8283 
spdk_json_write_object_end(w); 8284 } 8285 8286 static void 8287 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8288 { 8289 struct spdk_nvme_transport_id trid; 8290 8291 spdk_json_write_object_begin(w); 8292 8293 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8294 8295 spdk_json_write_named_object_begin(w, "params"); 8296 spdk_json_write_named_string(w, "name", ctx->name); 8297 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8298 8299 trid = ctx->trid; 8300 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8301 nvme_bdev_dump_trid_json(&trid, w); 8302 8303 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8304 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8305 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8306 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8307 ctx->bdev_opts.fast_io_fail_timeout_sec); 8308 spdk_json_write_object_end(w); 8309 8310 spdk_json_write_object_end(w); 8311 } 8312 8313 #ifdef SPDK_CONFIG_NVME_CUSE 8314 static void 8315 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8316 struct nvme_ctrlr *nvme_ctrlr) 8317 { 8318 size_t cuse_name_size = 128; 8319 char cuse_name[cuse_name_size]; 8320 8321 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8322 cuse_name, &cuse_name_size) != 0) { 8323 return; 8324 } 8325 8326 spdk_json_write_object_begin(w); 8327 8328 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8329 8330 spdk_json_write_named_object_begin(w, "params"); 8331 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8332 spdk_json_write_object_end(w); 8333 8334 spdk_json_write_object_end(w); 8335 } 8336 #endif 8337 8338 static void 8339 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8340 struct nvme_ctrlr *nvme_ctrlr) 8341 { 8342 struct spdk_nvme_transport_id *trid; 8343 const struct 
spdk_nvme_ctrlr_opts *opts; 8344 8345 if (nvme_ctrlr->opts.from_discovery_service) { 8346 /* Do not emit an RPC for this - it will be implicitly 8347 * covered by a separate bdev_nvme_start_discovery or 8348 * bdev_nvme_start_mdns_discovery RPC. 8349 */ 8350 return; 8351 } 8352 8353 trid = &nvme_ctrlr->active_path_id->trid; 8354 8355 spdk_json_write_object_begin(w); 8356 8357 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8358 8359 spdk_json_write_named_object_begin(w, "params"); 8360 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8361 nvme_bdev_dump_trid_json(trid, w); 8362 spdk_json_write_named_bool(w, "prchk_reftag", 8363 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8364 spdk_json_write_named_bool(w, "prchk_guard", 8365 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8366 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8367 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8368 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8369 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8370 if (nvme_ctrlr->psk != NULL) { 8371 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8372 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8373 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8374 } 8375 8376 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8377 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8378 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8379 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8380 if (opts->src_addr[0] != '\0') { 8381 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8382 } 8383 if (opts->src_svcid[0] != '\0') { 8384 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8385 } 8386 8387 spdk_json_write_object_end(w); 8388 8389 
spdk_json_write_object_end(w); 8390 } 8391 8392 static void 8393 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8394 { 8395 spdk_json_write_object_begin(w); 8396 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8397 8398 spdk_json_write_named_object_begin(w, "params"); 8399 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8400 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8401 spdk_json_write_object_end(w); 8402 8403 spdk_json_write_object_end(w); 8404 } 8405 8406 static int 8407 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8408 { 8409 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8410 struct nvme_ctrlr *nvme_ctrlr; 8411 struct discovery_ctx *ctx; 8412 8413 bdev_nvme_opts_config_json(w); 8414 8415 pthread_mutex_lock(&g_bdev_nvme_mutex); 8416 8417 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8418 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8419 nvme_ctrlr_config_json(w, nvme_ctrlr); 8420 8421 #ifdef SPDK_CONFIG_NVME_CUSE 8422 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8423 #endif 8424 } 8425 } 8426 8427 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8428 if (!ctx->from_mdns_discovery_service) { 8429 bdev_nvme_discovery_config_json(w, ctx); 8430 } 8431 } 8432 8433 bdev_nvme_mdns_discovery_config_json(w); 8434 8435 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8436 * before enabling hotplug poller. 
8437 */ 8438 bdev_nvme_hotplug_config_json(w); 8439 8440 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8441 return 0; 8442 } 8443 8444 struct spdk_nvme_ctrlr * 8445 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8446 { 8447 struct nvme_bdev *nbdev; 8448 struct nvme_ns *nvme_ns; 8449 8450 if (!bdev || bdev->module != &nvme_if) { 8451 return NULL; 8452 } 8453 8454 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8455 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8456 assert(nvme_ns != NULL); 8457 8458 return nvme_ns->ctrlr->ctrlr; 8459 } 8460 8461 void 8462 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8463 { 8464 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8465 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8466 const struct spdk_nvme_ctrlr_data *cdata; 8467 const struct spdk_nvme_transport_id *trid; 8468 const char *adrfam_str; 8469 8470 spdk_json_write_object_begin(w); 8471 8472 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8473 8474 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8475 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8476 8477 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8478 spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL && 8479 io_path == io_path->nbdev_ch->current_io_path); 8480 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8481 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8482 8483 spdk_json_write_named_object_begin(w, "transport"); 8484 spdk_json_write_named_string(w, "trtype", trid->trstring); 8485 spdk_json_write_named_string(w, "traddr", trid->traddr); 8486 if (trid->trsvcid[0] != '\0') { 8487 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8488 } 8489 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8490 if (adrfam_str) { 8491 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8492 } 8493 spdk_json_write_object_end(w); 8494 
8495 spdk_json_write_object_end(w); 8496 } 8497 8498 void 8499 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8500 { 8501 struct discovery_ctx *ctx; 8502 struct discovery_entry_ctx *entry_ctx; 8503 8504 spdk_json_write_array_begin(w); 8505 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8506 spdk_json_write_object_begin(w); 8507 spdk_json_write_named_string(w, "name", ctx->name); 8508 8509 spdk_json_write_named_object_begin(w, "trid"); 8510 nvme_bdev_dump_trid_json(&ctx->trid, w); 8511 spdk_json_write_object_end(w); 8512 8513 spdk_json_write_named_array_begin(w, "referrals"); 8514 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8515 spdk_json_write_object_begin(w); 8516 spdk_json_write_named_object_begin(w, "trid"); 8517 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8518 spdk_json_write_object_end(w); 8519 spdk_json_write_object_end(w); 8520 } 8521 spdk_json_write_array_end(w); 8522 8523 spdk_json_write_object_end(w); 8524 } 8525 spdk_json_write_array_end(w); 8526 } 8527 8528 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8529 8530 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8531 { 8532 struct spdk_trace_tpoint_opts opts[] = { 8533 { 8534 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8535 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 8536 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8537 }, 8538 { 8539 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8540 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 8541 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8542 } 8543 }; 8544 8545 8546 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8547 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8548 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8549 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8550 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8551 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, 
OBJECT_BDEV_NVME_IO, 0); 8552 } 8553