/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** Array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Array of iovecs for the second command of a fused compare-and-write. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current position in fused_iovs. */
	int fused_iovpos;

	/** Offset in current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Tracks whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Tracks how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;

	/* Used to put nvme_bdev_io into the list */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}
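/* nvme_ns_cmp() follows the negative/zero/positive contract expected by the
 * sys/tree.h RB_* macros: (ns1->id > ns2->id) evaluates to 0 or 1 in C, so the
 * expression as a whole returns -1, 0 or 1. Illustrative values only:
 *
 *   id 1 vs id 2 -> -1 (descend left)
 *   id 2 vs id 2 ->  0 (match)
 *   id 3 vs id 2 ->  1 (descend right)
 */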
RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}
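/* A typical way to walk every active namespace with the two helpers above
 * (sketch only; the caller is responsible for whatever locking its context
 * requires):
 *
 *   struct nvme_ns *nvme_ns;
 *
 *   for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
 *        nvme_ns != NULL;
 *        nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
 *           ... inspect or populate nvme_ns ...
 *   }
 */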
static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);
	spdk_keyring_put_key(nvme_ctrlr->psk);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
spdk_bdev_module_fini_done(); 489 return; 490 } 491 pthread_mutex_unlock(&g_bdev_nvme_mutex); 492 } 493 494 static int 495 nvme_detach_poller(void *arg) 496 { 497 struct nvme_ctrlr *nvme_ctrlr = arg; 498 int rc; 499 500 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 501 if (rc != -EAGAIN) { 502 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 503 _nvme_ctrlr_delete(nvme_ctrlr); 504 } 505 506 return SPDK_POLLER_BUSY; 507 } 508 509 static void 510 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 511 { 512 int rc; 513 514 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 515 516 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 517 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 518 519 /* If we got here, the reset/detach poller cannot be active */ 520 assert(nvme_ctrlr->reset_detach_poller == NULL); 521 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 522 nvme_ctrlr, 1000); 523 if (nvme_ctrlr->reset_detach_poller == NULL) { 524 SPDK_ERRLOG("Failed to register detach poller\n"); 525 goto error; 526 } 527 528 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 529 if (rc != 0) { 530 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 531 goto error; 532 } 533 534 return; 535 error: 536 /* We don't have a good way to handle errors here, so just do what we can and delete the 537 * controller without detaching the underlying NVMe device. 538 */ 539 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 540 _nvme_ctrlr_delete(nvme_ctrlr); 541 } 542 543 static void 544 nvme_ctrlr_unregister_cb(void *io_device) 545 { 546 struct nvme_ctrlr *nvme_ctrlr = io_device; 547 548 nvme_ctrlr_delete(nvme_ctrlr); 549 } 550 551 static void 552 nvme_ctrlr_unregister(void *ctx) 553 { 554 struct nvme_ctrlr *nvme_ctrlr = ctx; 555 556 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 557 } 558 559 static bool 560 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 561 { 562 if (!nvme_ctrlr->destruct) { 563 return false; 564 } 565 566 if (nvme_ctrlr->ref > 0) { 567 return false; 568 } 569 570 if (nvme_ctrlr->resetting) { 571 return false; 572 } 573 574 if (nvme_ctrlr->ana_log_page_updating) { 575 return false; 576 } 577 578 if (nvme_ctrlr->io_path_cache_clearing) { 579 return false; 580 } 581 582 return true; 583 } 584 585 static void 586 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 587 { 588 pthread_mutex_lock(&nvme_ctrlr->mutex); 589 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 590 591 assert(nvme_ctrlr->ref > 0); 592 nvme_ctrlr->ref--; 593 594 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 595 pthread_mutex_unlock(&nvme_ctrlr->mutex); 596 return; 597 } 598 599 pthread_mutex_unlock(&nvme_ctrlr->mutex); 600 601 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 602 } 603 604 static void 605 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 606 { 607 nbdev_ch->current_io_path = NULL; 608 nbdev_ch->rr_counter = 0; 609 } 610 611 static struct nvme_io_path * 612 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 613 { 614 struct nvme_io_path *io_path; 615 616 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 617 if (io_path->nvme_ns == nvme_ns) { 618 break; 619 } 620 } 621 622 return io_path; 623 } 624 625 static struct nvme_io_path * 626 nvme_io_path_alloc(void) 627 { 628 struct nvme_io_path *io_path; 629 630 io_path = 
calloc(1, sizeof(*io_path)); 631 if (io_path == NULL) { 632 SPDK_ERRLOG("Failed to alloc io_path.\n"); 633 return NULL; 634 } 635 636 if (g_opts.io_path_stat) { 637 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 638 if (io_path->stat == NULL) { 639 free(io_path); 640 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 641 return NULL; 642 } 643 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 644 } 645 646 return io_path; 647 } 648 649 static void 650 nvme_io_path_free(struct nvme_io_path *io_path) 651 { 652 free(io_path->stat); 653 free(io_path); 654 } 655 656 static int 657 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 658 { 659 struct nvme_io_path *io_path; 660 struct spdk_io_channel *ch; 661 struct nvme_ctrlr_channel *ctrlr_ch; 662 struct nvme_qpair *nvme_qpair; 663 664 io_path = nvme_io_path_alloc(); 665 if (io_path == NULL) { 666 return -ENOMEM; 667 } 668 669 io_path->nvme_ns = nvme_ns; 670 671 ch = spdk_get_io_channel(nvme_ns->ctrlr); 672 if (ch == NULL) { 673 nvme_io_path_free(io_path); 674 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 675 return -ENOMEM; 676 } 677 678 ctrlr_ch = spdk_io_channel_get_ctx(ch); 679 680 nvme_qpair = ctrlr_ch->qpair; 681 assert(nvme_qpair != NULL); 682 683 io_path->qpair = nvme_qpair; 684 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 685 686 io_path->nbdev_ch = nbdev_ch; 687 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 688 689 bdev_nvme_clear_current_io_path(nbdev_ch); 690 691 return 0; 692 } 693 694 static void 695 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 696 struct nvme_io_path *io_path) 697 { 698 struct nvme_bdev_io *bio; 699 700 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 701 if (bio->io_path == io_path) { 702 bio->io_path = NULL; 703 } 704 } 705 } 706 707 static void 708 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 709 { 710 struct spdk_io_channel *ch; 711 struct nvme_qpair *nvme_qpair; 712 struct nvme_ctrlr_channel *ctrlr_ch; 713 struct nvme_bdev *nbdev; 714 715 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 716 717 /* Add the statistics to nvme_ns before this path is destroyed. */ 718 pthread_mutex_lock(&nbdev->mutex); 719 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 720 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 721 } 722 pthread_mutex_unlock(&nbdev->mutex); 723 724 bdev_nvme_clear_current_io_path(nbdev_ch); 725 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 726 727 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 728 io_path->nbdev_ch = NULL; 729 730 nvme_qpair = io_path->qpair; 731 assert(nvme_qpair != NULL); 732 733 ctrlr_ch = nvme_qpair->ctrlr_ch; 734 assert(ctrlr_ch != NULL); 735 736 ch = spdk_io_channel_from_ctx(ctrlr_ch); 737 spdk_put_io_channel(ch); 738 739 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 740 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 741 * io_path here but free the io_path when the associated qpair is freed. It is ensured 742 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
743 */ 744 } 745 746 static void 747 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 748 { 749 struct nvme_io_path *io_path, *tmp_io_path; 750 751 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 752 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 753 } 754 } 755 756 static int 757 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 758 { 759 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 760 struct nvme_bdev *nbdev = io_device; 761 struct nvme_ns *nvme_ns; 762 int rc; 763 764 STAILQ_INIT(&nbdev_ch->io_path_list); 765 TAILQ_INIT(&nbdev_ch->retry_io_list); 766 767 pthread_mutex_lock(&nbdev->mutex); 768 769 nbdev_ch->mp_policy = nbdev->mp_policy; 770 nbdev_ch->mp_selector = nbdev->mp_selector; 771 nbdev_ch->rr_min_io = nbdev->rr_min_io; 772 773 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 774 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 775 if (rc != 0) { 776 pthread_mutex_unlock(&nbdev->mutex); 777 778 _bdev_nvme_delete_io_paths(nbdev_ch); 779 return rc; 780 } 781 } 782 pthread_mutex_unlock(&nbdev->mutex); 783 784 return 0; 785 } 786 787 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 788 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 789 */ 790 static inline void 791 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 792 const struct spdk_nvme_cpl *cpl) 793 { 794 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 795 (uintptr_t)bdev_io); 796 if (cpl) { 797 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 798 } else { 799 spdk_bdev_io_complete(bdev_io, status); 800 } 801 } 802 803 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 804 805 static void 806 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 807 { 808 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 809 810 bdev_nvme_abort_retry_ios(nbdev_ch); 811 _bdev_nvme_delete_io_paths(nbdev_ch); 812 } 813 814 static inline bool 815 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 816 { 817 switch (io_type) { 818 case SPDK_BDEV_IO_TYPE_RESET: 819 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 820 case SPDK_BDEV_IO_TYPE_ABORT: 821 return true; 822 default: 823 break; 824 } 825 826 return false; 827 } 828 829 static inline bool 830 nvme_ns_is_active(struct nvme_ns *nvme_ns) 831 { 832 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 833 return false; 834 } 835 836 if (spdk_unlikely(nvme_ns->ns == NULL)) { 837 return false; 838 } 839 840 return true; 841 } 842 843 static inline bool 844 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 845 { 846 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 847 return false; 848 } 849 850 switch (nvme_ns->ana_state) { 851 case SPDK_NVME_ANA_OPTIMIZED_STATE: 852 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 853 return true; 854 default: 855 break; 856 } 857 858 return false; 859 } 860 861 static inline bool 862 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 863 { 864 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 865 return false; 866 } 867 868 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 869 SPDK_NVME_QPAIR_FAILURE_NONE)) { 870 return false; 871 } 872 873 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 874 return false; 875 } 876 877 return true; 878 } 879 880 static inline bool 881 nvme_io_path_is_available(struct nvme_io_path *io_path) 882 { 883 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 
884 return false; 885 } 886 887 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 888 return false; 889 } 890 891 return true; 892 } 893 894 static inline bool 895 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 896 { 897 if (nvme_ctrlr->destruct) { 898 return true; 899 } 900 901 if (nvme_ctrlr->fast_io_fail_timedout) { 902 return true; 903 } 904 905 if (nvme_ctrlr->resetting) { 906 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 907 return false; 908 } else { 909 return true; 910 } 911 } 912 913 if (nvme_ctrlr->reconnect_is_delayed) { 914 return false; 915 } 916 917 if (nvme_ctrlr->disabled) { 918 return true; 919 } 920 921 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 922 return true; 923 } else { 924 return false; 925 } 926 } 927 928 static bool 929 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 930 { 931 if (nvme_ctrlr->destruct) { 932 return false; 933 } 934 935 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 936 return false; 937 } 938 939 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 940 return false; 941 } 942 943 if (nvme_ctrlr->disabled) { 944 return false; 945 } 946 947 return true; 948 } 949 950 /* Simulate circular linked list. */ 951 static inline struct nvme_io_path * 952 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 953 { 954 struct nvme_io_path *next_path; 955 956 if (prev_path != NULL) { 957 next_path = STAILQ_NEXT(prev_path, stailq); 958 if (next_path != NULL) { 959 return next_path; 960 } 961 } 962 963 return STAILQ_FIRST(&nbdev_ch->io_path_list); 964 } 965 966 static struct nvme_io_path * 967 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 968 { 969 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 970 971 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 972 973 io_path = start; 974 do { 975 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 976 nvme_ns_is_active(io_path->nvme_ns))) { 977 switch (io_path->nvme_ns->ana_state) { 978 case SPDK_NVME_ANA_OPTIMIZED_STATE: 979 nbdev_ch->current_io_path = io_path; 980 return io_path; 981 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 982 if (non_optimized == NULL) { 983 non_optimized = io_path; 984 } 985 break; 986 default: 987 break; 988 } 989 } 990 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 991 } while (io_path != start); 992 993 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 994 /* We come here only if there is no optimized path. Cache even non_optimized 995 * path for load balance across multiple non_optimized paths. 996 */ 997 nbdev_ch->current_io_path = non_optimized; 998 } 999 1000 return non_optimized; 1001 } 1002 1003 static struct nvme_io_path * 1004 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1005 { 1006 struct nvme_io_path *io_path; 1007 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1008 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1009 uint32_t num_outstanding_reqs; 1010 1011 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1012 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1013 /* The device is currently resetting. 
			 */
			continue;
		}

		if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}
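/* Path selection summary (descriptive only): a cached nbdev_ch->current_io_path
 * is the fast path. For active_passive it is reused unconditionally; for the
 * round_robin selector it is reused until rr_min_io I/Os have been issued on it,
 * after which the list is re-scanned. The scan prefers ANA optimized paths over
 * non-optimized ones, and the queue_depth selector additionally picks the
 * candidate with the fewest outstanding requests without caching the result.
 */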
/* Return true if there is any io_path whose qpair is active or whose ctrlr is not
 * failed, or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be inaccessible now but may become accessible later.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr is
 * likely to be resetting now, and that reset may still succeed. A ctrlr is cleared to
 * the unfailed state when a reset starts and is marked failed only if that reset fails.
 * Hence, an unfailed ctrlr is either working fine or in the middle of a reset.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct nvme_bdev_io *bio, *tmp_bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);

		bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bio != NULL) {
		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
					   retry_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head.
*/ 1163 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1164 1165 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1166 1167 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1168 delay_ms * 1000ULL); 1169 } 1170 1171 static void 1172 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1173 { 1174 struct nvme_bdev_io *bio, *tmp_bio; 1175 1176 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1177 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1178 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1179 } 1180 1181 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1182 } 1183 1184 static int 1185 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1186 struct nvme_bdev_io *bio_to_abort) 1187 { 1188 struct nvme_bdev_io *bio; 1189 1190 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1191 if (bio == bio_to_abort) { 1192 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1193 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1194 return 0; 1195 } 1196 } 1197 1198 return -ENOENT; 1199 } 1200 1201 static void 1202 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1203 { 1204 struct nvme_bdev *nbdev; 1205 uint16_t sct, sc; 1206 1207 assert(spdk_nvme_cpl_is_error(cpl)); 1208 1209 nbdev = bdev_io->bdev->ctxt; 1210 1211 if (nbdev->err_stat == NULL) { 1212 return; 1213 } 1214 1215 sct = cpl->status.sct; 1216 sc = cpl->status.sc; 1217 1218 pthread_mutex_lock(&nbdev->mutex); 1219 1220 nbdev->err_stat->status_type[sct]++; 1221 switch (sct) { 1222 case SPDK_NVME_SCT_GENERIC: 1223 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1224 case SPDK_NVME_SCT_MEDIA_ERROR: 1225 case SPDK_NVME_SCT_PATH: 1226 nbdev->err_stat->status[sct][sc]++; 1227 break; 1228 default: 1229 break; 1230 } 1231 1232 pthread_mutex_unlock(&nbdev->mutex); 1233 } 1234 1235 static inline void 1236 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1237 { 1238 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1239 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1240 uint32_t blocklen = bdev_io->bdev->blocklen; 1241 struct spdk_bdev_io_stat *stat; 1242 uint64_t tsc_diff; 1243 1244 if (bio->io_path->stat == NULL) { 1245 return; 1246 } 1247 1248 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1249 stat = bio->io_path->stat; 1250 1251 switch (bdev_io->type) { 1252 case SPDK_BDEV_IO_TYPE_READ: 1253 stat->bytes_read += num_blocks * blocklen; 1254 stat->num_read_ops++; 1255 stat->read_latency_ticks += tsc_diff; 1256 if (stat->max_read_latency_ticks < tsc_diff) { 1257 stat->max_read_latency_ticks = tsc_diff; 1258 } 1259 if (stat->min_read_latency_ticks > tsc_diff) { 1260 stat->min_read_latency_ticks = tsc_diff; 1261 } 1262 break; 1263 case SPDK_BDEV_IO_TYPE_WRITE: 1264 stat->bytes_written += num_blocks * blocklen; 1265 stat->num_write_ops++; 1266 stat->write_latency_ticks += tsc_diff; 1267 if (stat->max_write_latency_ticks < tsc_diff) { 1268 stat->max_write_latency_ticks = tsc_diff; 1269 } 1270 if (stat->min_write_latency_ticks > tsc_diff) { 1271 stat->min_write_latency_ticks = tsc_diff; 1272 } 1273 break; 1274 case SPDK_BDEV_IO_TYPE_UNMAP: 1275 stat->bytes_unmapped += num_blocks * blocklen; 1276 stat->num_unmap_ops++; 1277 stat->unmap_latency_ticks += tsc_diff; 1278 if (stat->max_unmap_latency_ticks < tsc_diff) { 1279 stat->max_unmap_latency_ticks = tsc_diff; 1280 } 1281 if (stat->min_unmap_latency_ticks > 
tsc_diff) { 1282 stat->min_unmap_latency_ticks = tsc_diff; 1283 } 1284 break; 1285 case SPDK_BDEV_IO_TYPE_ZCOPY: 1286 /* Track the data in the start phase only */ 1287 if (!bdev_io->u.bdev.zcopy.start) { 1288 break; 1289 } 1290 if (bdev_io->u.bdev.zcopy.populate) { 1291 stat->bytes_read += num_blocks * blocklen; 1292 stat->num_read_ops++; 1293 stat->read_latency_ticks += tsc_diff; 1294 if (stat->max_read_latency_ticks < tsc_diff) { 1295 stat->max_read_latency_ticks = tsc_diff; 1296 } 1297 if (stat->min_read_latency_ticks > tsc_diff) { 1298 stat->min_read_latency_ticks = tsc_diff; 1299 } 1300 } else { 1301 stat->bytes_written += num_blocks * blocklen; 1302 stat->num_write_ops++; 1303 stat->write_latency_ticks += tsc_diff; 1304 if (stat->max_write_latency_ticks < tsc_diff) { 1305 stat->max_write_latency_ticks = tsc_diff; 1306 } 1307 if (stat->min_write_latency_ticks > tsc_diff) { 1308 stat->min_write_latency_ticks = tsc_diff; 1309 } 1310 } 1311 break; 1312 case SPDK_BDEV_IO_TYPE_COPY: 1313 stat->bytes_copied += num_blocks * blocklen; 1314 stat->num_copy_ops++; 1315 stat->copy_latency_ticks += tsc_diff; 1316 if (stat->max_copy_latency_ticks < tsc_diff) { 1317 stat->max_copy_latency_ticks = tsc_diff; 1318 } 1319 if (stat->min_copy_latency_ticks > tsc_diff) { 1320 stat->min_copy_latency_ticks = tsc_diff; 1321 } 1322 break; 1323 default: 1324 break; 1325 } 1326 } 1327 1328 static bool 1329 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1330 const struct spdk_nvme_cpl *cpl, 1331 struct nvme_bdev_channel *nbdev_ch, 1332 uint64_t *_delay_ms) 1333 { 1334 struct nvme_io_path *io_path = bio->io_path; 1335 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1336 const struct spdk_nvme_ctrlr_data *cdata; 1337 1338 if (spdk_nvme_cpl_is_path_error(cpl) || 1339 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1340 !nvme_io_path_is_available(io_path) || 1341 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1342 bdev_nvme_clear_current_io_path(nbdev_ch); 1343 bio->io_path = NULL; 1344 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1345 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1346 io_path->nvme_ns->ana_state_updating = true; 1347 } 1348 } 1349 if (!any_io_path_may_become_available(nbdev_ch)) { 1350 return false; 1351 } 1352 *_delay_ms = 0; 1353 } else { 1354 bio->retry_count++; 1355 1356 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1357 1358 if (cpl->status.crd != 0) { 1359 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1360 } else { 1361 *_delay_ms = 0; 1362 } 1363 } 1364 1365 return true; 1366 } 1367 1368 static inline void 1369 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1370 const struct spdk_nvme_cpl *cpl) 1371 { 1372 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1373 struct nvme_bdev_channel *nbdev_ch; 1374 uint64_t delay_ms; 1375 1376 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1377 1378 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1379 bdev_nvme_update_io_path_stat(bio); 1380 goto complete; 1381 } 1382 1383 /* Update error counts before deciding if retry is needed. 1384 * Hence, error counts may be more than the number of I/O errors. 
1385 */ 1386 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1387 1388 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1389 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1390 goto complete; 1391 } 1392 1393 /* At this point we don't know whether the sequence was successfully executed or not, so we 1394 * cannot retry the IO */ 1395 if (bdev_io->u.bdev.accel_sequence != NULL) { 1396 goto complete; 1397 } 1398 1399 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1400 1401 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1402 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1403 return; 1404 } 1405 1406 complete: 1407 bio->retry_count = 0; 1408 bio->submit_tsc = 0; 1409 bdev_io->u.bdev.accel_sequence = NULL; 1410 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1411 } 1412 1413 static inline void 1414 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1415 { 1416 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1417 struct nvme_bdev_channel *nbdev_ch; 1418 enum spdk_bdev_io_status io_status; 1419 1420 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1421 1422 switch (rc) { 1423 case 0: 1424 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1425 break; 1426 case -ENOMEM: 1427 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1428 break; 1429 case -ENXIO: 1430 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1431 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1432 1433 bdev_nvme_clear_current_io_path(nbdev_ch); 1434 bio->io_path = NULL; 1435 1436 if (any_io_path_may_become_available(nbdev_ch)) { 1437 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1438 return; 1439 } 1440 } 1441 1442 /* fallthrough */ 1443 default: 1444 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1445 bdev_io->u.bdev.accel_sequence = NULL; 1446 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1447 break; 1448 } 1449 1450 bio->retry_count = 0; 1451 bio->submit_tsc = 0; 1452 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1453 } 1454 1455 static inline void 1456 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1457 { 1458 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1459 enum spdk_bdev_io_status io_status; 1460 1461 switch (rc) { 1462 case 0: 1463 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1464 break; 1465 case -ENOMEM: 1466 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1467 break; 1468 case -ENXIO: 1469 /* fallthrough */ 1470 default: 1471 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1472 break; 1473 } 1474 1475 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1476 } 1477 1478 static void 1479 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1480 { 1481 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1482 1483 pthread_mutex_lock(&nvme_ctrlr->mutex); 1484 1485 assert(nvme_ctrlr->io_path_cache_clearing == true); 1486 nvme_ctrlr->io_path_cache_clearing = false; 1487 1488 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1489 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1490 return; 1491 } 1492 1493 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1494 1495 nvme_ctrlr_unregister(nvme_ctrlr); 1496 } 1497 1498 static void 1499 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1500 { 1501 struct nvme_io_path *io_path; 1502 1503 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1504 if (io_path->nbdev_ch == NULL) { 1505 continue; 1506 } 1507 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1508 
} 1509 } 1510 1511 static void 1512 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1513 { 1514 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1515 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1516 1517 assert(ctrlr_ch->qpair != NULL); 1518 1519 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1520 1521 spdk_for_each_channel_continue(i, 0); 1522 } 1523 1524 static void 1525 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1526 { 1527 pthread_mutex_lock(&nvme_ctrlr->mutex); 1528 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1529 nvme_ctrlr->io_path_cache_clearing) { 1530 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1531 return; 1532 } 1533 1534 nvme_ctrlr->io_path_cache_clearing = true; 1535 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1536 1537 spdk_for_each_channel(nvme_ctrlr, 1538 bdev_nvme_clear_io_path_cache, 1539 NULL, 1540 bdev_nvme_clear_io_path_caches_done); 1541 } 1542 1543 static struct nvme_qpair * 1544 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1545 { 1546 struct nvme_qpair *nvme_qpair; 1547 1548 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1549 if (nvme_qpair->qpair == qpair) { 1550 break; 1551 } 1552 } 1553 1554 return nvme_qpair; 1555 } 1556 1557 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1558 1559 static void 1560 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1561 { 1562 struct nvme_poll_group *group = poll_group_ctx; 1563 struct nvme_qpair *nvme_qpair; 1564 struct nvme_ctrlr_channel *ctrlr_ch; 1565 int status; 1566 1567 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1568 if (nvme_qpair == NULL) { 1569 return; 1570 } 1571 1572 if (nvme_qpair->qpair != NULL) { 1573 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1574 nvme_qpair->qpair = NULL; 1575 } 1576 1577 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1578 1579 ctrlr_ch = nvme_qpair->ctrlr_ch; 1580 1581 if (ctrlr_ch != NULL) { 1582 if (ctrlr_ch->reset_iter != NULL) { 1583 /* We are in a full reset sequence. */ 1584 if (ctrlr_ch->connect_poller != NULL) { 1585 /* qpair was failed to connect. Abort the reset sequence. */ 1586 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1587 qpair); 1588 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1589 status = -1; 1590 } else { 1591 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1592 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1593 qpair); 1594 status = 0; 1595 } 1596 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1597 ctrlr_ch->reset_iter = NULL; 1598 } else { 1599 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1600 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1601 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1602 } 1603 } else { 1604 /* In this case, ctrlr_channel is already deleted. */ 1605 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1606 nvme_qpair_delete(nvme_qpair); 1607 } 1608 } 1609 1610 static void 1611 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1612 { 1613 struct nvme_qpair *nvme_qpair; 1614 1615 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1616 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1617 continue; 1618 } 1619 1620 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1621 SPDK_NVME_QPAIR_FAILURE_NONE) { 1622 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1623 } 1624 } 1625 } 1626 1627 static int 1628 bdev_nvme_poll(void *arg) 1629 { 1630 struct nvme_poll_group *group = arg; 1631 int64_t num_completions; 1632 1633 if (group->collect_spin_stat && group->start_ticks == 0) { 1634 group->start_ticks = spdk_get_ticks(); 1635 } 1636 1637 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1638 bdev_nvme_disconnected_qpair_cb); 1639 if (group->collect_spin_stat) { 1640 if (num_completions > 0) { 1641 if (group->end_ticks != 0) { 1642 group->spin_ticks += (group->end_ticks - group->start_ticks); 1643 group->end_ticks = 0; 1644 } 1645 group->start_ticks = 0; 1646 } else { 1647 group->end_ticks = spdk_get_ticks(); 1648 } 1649 } 1650 1651 if (spdk_unlikely(num_completions < 0)) { 1652 bdev_nvme_check_io_qpairs(group); 1653 } 1654 1655 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1656 } 1657 1658 static int bdev_nvme_poll_adminq(void *arg); 1659 1660 static void 1661 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1662 { 1663 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1664 1665 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1666 nvme_ctrlr, new_period_us); 1667 } 1668 1669 static int 1670 bdev_nvme_poll_adminq(void *arg) 1671 { 1672 int32_t rc; 1673 struct nvme_ctrlr *nvme_ctrlr = arg; 1674 nvme_ctrlr_disconnected_cb disconnected_cb; 1675 1676 assert(nvme_ctrlr != NULL); 1677 1678 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1679 if (rc < 0) { 1680 disconnected_cb = nvme_ctrlr->disconnected_cb; 1681 nvme_ctrlr->disconnected_cb = NULL; 1682 1683 if (disconnected_cb != NULL) { 1684 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1685 g_opts.nvme_adminq_poll_period_us); 1686 disconnected_cb(nvme_ctrlr); 1687 } else { 1688 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1689 } 1690 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1691 SPDK_NVME_QPAIR_FAILURE_NONE) { 1692 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1693 } 1694 1695 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1696 } 1697 1698 static void 1699 nvme_bdev_free(void *io_device) 1700 { 1701 struct nvme_bdev *nvme_disk = io_device; 1702 1703 pthread_mutex_destroy(&nvme_disk->mutex); 1704 free(nvme_disk->disk.name); 1705 free(nvme_disk->err_stat); 1706 free(nvme_disk); 1707 } 1708 1709 static int 1710 bdev_nvme_destruct(void *ctx) 1711 { 1712 struct nvme_bdev *nvme_disk = ctx; 1713 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1714 1715 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1716 1717 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1718 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1719 1720 nvme_ns->bdev = NULL; 1721 1722 assert(nvme_ns->id > 0); 1723 1724 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1725 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1726 1727 nvme_ctrlr_release(nvme_ns->ctrlr); 1728 nvme_ns_free(nvme_ns); 1729 } else { 1730 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1731 } 1732 } 1733 1734 pthread_mutex_lock(&g_bdev_nvme_mutex); 1735 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1736 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1737 1738 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1739 1740 return 0; 1741 } 1742 1743 static int 1744 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1745 { 1746 struct nvme_ctrlr *nvme_ctrlr; 1747 struct spdk_nvme_io_qpair_opts opts; 1748 struct spdk_nvme_qpair *qpair; 1749 int rc; 1750 1751 nvme_ctrlr = nvme_qpair->ctrlr; 1752 1753 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1754 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1755 opts.create_only = true; 1756 opts.async_mode = true; 1757 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1758 g_opts.io_queue_requests = opts.io_queue_requests; 1759 1760 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1761 if (qpair == NULL) { 1762 return -1; 1763 } 1764 1765 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1766 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1767 1768 assert(nvme_qpair->group != NULL); 1769 1770 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1771 if (rc != 0) { 1772 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1773 goto err; 1774 } 1775 1776 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1777 if (rc != 0) { 1778 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1779 goto err; 1780 } 1781 1782 nvme_qpair->qpair = qpair; 1783 1784 if (!g_opts.disable_auto_failback) { 1785 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1786 } 1787 1788 return 0; 1789 1790 err: 1791 spdk_nvme_ctrlr_free_io_qpair(qpair); 1792 1793 return rc; 1794 } 1795 1796 static void 1797 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1798 { 1799 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1800 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1801 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1802 struct nvme_bdev_io *bio; 1803 1804 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1805 status = SPDK_BDEV_IO_STATUS_FAILED; 1806 } 1807 1808 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1809 bio = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1810 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link); 1811 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), status, NULL); 1812 } 1813 1814 
spdk_for_each_channel_continue(i, 0); 1815 } 1816 1817 /* This function marks the current trid as failed by storing the current ticks 1818 * and then sets the next trid to the active trid within a controller if exists. 1819 * 1820 * The purpose of the boolean return value is to request the caller to disconnect 1821 * the current trid now to try connecting the next trid. 1822 */ 1823 static bool 1824 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1825 { 1826 struct nvme_path_id *path_id, *next_path; 1827 int rc __attribute__((unused)); 1828 1829 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1830 assert(path_id); 1831 assert(path_id == nvme_ctrlr->active_path_id); 1832 next_path = TAILQ_NEXT(path_id, link); 1833 1834 /* Update the last failed time. It means the trid is failed if its last 1835 * failed time is non-zero. 1836 */ 1837 path_id->last_failed_tsc = spdk_get_ticks(); 1838 1839 if (next_path == NULL) { 1840 /* There is no alternate trid within a controller. */ 1841 return false; 1842 } 1843 1844 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1845 /* Connect is not retried in a controller reset sequence. Connecting 1846 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1847 */ 1848 return false; 1849 } 1850 1851 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1852 1853 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1854 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1855 1856 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1857 nvme_ctrlr->active_path_id = next_path; 1858 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1859 assert(rc == 0); 1860 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1861 if (!remove) { 1862 /** Shuffle the old trid to the end of the list and use the new one. 1863 * Allows for round robin through multiple connections. 1864 */ 1865 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1866 } else { 1867 free(path_id); 1868 } 1869 1870 if (start || next_path->last_failed_tsc == 0) { 1871 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1872 * or used yet. Try the next trid now. 1873 */ 1874 return true; 1875 } 1876 1877 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1878 nvme_ctrlr->opts.reconnect_delay_sec) { 1879 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1880 return true; 1881 } 1882 1883 /* The next trid will be tried after reconnect_delay_sec seconds. 
*/ 1884 return false; 1885 } 1886 1887 static bool 1888 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1889 { 1890 int32_t elapsed; 1891 1892 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1893 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1894 return false; 1895 } 1896 1897 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1898 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1899 return true; 1900 } else { 1901 return false; 1902 } 1903 } 1904 1905 static bool 1906 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1907 { 1908 uint32_t elapsed; 1909 1910 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1911 return false; 1912 } 1913 1914 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1915 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1916 return true; 1917 } else { 1918 return false; 1919 } 1920 } 1921 1922 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1923 1924 static void 1925 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1926 { 1927 int rc; 1928 1929 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1930 if (rc != 0) { 1931 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1932 * fail the reset sequence immediately. 1933 */ 1934 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1935 return; 1936 } 1937 1938 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1939 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1940 */ 1941 assert(nvme_ctrlr->disconnected_cb == NULL); 1942 nvme_ctrlr->disconnected_cb = cb_fn; 1943 1944 /* During disconnection, reduce the period to poll adminq more often. */ 1945 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1946 } 1947 1948 enum bdev_nvme_op_after_reset { 1949 OP_NONE, 1950 OP_COMPLETE_PENDING_DESTRUCT, 1951 OP_DESTRUCT, 1952 OP_DELAYED_RECONNECT, 1953 OP_FAILOVER, 1954 }; 1955 1956 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1957 1958 static _bdev_nvme_op_after_reset 1959 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1960 { 1961 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1962 /* Complete pending destruct after reset completes. 
*/ 1963 return OP_COMPLETE_PENDING_DESTRUCT; 1964 } else if (nvme_ctrlr->pending_failover) { 1965 nvme_ctrlr->pending_failover = false; 1966 nvme_ctrlr->reset_start_tsc = 0; 1967 return OP_FAILOVER; 1968 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1969 nvme_ctrlr->reset_start_tsc = 0; 1970 return OP_NONE; 1971 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1972 return OP_DESTRUCT; 1973 } else { 1974 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1975 nvme_ctrlr->fast_io_fail_timedout = true; 1976 } 1977 return OP_DELAYED_RECONNECT; 1978 } 1979 } 1980 1981 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1982 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1983 1984 static int 1985 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1986 { 1987 struct nvme_ctrlr *nvme_ctrlr = ctx; 1988 1989 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1990 pthread_mutex_lock(&nvme_ctrlr->mutex); 1991 1992 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1993 1994 if (!nvme_ctrlr->reconnect_is_delayed) { 1995 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1996 return SPDK_POLLER_BUSY; 1997 } 1998 1999 nvme_ctrlr->reconnect_is_delayed = false; 2000 2001 if (nvme_ctrlr->destruct) { 2002 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2003 return SPDK_POLLER_BUSY; 2004 } 2005 2006 assert(nvme_ctrlr->resetting == false); 2007 nvme_ctrlr->resetting = true; 2008 2009 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2010 2011 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2012 2013 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2014 return SPDK_POLLER_BUSY; 2015 } 2016 2017 static void 2018 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2019 { 2020 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2021 2022 assert(nvme_ctrlr->reconnect_is_delayed == false); 2023 nvme_ctrlr->reconnect_is_delayed = true; 2024 2025 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2026 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2027 nvme_ctrlr, 2028 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2029 } 2030 2031 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2032 2033 static void 2034 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2035 { 2036 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2037 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2038 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2039 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2040 enum bdev_nvme_op_after_reset op_after_reset; 2041 2042 assert(nvme_ctrlr->thread == spdk_get_thread()); 2043 2044 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2045 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2046 2047 if (!success) { 2048 SPDK_ERRLOG("Resetting controller failed.\n"); 2049 } else { 2050 SPDK_NOTICELOG("Resetting controller successful.\n"); 2051 } 2052 2053 pthread_mutex_lock(&nvme_ctrlr->mutex); 2054 nvme_ctrlr->resetting = false; 2055 nvme_ctrlr->dont_retry = false; 2056 nvme_ctrlr->in_failover = false; 2057 2058 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2059 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2060 2061 /* Delay callbacks when the next operation is a failover. */ 2062 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2063 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2064 } 2065 2066 switch (op_after_reset) { 2067 case OP_COMPLETE_PENDING_DESTRUCT: 2068 nvme_ctrlr_unregister(nvme_ctrlr); 2069 break; 2070 case OP_DESTRUCT: 2071 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2072 remove_discovery_entry(nvme_ctrlr); 2073 break; 2074 case OP_DELAYED_RECONNECT: 2075 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2076 break; 2077 case OP_FAILOVER: 2078 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2079 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2080 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2081 break; 2082 default: 2083 break; 2084 } 2085 } 2086 2087 static void 2088 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2089 { 2090 pthread_mutex_lock(&nvme_ctrlr->mutex); 2091 if (!success) { 2092 /* Connecting the active trid failed. Set the next alternate trid to the 2093 * active trid if it exists. 2094 */ 2095 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2096 /* The next alternate trid exists and is ready to try. Try it now. */ 2097 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2098 2099 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2100 return; 2101 } 2102 2103 /* We came here if there is no alternate trid or if the next trid exists but 2104 * is not ready to try. We will try the active trid after reconnect_delay_sec 2105 * seconds if it is non-zero or at the next reset call otherwise. 2106 */ 2107 } else { 2108 /* Connecting the active trid succeeded. Clear the last failed time because it 2109 * means the trid is failed if its last failed time is non-zero. 2110 */ 2111 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2112 } 2113 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2114 2115 /* Make sure we clear any pending resets before returning. */ 2116 spdk_for_each_channel(nvme_ctrlr, 2117 bdev_nvme_complete_pending_resets, 2118 success ? NULL : (void *)0x1, 2119 _bdev_nvme_reset_ctrlr_complete); 2120 } 2121 2122 static void 2123 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2124 { 2125 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2126 2127 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2128 } 2129 2130 static void 2131 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2132 { 2133 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2134 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2135 struct nvme_qpair *nvme_qpair; 2136 2137 nvme_qpair = ctrlr_ch->qpair; 2138 assert(nvme_qpair != NULL); 2139 2140 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2141 2142 if (nvme_qpair->qpair != NULL) { 2143 if (nvme_qpair->ctrlr->dont_retry) { 2144 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2145 } 2146 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2147 2148 /* The current full reset sequence will move to the next 2149 * ctrlr_channel after the qpair is actually disconnected. 2150 */ 2151 assert(ctrlr_ch->reset_iter == NULL); 2152 ctrlr_ch->reset_iter = i; 2153 } else { 2154 spdk_for_each_channel_continue(i, 0); 2155 } 2156 } 2157 2158 static void 2159 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2160 { 2161 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2162 2163 if (status == 0) { 2164 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2165 } else { 2166 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
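* A failure while recreating qpairs can leave some channels connected and others not, so destroy every qpair again before reporting the reset as failed.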
*/ 2167 spdk_for_each_channel(nvme_ctrlr, 2168 bdev_nvme_reset_destroy_qpair, 2169 NULL, 2170 bdev_nvme_reset_create_qpairs_failed); 2171 } 2172 } 2173 2174 static int 2175 bdev_nvme_reset_check_qpair_connected(void *ctx) 2176 { 2177 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2178 2179 if (ctrlr_ch->reset_iter == NULL) { 2180 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2181 assert(ctrlr_ch->connect_poller == NULL); 2182 assert(ctrlr_ch->qpair->qpair == NULL); 2183 return SPDK_POLLER_BUSY; 2184 } 2185 2186 assert(ctrlr_ch->qpair->qpair != NULL); 2187 2188 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2189 return SPDK_POLLER_BUSY; 2190 } 2191 2192 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2193 2194 /* The qpair finished connecting. Move to the next ctrlr_channel. */ 2195 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2196 ctrlr_ch->reset_iter = NULL; 2197 2198 if (!g_opts.disable_auto_failback) { 2199 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2200 } 2201 2202 return SPDK_POLLER_BUSY; 2203 } 2204 2205 static void 2206 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 2207 { 2208 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2209 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 2210 int rc; 2211 2212 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2213 if (rc == 0) { 2214 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2215 ctrlr_ch, 0); 2216 2217 /* The current full reset sequence will move to the next 2218 * ctrlr_channel after the qpair is actually connected. 2219 */ 2220 assert(ctrlr_ch->reset_iter == NULL); 2221 ctrlr_ch->reset_iter = i; 2222 } else { 2223 spdk_for_each_channel_continue(i, rc); 2224 } 2225 } 2226 2227 static void 2228 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2229 { 2230 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2231 struct nvme_ns *nvme_ns; 2232 2233 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2234 nvme_ns != NULL; 2235 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2236 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2237 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2238 /* NS can be added again. Just nullify nvme_ns->ns. 
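* The nvme_ns entry itself is kept, so if the namespace becomes active again after the reset only the ns pointer needs to be repopulated.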
*/ 2239 nvme_ns->ns = NULL; 2240 } 2241 } 2242 } 2243 2244 2245 static int 2246 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2247 { 2248 struct nvme_ctrlr *nvme_ctrlr = arg; 2249 int rc = -ETIMEDOUT; 2250 2251 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2252 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2253 if (rc == -EAGAIN) { 2254 return SPDK_POLLER_BUSY; 2255 } 2256 } 2257 2258 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2259 if (rc == 0) { 2260 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2261 2262 /* Recreate all of the I/O queue pairs */ 2263 spdk_for_each_channel(nvme_ctrlr, 2264 bdev_nvme_reset_create_qpair, 2265 NULL, 2266 bdev_nvme_reset_create_qpairs_done); 2267 } else { 2268 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2269 } 2270 return SPDK_POLLER_BUSY; 2271 } 2272 2273 static void 2274 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2275 { 2276 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2277 2278 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2279 assert(nvme_ctrlr->reset_detach_poller == NULL); 2280 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2281 nvme_ctrlr, 0); 2282 } 2283 2284 static void 2285 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status) 2286 { 2287 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2288 2289 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2290 assert(status == 0); 2291 2292 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2293 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2294 } else { 2295 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2296 } 2297 } 2298 2299 static void 2300 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2301 { 2302 spdk_for_each_channel(nvme_ctrlr, 2303 bdev_nvme_reset_destroy_qpair, 2304 NULL, 2305 bdev_nvme_reset_destroy_qpair_done); 2306 } 2307 2308 static void 2309 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2310 { 2311 struct nvme_ctrlr *nvme_ctrlr = ctx; 2312 2313 assert(nvme_ctrlr->resetting == true); 2314 assert(nvme_ctrlr->thread == spdk_get_thread()); 2315 2316 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2317 2318 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2319 2320 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2321 } 2322 2323 static void 2324 _bdev_nvme_reset_ctrlr(void *ctx) 2325 { 2326 struct nvme_ctrlr *nvme_ctrlr = ctx; 2327 2328 assert(nvme_ctrlr->resetting == true); 2329 assert(nvme_ctrlr->thread == spdk_get_thread()); 2330 2331 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2332 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2333 } else { 2334 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2335 } 2336 } 2337 2338 static int 2339 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2340 { 2341 spdk_msg_fn msg_fn; 2342 2343 pthread_mutex_lock(&nvme_ctrlr->mutex); 2344 if (nvme_ctrlr->destruct) { 2345 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2346 return -ENXIO; 2347 } 2348 2349 if (nvme_ctrlr->resetting) { 2350 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2351 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2352 return -EBUSY; 2353 } 2354 2355 if (nvme_ctrlr->disabled) { 2356 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2357 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2358 return -EALREADY; 2359 } 2360 2361 nvme_ctrlr->resetting = true; 2362 nvme_ctrlr->dont_retry = true; 2363 2364 if (nvme_ctrlr->reconnect_is_delayed) { 2365 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2366 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2367 nvme_ctrlr->reconnect_is_delayed = false; 2368 } else { 2369 msg_fn = _bdev_nvme_reset_ctrlr; 2370 assert(nvme_ctrlr->reset_start_tsc == 0); 2371 } 2372 2373 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2374 2375 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2376 2377 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2378 return 0; 2379 } 2380 2381 static int 2382 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2383 { 2384 pthread_mutex_lock(&nvme_ctrlr->mutex); 2385 if (nvme_ctrlr->destruct) { 2386 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2387 return -ENXIO; 2388 } 2389 2390 if (nvme_ctrlr->resetting) { 2391 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2392 return -EBUSY; 2393 } 2394 2395 if (!nvme_ctrlr->disabled) { 2396 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2397 return -EALREADY; 2398 } 2399 2400 nvme_ctrlr->disabled = false; 2401 nvme_ctrlr->resetting = true; 2402 2403 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2404 2405 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2406 2407 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2408 return 0; 2409 } 2410 2411 static void 2412 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2413 { 2414 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2415 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2416 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2417 enum bdev_nvme_op_after_reset op_after_disable; 2418 2419 assert(nvme_ctrlr->thread == spdk_get_thread()); 2420 2421 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2422 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2423 2424 pthread_mutex_lock(&nvme_ctrlr->mutex); 2425 2426 nvme_ctrlr->resetting = false; 2427 nvme_ctrlr->dont_retry = false; 2428 2429 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2430 2431 nvme_ctrlr->disabled = true; 2432 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2433 2434 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2435 2436 if (ctrlr_op_cb_fn) { 2437 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2438 } 2439 2440 switch (op_after_disable) { 2441 case OP_COMPLETE_PENDING_DESTRUCT: 2442 nvme_ctrlr_unregister(nvme_ctrlr); 2443 break; 2444 default: 2445 break; 2446 } 2447 2448 } 2449 2450 static void 2451 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2452 { 2453 /* Make sure we clear any pending resets before returning. 
*/ 2454 spdk_for_each_channel(nvme_ctrlr, 2455 bdev_nvme_complete_pending_resets, 2456 NULL, 2457 _bdev_nvme_disable_ctrlr_complete); 2458 } 2459 2460 static void 2461 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2462 { 2463 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2464 2465 assert(status == 0); 2466 2467 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2468 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2469 } else { 2470 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2471 } 2472 } 2473 2474 static void 2475 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2476 { 2477 spdk_for_each_channel(nvme_ctrlr, 2478 bdev_nvme_reset_destroy_qpair, 2479 NULL, 2480 bdev_nvme_disable_destroy_qpairs_done); 2481 } 2482 2483 static void 2484 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2485 { 2486 struct nvme_ctrlr *nvme_ctrlr = ctx; 2487 2488 assert(nvme_ctrlr->resetting == true); 2489 assert(nvme_ctrlr->thread == spdk_get_thread()); 2490 2491 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2492 2493 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2494 } 2495 2496 static void 2497 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2498 { 2499 struct nvme_ctrlr *nvme_ctrlr = ctx; 2500 2501 assert(nvme_ctrlr->resetting == true); 2502 assert(nvme_ctrlr->thread == spdk_get_thread()); 2503 2504 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2505 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2506 } else { 2507 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2508 } 2509 } 2510 2511 static int 2512 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2513 { 2514 spdk_msg_fn msg_fn; 2515 2516 pthread_mutex_lock(&nvme_ctrlr->mutex); 2517 if (nvme_ctrlr->destruct) { 2518 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2519 return -ENXIO; 2520 } 2521 2522 if (nvme_ctrlr->resetting) { 2523 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2524 return -EBUSY; 2525 } 2526 2527 if (nvme_ctrlr->disabled) { 2528 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2529 return -EALREADY; 2530 } 2531 2532 nvme_ctrlr->resetting = true; 2533 nvme_ctrlr->dont_retry = true; 2534 2535 if (nvme_ctrlr->reconnect_is_delayed) { 2536 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2537 nvme_ctrlr->reconnect_is_delayed = false; 2538 } else { 2539 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2540 } 2541 2542 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2543 2544 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2545 2546 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2547 return 0; 2548 } 2549 2550 static int 2551 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2552 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2553 { 2554 int rc; 2555 2556 switch (op) { 2557 case NVME_CTRLR_OP_RESET: 2558 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2559 break; 2560 case NVME_CTRLR_OP_ENABLE: 2561 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2562 break; 2563 case NVME_CTRLR_OP_DISABLE: 2564 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2565 break; 2566 default: 2567 rc = -EINVAL; 2568 break; 2569 } 2570 2571 if (rc == 0) { 2572 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2573 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2574 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2575 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2576 } 2577 return rc; 2578 } 2579 2580 struct nvme_ctrlr_op_rpc_ctx { 2581 struct nvme_ctrlr *nvme_ctrlr; 2582 struct spdk_thread *orig_thread; 2583 enum nvme_ctrlr_op op; 2584 int rc; 2585 
bdev_nvme_ctrlr_op_cb cb_fn; 2586 void *cb_arg; 2587 }; 2588 2589 static void 2590 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2591 { 2592 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2593 2594 assert(ctx != NULL); 2595 assert(ctx->cb_fn != NULL); 2596 2597 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2598 2599 free(ctx); 2600 } 2601 2602 static void 2603 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2604 { 2605 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2606 2607 ctx->rc = rc; 2608 2609 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2610 } 2611 2612 void 2613 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2614 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2615 { 2616 struct nvme_ctrlr_op_rpc_ctx *ctx; 2617 int rc; 2618 2619 assert(cb_fn != NULL); 2620 2621 ctx = calloc(1, sizeof(*ctx)); 2622 if (ctx == NULL) { 2623 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2624 cb_fn(cb_arg, -ENOMEM); 2625 return; 2626 } 2627 2628 ctx->orig_thread = spdk_get_thread(); 2629 ctx->cb_fn = cb_fn; 2630 ctx->cb_arg = cb_arg; 2631 2632 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2633 if (rc == 0) { 2634 return; 2635 } else if (rc == -EALREADY) { 2636 rc = 0; 2637 } 2638 2639 nvme_ctrlr_op_rpc_complete(ctx, rc); 2640 } 2641 2642 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2643 2644 static void 2645 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2646 { 2647 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2648 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2649 int rc; 2650 2651 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2652 ctx->nvme_ctrlr = NULL; 2653 2654 if (ctx->rc != 0) { 2655 goto complete; 2656 } 2657 2658 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2659 if (next_nvme_ctrlr == NULL) { 2660 goto complete; 2661 } 2662 2663 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2664 if (rc == 0) { 2665 ctx->nvme_ctrlr = next_nvme_ctrlr; 2666 return; 2667 } else if (rc == -EALREADY) { 2668 ctx->nvme_ctrlr = next_nvme_ctrlr; 2669 rc = 0; 2670 } 2671 2672 ctx->rc = rc; 2673 2674 complete: 2675 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2676 free(ctx); 2677 } 2678 2679 static void 2680 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2681 { 2682 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2683 2684 ctx->rc = rc; 2685 2686 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2687 } 2688 2689 void 2690 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2691 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2692 { 2693 struct nvme_ctrlr_op_rpc_ctx *ctx; 2694 struct nvme_ctrlr *nvme_ctrlr; 2695 int rc; 2696 2697 assert(cb_fn != NULL); 2698 2699 ctx = calloc(1, sizeof(*ctx)); 2700 if (ctx == NULL) { 2701 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2702 cb_fn(cb_arg, -ENOMEM); 2703 return; 2704 } 2705 2706 ctx->orig_thread = spdk_get_thread(); 2707 ctx->op = op; 2708 ctx->cb_fn = cb_fn; 2709 ctx->cb_arg = cb_arg; 2710 2711 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2712 assert(nvme_ctrlr != NULL); 2713 2714 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2715 if (rc == 0) { 2716 ctx->nvme_ctrlr = nvme_ctrlr; 2717 return; 2718 } else if (rc == -EALREADY) { 2719 ctx->nvme_ctrlr = nvme_ctrlr; 2720 rc = 0; 2721 } 2722 2723 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2724 } 2725 2726 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2727 2728 static void 2729 
_bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2730 { 2731 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2732 enum spdk_bdev_io_status io_status; 2733 2734 if (bio->cpl.cdw0 == 0) { 2735 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2736 } else { 2737 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2738 } 2739 2740 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2741 } 2742 2743 static void 2744 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2745 { 2746 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2747 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2748 2749 bdev_nvme_abort_retry_ios(nbdev_ch); 2750 2751 spdk_for_each_channel_continue(i, 0); 2752 } 2753 2754 static void 2755 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2756 { 2757 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2758 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2759 2760 /* Abort all queued I/Os for retry. */ 2761 spdk_for_each_channel(nbdev, 2762 bdev_nvme_abort_bdev_channel, 2763 bio, 2764 _bdev_nvme_reset_io_complete); 2765 } 2766 2767 static void 2768 _bdev_nvme_reset_io_continue(void *ctx) 2769 { 2770 struct nvme_bdev_io *bio = ctx; 2771 struct nvme_io_path *prev_io_path, *next_io_path; 2772 int rc; 2773 2774 prev_io_path = bio->io_path; 2775 bio->io_path = NULL; 2776 2777 if (bio->cpl.cdw0 != 0) { 2778 goto complete; 2779 } 2780 2781 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2782 if (next_io_path == NULL) { 2783 goto complete; 2784 } 2785 2786 rc = _bdev_nvme_reset_io(next_io_path, bio); 2787 if (rc == 0) { 2788 return; 2789 } 2790 2791 bio->cpl.cdw0 = 1; 2792 2793 complete: 2794 bdev_nvme_reset_io_complete(bio); 2795 } 2796 2797 static void 2798 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2799 { 2800 struct nvme_bdev_io *bio = cb_arg; 2801 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2802 2803 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2804 2805 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2806 } 2807 2808 static int 2809 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2810 { 2811 struct nvme_ctrlr_channel *ctrlr_ch; 2812 int rc; 2813 2814 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2815 bdev_nvme_reset_io_continue, bio); 2816 if (rc == 0) { 2817 assert(bio->io_path == NULL); 2818 bio->io_path = io_path; 2819 } else if (rc == -EBUSY) { 2820 ctrlr_ch = io_path->qpair->ctrlr_ch; 2821 assert(ctrlr_ch != NULL); 2822 /* 2823 * Reset call is queued only if it is from the app framework. This is on purpose so that 2824 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2825 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2826 */ 2827 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2828 rc = 0; 2829 } 2830 2831 return rc; 2832 } 2833 2834 static void 2835 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2836 { 2837 struct nvme_io_path *io_path; 2838 int rc; 2839 2840 bio->cpl.cdw0 = 0; 2841 2842 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2843 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2844 assert(io_path != NULL); 2845 2846 rc = _bdev_nvme_reset_io(io_path, bio); 2847 if (rc != 0) { 2848 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. 
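* nvme_ctrlr_op() returns -EALREADY only when the controller is disabled, which is why that value is singled out in the call below.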
*/ 2849 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2850 } 2851 } 2852 2853 static int 2854 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2855 { 2856 if (nvme_ctrlr->destruct) { 2857 /* Don't bother resetting if the controller is in the process of being destructed. */ 2858 return -ENXIO; 2859 } 2860 2861 if (nvme_ctrlr->resetting) { 2862 if (!nvme_ctrlr->in_failover) { 2863 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2864 2865 /* Defer failover until reset completes. */ 2866 nvme_ctrlr->pending_failover = true; 2867 return -EINPROGRESS; 2868 } else { 2869 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2870 return -EBUSY; 2871 } 2872 } 2873 2874 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2875 2876 if (nvme_ctrlr->reconnect_is_delayed) { 2877 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2878 2879 /* We rely on the next reconnect for the failover. */ 2880 return -EALREADY; 2881 } 2882 2883 if (nvme_ctrlr->disabled) { 2884 SPDK_NOTICELOG("Controller is disabled.\n"); 2885 2886 /* We rely on the enablement for the failover. */ 2887 return -EALREADY; 2888 } 2889 2890 nvme_ctrlr->resetting = true; 2891 nvme_ctrlr->in_failover = true; 2892 2893 assert(nvme_ctrlr->reset_start_tsc == 0); 2894 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2895 2896 return 0; 2897 } 2898 2899 static int 2900 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2901 { 2902 int rc; 2903 2904 pthread_mutex_lock(&nvme_ctrlr->mutex); 2905 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2906 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2907 2908 if (rc == 0) { 2909 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2910 } else if (rc == -EALREADY) { 2911 rc = 0; 2912 } 2913 2914 return rc; 2915 } 2916 2917 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2918 uint64_t num_blocks); 2919 2920 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2921 uint64_t num_blocks); 2922 2923 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2924 uint64_t src_offset_blocks, 2925 uint64_t num_blocks); 2926 2927 static void 2928 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2929 bool success) 2930 { 2931 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2932 int ret; 2933 2934 if (!success) { 2935 ret = -EINVAL; 2936 goto exit; 2937 } 2938 2939 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2940 ret = -ENXIO; 2941 goto exit; 2942 } 2943 2944 ret = bdev_nvme_readv(bio, 2945 bdev_io->u.bdev.iovs, 2946 bdev_io->u.bdev.iovcnt, 2947 bdev_io->u.bdev.md_buf, 2948 bdev_io->u.bdev.num_blocks, 2949 bdev_io->u.bdev.offset_blocks, 2950 bdev_io->u.bdev.dif_check_flags, 2951 bdev_io->u.bdev.memory_domain, 2952 bdev_io->u.bdev.memory_domain_ctx, 2953 bdev_io->u.bdev.accel_sequence); 2954 2955 exit: 2956 if (spdk_unlikely(ret != 0)) { 2957 bdev_nvme_io_complete(bio, ret); 2958 } 2959 } 2960 2961 static inline void 2962 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2963 { 2964 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2965 struct spdk_bdev *bdev = bdev_io->bdev; 2966 struct nvme_bdev_io *nbdev_io_to_abort; 2967 int rc = 0; 2968 2969 switch (bdev_io->type) { 2970 case SPDK_BDEV_IO_TYPE_READ: 2971 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2972 
2973 rc = bdev_nvme_readv(nbdev_io, 2974 bdev_io->u.bdev.iovs, 2975 bdev_io->u.bdev.iovcnt, 2976 bdev_io->u.bdev.md_buf, 2977 bdev_io->u.bdev.num_blocks, 2978 bdev_io->u.bdev.offset_blocks, 2979 bdev_io->u.bdev.dif_check_flags, 2980 bdev_io->u.bdev.memory_domain, 2981 bdev_io->u.bdev.memory_domain_ctx, 2982 bdev_io->u.bdev.accel_sequence); 2983 } else { 2984 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2985 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2986 rc = 0; 2987 } 2988 break; 2989 case SPDK_BDEV_IO_TYPE_WRITE: 2990 rc = bdev_nvme_writev(nbdev_io, 2991 bdev_io->u.bdev.iovs, 2992 bdev_io->u.bdev.iovcnt, 2993 bdev_io->u.bdev.md_buf, 2994 bdev_io->u.bdev.num_blocks, 2995 bdev_io->u.bdev.offset_blocks, 2996 bdev_io->u.bdev.dif_check_flags, 2997 bdev_io->u.bdev.memory_domain, 2998 bdev_io->u.bdev.memory_domain_ctx, 2999 bdev_io->u.bdev.accel_sequence); 3000 break; 3001 case SPDK_BDEV_IO_TYPE_COMPARE: 3002 rc = bdev_nvme_comparev(nbdev_io, 3003 bdev_io->u.bdev.iovs, 3004 bdev_io->u.bdev.iovcnt, 3005 bdev_io->u.bdev.md_buf, 3006 bdev_io->u.bdev.num_blocks, 3007 bdev_io->u.bdev.offset_blocks, 3008 bdev_io->u.bdev.dif_check_flags); 3009 break; 3010 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3011 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3012 bdev_io->u.bdev.iovs, 3013 bdev_io->u.bdev.iovcnt, 3014 bdev_io->u.bdev.fused_iovs, 3015 bdev_io->u.bdev.fused_iovcnt, 3016 bdev_io->u.bdev.md_buf, 3017 bdev_io->u.bdev.num_blocks, 3018 bdev_io->u.bdev.offset_blocks, 3019 bdev_io->u.bdev.dif_check_flags); 3020 break; 3021 case SPDK_BDEV_IO_TYPE_UNMAP: 3022 rc = bdev_nvme_unmap(nbdev_io, 3023 bdev_io->u.bdev.offset_blocks, 3024 bdev_io->u.bdev.num_blocks); 3025 break; 3026 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3027 rc = bdev_nvme_write_zeroes(nbdev_io, 3028 bdev_io->u.bdev.offset_blocks, 3029 bdev_io->u.bdev.num_blocks); 3030 break; 3031 case SPDK_BDEV_IO_TYPE_RESET: 3032 nbdev_io->io_path = NULL; 3033 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3034 return; 3035 3036 case SPDK_BDEV_IO_TYPE_FLUSH: 3037 bdev_nvme_io_complete(nbdev_io, 0); 3038 return; 3039 3040 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3041 rc = bdev_nvme_zone_appendv(nbdev_io, 3042 bdev_io->u.bdev.iovs, 3043 bdev_io->u.bdev.iovcnt, 3044 bdev_io->u.bdev.md_buf, 3045 bdev_io->u.bdev.num_blocks, 3046 bdev_io->u.bdev.offset_blocks, 3047 bdev_io->u.bdev.dif_check_flags); 3048 break; 3049 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3050 rc = bdev_nvme_get_zone_info(nbdev_io, 3051 bdev_io->u.zone_mgmt.zone_id, 3052 bdev_io->u.zone_mgmt.num_zones, 3053 bdev_io->u.zone_mgmt.buf); 3054 break; 3055 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3056 rc = bdev_nvme_zone_management(nbdev_io, 3057 bdev_io->u.zone_mgmt.zone_id, 3058 bdev_io->u.zone_mgmt.zone_action); 3059 break; 3060 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3061 nbdev_io->io_path = NULL; 3062 bdev_nvme_admin_passthru(nbdev_ch, 3063 nbdev_io, 3064 &bdev_io->u.nvme_passthru.cmd, 3065 bdev_io->u.nvme_passthru.buf, 3066 bdev_io->u.nvme_passthru.nbytes); 3067 return; 3068 3069 case SPDK_BDEV_IO_TYPE_NVME_IO: 3070 rc = bdev_nvme_io_passthru(nbdev_io, 3071 &bdev_io->u.nvme_passthru.cmd, 3072 bdev_io->u.nvme_passthru.buf, 3073 bdev_io->u.nvme_passthru.nbytes); 3074 break; 3075 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3076 rc = bdev_nvme_io_passthru_md(nbdev_io, 3077 &bdev_io->u.nvme_passthru.cmd, 3078 bdev_io->u.nvme_passthru.buf, 3079 bdev_io->u.nvme_passthru.nbytes, 3080 bdev_io->u.nvme_passthru.md_buf, 3081 bdev_io->u.nvme_passthru.md_len); 3082 break; 3083 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3084 rc = 
bdev_nvme_iov_passthru_md(nbdev_io, 3085 &bdev_io->u.nvme_passthru.cmd, 3086 bdev_io->u.nvme_passthru.iovs, 3087 bdev_io->u.nvme_passthru.iovcnt, 3088 bdev_io->u.nvme_passthru.nbytes, 3089 bdev_io->u.nvme_passthru.md_buf, 3090 bdev_io->u.nvme_passthru.md_len); 3091 break; 3092 case SPDK_BDEV_IO_TYPE_ABORT: 3093 nbdev_io->io_path = NULL; 3094 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3095 bdev_nvme_abort(nbdev_ch, 3096 nbdev_io, 3097 nbdev_io_to_abort); 3098 return; 3099 3100 case SPDK_BDEV_IO_TYPE_COPY: 3101 rc = bdev_nvme_copy(nbdev_io, 3102 bdev_io->u.bdev.offset_blocks, 3103 bdev_io->u.bdev.copy.src_offset_blocks, 3104 bdev_io->u.bdev.num_blocks); 3105 break; 3106 default: 3107 rc = -EINVAL; 3108 break; 3109 } 3110 3111 if (spdk_unlikely(rc != 0)) { 3112 bdev_nvme_io_complete(nbdev_io, rc); 3113 } 3114 } 3115 3116 static void 3117 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3118 { 3119 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3120 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3121 3122 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3123 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3124 } else { 3125 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3126 * We need to update submit_tsc here. 3127 */ 3128 nbdev_io->submit_tsc = spdk_get_ticks(); 3129 } 3130 3131 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3132 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3133 if (spdk_unlikely(!nbdev_io->io_path)) { 3134 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3135 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3136 return; 3137 } 3138 3139 /* Admin commands do not use the optimal I/O path. 3140 * Simply fall through even if it is not found. 3141 */ 3142 } 3143 3144 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3145 } 3146 3147 static bool 3148 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3149 { 3150 struct nvme_bdev *nbdev = ctx; 3151 struct nvme_ns *nvme_ns; 3152 struct spdk_nvme_ns *ns; 3153 struct spdk_nvme_ctrlr *ctrlr; 3154 const struct spdk_nvme_ctrlr_data *cdata; 3155 3156 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3157 assert(nvme_ns != NULL); 3158 ns = nvme_ns->ns; 3159 if (ns == NULL) { 3160 return false; 3161 } 3162 3163 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3164 3165 switch (io_type) { 3166 case SPDK_BDEV_IO_TYPE_READ: 3167 case SPDK_BDEV_IO_TYPE_WRITE: 3168 case SPDK_BDEV_IO_TYPE_RESET: 3169 case SPDK_BDEV_IO_TYPE_FLUSH: 3170 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3171 case SPDK_BDEV_IO_TYPE_NVME_IO: 3172 case SPDK_BDEV_IO_TYPE_ABORT: 3173 return true; 3174 3175 case SPDK_BDEV_IO_TYPE_COMPARE: 3176 return spdk_nvme_ns_supports_compare(ns); 3177 3178 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3179 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3180 3181 case SPDK_BDEV_IO_TYPE_UNMAP: 3182 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3183 return cdata->oncs.dsm; 3184 3185 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3186 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3187 return cdata->oncs.write_zeroes; 3188 3189 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3190 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3191 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3192 return true; 3193 } 3194 return false; 3195 3196 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3197 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3198 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3199 3200 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3201 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3202 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3203 3204 case SPDK_BDEV_IO_TYPE_COPY: 3205 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3206 return cdata->oncs.copy; 3207 3208 default: 3209 return false; 3210 } 3211 } 3212 3213 static int 3214 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3215 { 3216 struct nvme_qpair *nvme_qpair; 3217 struct spdk_io_channel *pg_ch; 3218 int rc; 3219 3220 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3221 if (!nvme_qpair) { 3222 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3223 return -1; 3224 } 3225 3226 TAILQ_INIT(&nvme_qpair->io_path_list); 3227 3228 nvme_qpair->ctrlr = nvme_ctrlr; 3229 nvme_qpair->ctrlr_ch = ctrlr_ch; 3230 3231 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3232 if (!pg_ch) { 3233 free(nvme_qpair); 3234 return -1; 3235 } 3236 3237 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3238 3239 #ifdef SPDK_CONFIG_VTUNE 3240 nvme_qpair->group->collect_spin_stat = true; 3241 #else 3242 nvme_qpair->group->collect_spin_stat = false; 3243 #endif 3244 3245 if (!nvme_ctrlr->disabled) { 3246 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3247 * be created when it's enabled. 3248 */ 3249 rc = bdev_nvme_create_qpair(nvme_qpair); 3250 if (rc != 0) { 3251 /* nvme_ctrlr can't create IO qpair if connection is down. 3252 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3253 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3254 * submitted IO will be queued until IO qpair is successfully created. 3255 * 3256 * Hence, if both are satisfied, ignore the failure. 
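* The qpair is simply left unconnected for now; reconnect handling retries creating it and, because bdev_retry_count is non-zero, I/O submitted in the meantime is queued instead of failed.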
3257 */ 3258 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3259 spdk_put_io_channel(pg_ch); 3260 free(nvme_qpair); 3261 return rc; 3262 } 3263 } 3264 } 3265 3266 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3267 3268 ctrlr_ch->qpair = nvme_qpair; 3269 3270 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3271 nvme_qpair->ctrlr->ref++; 3272 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3273 3274 return 0; 3275 } 3276 3277 static int 3278 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3279 { 3280 struct nvme_ctrlr *nvme_ctrlr = io_device; 3281 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3282 3283 TAILQ_INIT(&ctrlr_ch->pending_resets); 3284 3285 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3286 } 3287 3288 static void 3289 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3290 { 3291 struct nvme_io_path *io_path, *next; 3292 3293 assert(nvme_qpair->group != NULL); 3294 3295 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3296 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3297 nvme_io_path_free(io_path); 3298 } 3299 3300 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3301 3302 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3303 3304 nvme_ctrlr_release(nvme_qpair->ctrlr); 3305 3306 free(nvme_qpair); 3307 } 3308 3309 static void 3310 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3311 { 3312 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3313 struct nvme_qpair *nvme_qpair; 3314 3315 nvme_qpair = ctrlr_ch->qpair; 3316 assert(nvme_qpair != NULL); 3317 3318 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3319 3320 if (nvme_qpair->qpair != NULL) { 3321 if (ctrlr_ch->reset_iter == NULL) { 3322 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3323 } else { 3324 /* Skip current ctrlr_channel in a full reset sequence because 3325 * it is being deleted now. The qpair is already being disconnected. 3326 * We do not have to restart disconnecting it. 3327 */ 3328 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3329 } 3330 3331 /* We cannot release a reference to the poll group now. 3332 * The qpair may be disconnected asynchronously later. 3333 * We need to poll it until it is actually disconnected. 3334 * Just detach the qpair from the deleting ctrlr_channel. 
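* The nvme_qpair itself is expected to be freed later, once the poll group observes that the qpair has finished disconnecting and it is no longer referenced by any channel.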
3335 */ 3336 nvme_qpair->ctrlr_ch = NULL; 3337 } else { 3338 assert(ctrlr_ch->reset_iter == NULL); 3339 3340 nvme_qpair_delete(nvme_qpair); 3341 } 3342 } 3343 3344 static inline struct spdk_io_channel * 3345 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3346 { 3347 if (spdk_unlikely(!group->accel_channel)) { 3348 group->accel_channel = spdk_accel_get_io_channel(); 3349 if (!group->accel_channel) { 3350 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3351 group); 3352 return NULL; 3353 } 3354 } 3355 3356 return group->accel_channel; 3357 } 3358 3359 static void 3360 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3361 uint32_t iov_cnt, uint32_t seed, 3362 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3363 { 3364 struct spdk_io_channel *accel_ch; 3365 struct nvme_poll_group *group = ctx; 3366 int rc; 3367 3368 assert(cb_fn != NULL); 3369 3370 accel_ch = bdev_nvme_get_accel_channel(group); 3371 if (spdk_unlikely(accel_ch == NULL)) { 3372 cb_fn(cb_arg, -ENOMEM); 3373 return; 3374 } 3375 3376 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3377 if (rc) { 3378 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3379 if (rc == -ENOMEM || rc == -EINVAL) { 3380 cb_fn(cb_arg, rc); 3381 } 3382 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3383 } 3384 } 3385 3386 static void 3387 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3388 { 3389 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3390 } 3391 3392 static void 3393 bdev_nvme_abort_sequence(void *seq) 3394 { 3395 spdk_accel_sequence_abort(seq); 3396 } 3397 3398 static void 3399 bdev_nvme_reverse_sequence(void *seq) 3400 { 3401 spdk_accel_sequence_reverse(seq); 3402 } 3403 3404 static int 3405 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3406 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3407 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3408 { 3409 struct spdk_io_channel *ch; 3410 struct nvme_poll_group *group = ctx; 3411 3412 ch = bdev_nvme_get_accel_channel(group); 3413 if (spdk_unlikely(ch == NULL)) { 3414 return -ENOMEM; 3415 } 3416 3417 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3418 domain, domain_ctx, seed, cb_fn, cb_arg); 3419 } 3420 3421 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3422 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3423 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3424 .append_crc32c = bdev_nvme_append_crc32c, 3425 .finish_sequence = bdev_nvme_finish_sequence, 3426 .reverse_sequence = bdev_nvme_reverse_sequence, 3427 .abort_sequence = bdev_nvme_abort_sequence, 3428 }; 3429 3430 static int 3431 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3432 { 3433 struct nvme_poll_group *group = ctx_buf; 3434 3435 TAILQ_INIT(&group->qpair_list); 3436 3437 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3438 if (group->group == NULL) { 3439 return -1; 3440 } 3441 3442 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3443 3444 if (group->poller == NULL) { 3445 spdk_nvme_poll_group_destroy(group->group); 3446 return -1; 3447 } 3448 3449 return 0; 3450 } 3451 3452 static void 3453 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3454 { 3455 struct 
nvme_poll_group *group = ctx_buf; 3456 3457 assert(TAILQ_EMPTY(&group->qpair_list)); 3458 3459 if (group->accel_channel) { 3460 spdk_put_io_channel(group->accel_channel); 3461 } 3462 3463 spdk_poller_unregister(&group->poller); 3464 if (spdk_nvme_poll_group_destroy(group->group)) { 3465 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3466 assert(false); 3467 } 3468 } 3469 3470 static struct spdk_io_channel * 3471 bdev_nvme_get_io_channel(void *ctx) 3472 { 3473 struct nvme_bdev *nvme_bdev = ctx; 3474 3475 return spdk_get_io_channel(nvme_bdev); 3476 } 3477 3478 static void * 3479 bdev_nvme_get_module_ctx(void *ctx) 3480 { 3481 struct nvme_bdev *nvme_bdev = ctx; 3482 struct nvme_ns *nvme_ns; 3483 3484 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3485 return NULL; 3486 } 3487 3488 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3489 if (!nvme_ns) { 3490 return NULL; 3491 } 3492 3493 return nvme_ns->ns; 3494 } 3495 3496 static const char * 3497 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3498 { 3499 switch (ana_state) { 3500 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3501 return "optimized"; 3502 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3503 return "non_optimized"; 3504 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3505 return "inaccessible"; 3506 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3507 return "persistent_loss"; 3508 case SPDK_NVME_ANA_CHANGE_STATE: 3509 return "change"; 3510 default: 3511 return NULL; 3512 } 3513 } 3514 3515 static int 3516 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3517 { 3518 struct spdk_memory_domain **_domains = NULL; 3519 struct nvme_bdev *nbdev = ctx; 3520 struct nvme_ns *nvme_ns; 3521 int i = 0, _array_size = array_size; 3522 int rc = 0; 3523 3524 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3525 if (domains && array_size >= i) { 3526 _domains = &domains[i]; 3527 } else { 3528 _domains = NULL; 3529 } 3530 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3531 if (rc > 0) { 3532 i += rc; 3533 if (_array_size >= rc) { 3534 _array_size -= rc; 3535 } else { 3536 _array_size = 0; 3537 } 3538 } else if (rc < 0) { 3539 return rc; 3540 } 3541 } 3542 3543 return i; 3544 } 3545 3546 static const char * 3547 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3548 { 3549 if (nvme_ctrlr->destruct) { 3550 return "deleting"; 3551 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3552 return "failed"; 3553 } else if (nvme_ctrlr->resetting) { 3554 return "resetting"; 3555 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3556 return "reconnect_is_delayed"; 3557 } else if (nvme_ctrlr->disabled) { 3558 return "disabled"; 3559 } else { 3560 return "enabled"; 3561 } 3562 } 3563 3564 void 3565 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3566 { 3567 struct spdk_nvme_transport_id *trid; 3568 const struct spdk_nvme_ctrlr_opts *opts; 3569 const struct spdk_nvme_ctrlr_data *cdata; 3570 struct nvme_path_id *path_id; 3571 3572 spdk_json_write_object_begin(w); 3573 3574 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3575 3576 #ifdef SPDK_CONFIG_NVME_CUSE 3577 size_t cuse_name_size = 128; 3578 char cuse_name[cuse_name_size]; 3579 3580 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3581 if (rc == 0) { 3582 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3583 } 3584 #endif 3585 trid = &nvme_ctrlr->active_path_id->trid; 3586 
spdk_json_write_named_object_begin(w, "trid"); 3587 nvme_bdev_dump_trid_json(trid, w); 3588 spdk_json_write_object_end(w); 3589 3590 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3591 if (path_id != NULL) { 3592 spdk_json_write_named_array_begin(w, "alternate_trids"); 3593 do { 3594 trid = &path_id->trid; 3595 spdk_json_write_object_begin(w); 3596 nvme_bdev_dump_trid_json(trid, w); 3597 spdk_json_write_object_end(w); 3598 3599 path_id = TAILQ_NEXT(path_id, link); 3600 } while (path_id != NULL); 3601 spdk_json_write_array_end(w); 3602 } 3603 3604 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3605 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3606 3607 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3608 spdk_json_write_named_object_begin(w, "host"); 3609 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3610 spdk_json_write_named_string(w, "addr", opts->src_addr); 3611 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3612 spdk_json_write_object_end(w); 3613 3614 spdk_json_write_object_end(w); 3615 } 3616 3617 static void 3618 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3619 struct nvme_ns *nvme_ns) 3620 { 3621 struct spdk_nvme_ns *ns; 3622 struct spdk_nvme_ctrlr *ctrlr; 3623 const struct spdk_nvme_ctrlr_data *cdata; 3624 const struct spdk_nvme_transport_id *trid; 3625 union spdk_nvme_vs_register vs; 3626 const struct spdk_nvme_ns_data *nsdata; 3627 char buf[128]; 3628 3629 ns = nvme_ns->ns; 3630 if (ns == NULL) { 3631 return; 3632 } 3633 3634 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3635 3636 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3637 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3638 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3639 3640 spdk_json_write_object_begin(w); 3641 3642 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3643 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3644 } 3645 3646 spdk_json_write_named_object_begin(w, "trid"); 3647 3648 nvme_bdev_dump_trid_json(trid, w); 3649 3650 spdk_json_write_object_end(w); 3651 3652 #ifdef SPDK_CONFIG_NVME_CUSE 3653 size_t cuse_name_size = 128; 3654 char cuse_name[cuse_name_size]; 3655 3656 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3657 cuse_name, &cuse_name_size); 3658 if (rc == 0) { 3659 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3660 } 3661 #endif 3662 3663 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3664 3665 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3666 3667 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3668 3669 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3670 spdk_str_trim(buf); 3671 spdk_json_write_named_string(w, "model_number", buf); 3672 3673 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3674 spdk_str_trim(buf); 3675 spdk_json_write_named_string(w, "serial_number", buf); 3676 3677 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3678 spdk_str_trim(buf); 3679 spdk_json_write_named_string(w, "firmware_revision", buf); 3680 3681 if (cdata->subnqn[0] != '\0') { 3682 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3683 } 3684 3685 spdk_json_write_named_object_begin(w, "oacs"); 3686 3687 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3688 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3689 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3690 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3691 3692 spdk_json_write_object_end(w); 3693 3694 
spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3695 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3696 3697 spdk_json_write_object_end(w); 3698 3699 spdk_json_write_named_object_begin(w, "vs"); 3700 3701 spdk_json_write_name(w, "nvme_version"); 3702 if (vs.bits.ter) { 3703 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3704 } else { 3705 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3706 } 3707 3708 spdk_json_write_object_end(w); 3709 3710 nsdata = spdk_nvme_ns_get_data(ns); 3711 3712 spdk_json_write_named_object_begin(w, "ns_data"); 3713 3714 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3715 3716 if (cdata->cmic.ana_reporting) { 3717 spdk_json_write_named_string(w, "ana_state", 3718 _nvme_ana_state_str(nvme_ns->ana_state)); 3719 } 3720 3721 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3722 3723 spdk_json_write_object_end(w); 3724 3725 if (cdata->oacs.security) { 3726 spdk_json_write_named_object_begin(w, "security"); 3727 3728 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3729 3730 spdk_json_write_object_end(w); 3731 } 3732 3733 spdk_json_write_object_end(w); 3734 } 3735 3736 static const char * 3737 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3738 { 3739 switch (nbdev->mp_policy) { 3740 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3741 return "active_passive"; 3742 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3743 return "active_active"; 3744 default: 3745 assert(false); 3746 return "invalid"; 3747 } 3748 } 3749 3750 static int 3751 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3752 { 3753 struct nvme_bdev *nvme_bdev = ctx; 3754 struct nvme_ns *nvme_ns; 3755 3756 pthread_mutex_lock(&nvme_bdev->mutex); 3757 spdk_json_write_named_array_begin(w, "nvme"); 3758 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3759 nvme_namespace_info_json(w, nvme_ns); 3760 } 3761 spdk_json_write_array_end(w); 3762 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3763 pthread_mutex_unlock(&nvme_bdev->mutex); 3764 3765 return 0; 3766 } 3767 3768 static void 3769 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3770 { 3771 /* No config per bdev needed */ 3772 } 3773 3774 static uint64_t 3775 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3776 { 3777 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3778 struct nvme_io_path *io_path; 3779 struct nvme_poll_group *group; 3780 uint64_t spin_time = 0; 3781 3782 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3783 group = io_path->qpair->group; 3784 3785 if (!group || !group->collect_spin_stat) { 3786 continue; 3787 } 3788 3789 if (group->end_ticks != 0) { 3790 group->spin_ticks += (group->end_ticks - group->start_ticks); 3791 group->end_ticks = 0; 3792 } 3793 3794 spin_time += group->spin_ticks; 3795 group->start_ticks = 0; 3796 group->spin_ticks = 0; 3797 } 3798 3799 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3800 } 3801 3802 static void 3803 bdev_nvme_reset_device_stat(void *ctx) 3804 { 3805 struct nvme_bdev *nbdev = ctx; 3806 3807 if (nbdev->err_stat != NULL) { 3808 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3809 } 3810 } 3811 3812 /* JSON string should be lowercases and underscore delimited string. 
*/ 3813 static void 3814 bdev_nvme_format_nvme_status(char *dst, const char *src) 3815 { 3816 char tmp[256]; 3817 3818 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3819 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3820 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3821 spdk_strlwr(dst); 3822 } 3823 3824 static void 3825 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3826 { 3827 struct nvme_bdev *nbdev = ctx; 3828 struct spdk_nvme_status status = {}; 3829 uint16_t sct, sc; 3830 char status_json[256]; 3831 const char *status_str; 3832 3833 if (nbdev->err_stat == NULL) { 3834 return; 3835 } 3836 3837 spdk_json_write_named_object_begin(w, "nvme_error"); 3838 3839 spdk_json_write_named_object_begin(w, "status_type"); 3840 for (sct = 0; sct < 8; sct++) { 3841 if (nbdev->err_stat->status_type[sct] == 0) { 3842 continue; 3843 } 3844 status.sct = sct; 3845 3846 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3847 assert(status_str != NULL); 3848 bdev_nvme_format_nvme_status(status_json, status_str); 3849 3850 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3851 } 3852 spdk_json_write_object_end(w); 3853 3854 spdk_json_write_named_object_begin(w, "status_code"); 3855 for (sct = 0; sct < 4; sct++) { 3856 status.sct = sct; 3857 for (sc = 0; sc < 256; sc++) { 3858 if (nbdev->err_stat->status[sct][sc] == 0) { 3859 continue; 3860 } 3861 status.sc = sc; 3862 3863 status_str = spdk_nvme_cpl_get_status_string(&status); 3864 assert(status_str != NULL); 3865 bdev_nvme_format_nvme_status(status_json, status_str); 3866 3867 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3868 } 3869 } 3870 spdk_json_write_object_end(w); 3871 3872 spdk_json_write_object_end(w); 3873 } 3874 3875 static bool 3876 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3877 { 3878 struct nvme_bdev *nbdev = ctx; 3879 struct spdk_nvme_ctrlr *ctrlr; 3880 3881 if (!g_opts.allow_accel_sequence) { 3882 return false; 3883 } 3884 3885 switch (type) { 3886 case SPDK_BDEV_IO_TYPE_WRITE: 3887 case SPDK_BDEV_IO_TYPE_READ: 3888 break; 3889 default: 3890 return false; 3891 } 3892 3893 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3894 assert(ctrlr != NULL); 3895 3896 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3897 } 3898 3899 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3900 .destruct = bdev_nvme_destruct, 3901 .submit_request = bdev_nvme_submit_request, 3902 .io_type_supported = bdev_nvme_io_type_supported, 3903 .get_io_channel = bdev_nvme_get_io_channel, 3904 .dump_info_json = bdev_nvme_dump_info_json, 3905 .write_config_json = bdev_nvme_write_config_json, 3906 .get_spin_time = bdev_nvme_get_spin_time, 3907 .get_module_ctx = bdev_nvme_get_module_ctx, 3908 .get_memory_domains = bdev_nvme_get_memory_domains, 3909 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3910 .reset_device_stat = bdev_nvme_reset_device_stat, 3911 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3912 }; 3913 3914 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3915 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3916 3917 static int 3918 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3919 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3920 { 3921 struct spdk_nvme_ana_group_descriptor *copied_desc; 3922 uint8_t *orig_desc; 3923 uint32_t i, desc_size, copy_len; 3924 int rc = 0; 3925 3926 if (nvme_ctrlr->ana_log_page == NULL) { 3927 return 
-EINVAL; 3928 } 3929 3930 copied_desc = nvme_ctrlr->copied_ana_desc; 3931 3932 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3933 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3934 3935 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3936 memcpy(copied_desc, orig_desc, copy_len); 3937 3938 rc = cb_fn(copied_desc, cb_arg); 3939 if (rc != 0) { 3940 break; 3941 } 3942 3943 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3944 copied_desc->num_of_nsid * sizeof(uint32_t); 3945 orig_desc += desc_size; 3946 copy_len -= desc_size; 3947 } 3948 3949 return rc; 3950 } 3951 3952 static int 3953 nvme_ns_ana_transition_timedout(void *ctx) 3954 { 3955 struct nvme_ns *nvme_ns = ctx; 3956 3957 spdk_poller_unregister(&nvme_ns->anatt_timer); 3958 nvme_ns->ana_transition_timedout = true; 3959 3960 return SPDK_POLLER_BUSY; 3961 } 3962 3963 static void 3964 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3965 const struct spdk_nvme_ana_group_descriptor *desc) 3966 { 3967 const struct spdk_nvme_ctrlr_data *cdata; 3968 3969 nvme_ns->ana_group_id = desc->ana_group_id; 3970 nvme_ns->ana_state = desc->ana_state; 3971 nvme_ns->ana_state_updating = false; 3972 3973 switch (nvme_ns->ana_state) { 3974 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3975 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3976 nvme_ns->ana_transition_timedout = false; 3977 spdk_poller_unregister(&nvme_ns->anatt_timer); 3978 break; 3979 3980 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3981 case SPDK_NVME_ANA_CHANGE_STATE: 3982 if (nvme_ns->anatt_timer != NULL) { 3983 break; 3984 } 3985 3986 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3987 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3988 nvme_ns, 3989 cdata->anatt * SPDK_SEC_TO_USEC); 3990 break; 3991 default: 3992 break; 3993 } 3994 } 3995 3996 static int 3997 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3998 { 3999 struct nvme_ns *nvme_ns = cb_arg; 4000 uint32_t i; 4001 4002 assert(nvme_ns->ns != NULL); 4003 4004 for (i = 0; i < desc->num_of_nsid; i++) { 4005 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4006 continue; 4007 } 4008 4009 _nvme_ns_set_ana_state(nvme_ns, desc); 4010 return 1; 4011 } 4012 4013 return 0; 4014 } 4015 4016 static int 4017 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4018 { 4019 int rc = 0; 4020 struct spdk_uuid new_uuid, namespace_uuid; 4021 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4022 /* This namespace UUID was generated using uuid_generate() method. 
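 * It is used here as the fixed namespace input to spdk_uuid_generate_sha1(), so the
 * resulting bdev UUID is a deterministic, name-based (SHA-1) UUID derived from the
 * controller serial number concatenated with the NSID. The same controller and
 * namespace therefore always yield the same UUID across application restarts.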
*/ 4023 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4024 int size; 4025 4026 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4027 4028 spdk_uuid_set_null(&new_uuid); 4029 spdk_uuid_set_null(&namespace_uuid); 4030 4031 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4032 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4033 return -EINVAL; 4034 } 4035 4036 spdk_uuid_parse(&namespace_uuid, namespace_str); 4037 4038 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4039 if (rc == 0) { 4040 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4041 } 4042 4043 return rc; 4044 } 4045 4046 static int 4047 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4048 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4049 uint32_t prchk_flags, void *ctx) 4050 { 4051 const struct spdk_uuid *uuid; 4052 const uint8_t *nguid; 4053 const struct spdk_nvme_ctrlr_data *cdata; 4054 const struct spdk_nvme_ns_data *nsdata; 4055 const struct spdk_nvme_ctrlr_opts *opts; 4056 enum spdk_nvme_csi csi; 4057 uint32_t atomic_bs, phys_bs, bs; 4058 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4059 int rc; 4060 4061 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4062 csi = spdk_nvme_ns_get_csi(ns); 4063 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4064 4065 switch (csi) { 4066 case SPDK_NVME_CSI_NVM: 4067 disk->product_name = "NVMe disk"; 4068 break; 4069 case SPDK_NVME_CSI_ZNS: 4070 disk->product_name = "NVMe ZNS disk"; 4071 disk->zoned = true; 4072 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4073 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4074 spdk_nvme_ns_get_extended_sector_size(ns); 4075 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4076 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4077 break; 4078 default: 4079 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4080 return -ENOTSUP; 4081 } 4082 4083 nguid = spdk_nvme_ns_get_nguid(ns); 4084 if (!nguid) { 4085 uuid = spdk_nvme_ns_get_uuid(ns); 4086 if (uuid) { 4087 disk->uuid = *uuid; 4088 } else if (g_opts.generate_uuids) { 4089 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4090 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4091 if (rc != 0) { 4092 SPDK_ERRLOG("UUID generation failed (%s)\n", strerror(rc)); 4093 return rc; 4094 } 4095 } 4096 } else { 4097 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4098 } 4099 4100 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4101 if (!disk->name) { 4102 return -ENOMEM; 4103 } 4104 4105 disk->write_cache = 0; 4106 if (cdata->vwc.present) { 4107 /* Enable if the Volatile Write Cache exists */ 4108 disk->write_cache = 1; 4109 } 4110 if (cdata->oncs.write_zeroes) { 4111 disk->max_write_zeroes = UINT16_MAX + 1; 4112 } 4113 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4114 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4115 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4116 /* NVMe driver will split one request into multiple requests 4117 * based on MDTS and stripe boundary, the bdev layer will use 4118 * max_segment_size and max_num_segments to split one big IO 4119 * into multiple requests, then small request can't run out 4120 * of NVMe internal requests data structure. 
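 * max_num_segments is therefore derived from the I/O queue depth below and, when the
 * controller supports SGL, additionally clamped to the controller's maximum SGE count.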
4121 */ 4122 if (opts && opts->io_queue_requests) { 4123 disk->max_num_segments = opts->io_queue_requests / 2; 4124 } 4125 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4126 /* The nvme driver will try to split I/O that have too many 4127 * SGEs, but it doesn't work if that last SGE doesn't end on 4128 * an aggregate total that is block aligned. The bdev layer has 4129 * a more robust splitting framework, so use that instead for 4130 * this case. (See issue #3269.) 4131 */ 4132 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4133 4134 if (disk->max_num_segments == 0) { 4135 disk->max_num_segments = max_sges; 4136 } else { 4137 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4138 } 4139 } 4140 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4141 4142 nsdata = spdk_nvme_ns_get_data(ns); 4143 bs = spdk_nvme_ns_get_sector_size(ns); 4144 atomic_bs = bs; 4145 phys_bs = bs; 4146 if (nsdata->nabo == 0) { 4147 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4148 atomic_bs = bs * (1 + nsdata->nawupf); 4149 } else { 4150 atomic_bs = bs * (1 + cdata->awupf); 4151 } 4152 } 4153 if (nsdata->nsfeat.optperf) { 4154 phys_bs = bs * (1 + nsdata->npwg); 4155 } 4156 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4157 4158 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4159 if (disk->md_len != 0) { 4160 disk->md_interleave = nsdata->flbas.extended; 4161 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4162 if (disk->dif_type != SPDK_DIF_DISABLE) { 4163 disk->dif_is_head_of_md = nsdata->dps.md_start; 4164 disk->dif_check_flags = prchk_flags; 4165 } 4166 } 4167 4168 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4169 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4170 disk->acwu = 0; 4171 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4172 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4173 } else { 4174 disk->acwu = cdata->acwu + 1; /* 0-based */ 4175 } 4176 4177 if (cdata->oncs.copy) { 4178 /* For now bdev interface allows only single segment copy */ 4179 disk->max_copy = nsdata->mssrl; 4180 } 4181 4182 disk->ctxt = ctx; 4183 disk->fn_table = &nvmelib_fn_table; 4184 disk->module = &nvme_if; 4185 4186 return 0; 4187 } 4188 4189 static struct nvme_bdev * 4190 nvme_bdev_alloc(void) 4191 { 4192 struct nvme_bdev *bdev; 4193 int rc; 4194 4195 bdev = calloc(1, sizeof(*bdev)); 4196 if (!bdev) { 4197 SPDK_ERRLOG("bdev calloc() failed\n"); 4198 return NULL; 4199 } 4200 4201 if (g_opts.nvme_error_stat) { 4202 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4203 if (!bdev->err_stat) { 4204 SPDK_ERRLOG("err_stat calloc() failed\n"); 4205 free(bdev); 4206 return NULL; 4207 } 4208 } 4209 4210 rc = pthread_mutex_init(&bdev->mutex, NULL); 4211 if (rc != 0) { 4212 free(bdev->err_stat); 4213 free(bdev); 4214 return NULL; 4215 } 4216 4217 bdev->ref = 1; 4218 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4219 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4220 bdev->rr_min_io = UINT32_MAX; 4221 TAILQ_INIT(&bdev->nvme_ns_list); 4222 4223 return bdev; 4224 } 4225 4226 static int 4227 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4228 { 4229 struct nvme_bdev *bdev; 4230 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4231 int rc; 4232 4233 bdev = nvme_bdev_alloc(); 4234 if (bdev == NULL) { 4235 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4236 return -ENOMEM; 4237 } 4238 4239 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4240 4241 rc = 
nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4242 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4243 if (rc != 0) { 4244 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4245 nvme_bdev_free(bdev); 4246 return rc; 4247 } 4248 4249 spdk_io_device_register(bdev, 4250 bdev_nvme_create_bdev_channel_cb, 4251 bdev_nvme_destroy_bdev_channel_cb, 4252 sizeof(struct nvme_bdev_channel), 4253 bdev->disk.name); 4254 4255 nvme_ns->bdev = bdev; 4256 bdev->nsid = nvme_ns->id; 4257 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4258 4259 bdev->nbdev_ctrlr = nbdev_ctrlr; 4260 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4261 4262 rc = spdk_bdev_register(&bdev->disk); 4263 if (rc != 0) { 4264 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4265 spdk_io_device_unregister(bdev, NULL); 4266 nvme_ns->bdev = NULL; 4267 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4268 nvme_bdev_free(bdev); 4269 return rc; 4270 } 4271 4272 return 0; 4273 } 4274 4275 static bool 4276 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4277 { 4278 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4279 const struct spdk_uuid *uuid1, *uuid2; 4280 4281 nsdata1 = spdk_nvme_ns_get_data(ns1); 4282 nsdata2 = spdk_nvme_ns_get_data(ns2); 4283 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4284 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4285 4286 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4287 nsdata1->eui64 == nsdata2->eui64 && 4288 ((uuid1 == NULL && uuid2 == NULL) || 4289 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4290 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4291 } 4292 4293 static bool 4294 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4295 struct spdk_nvme_ctrlr_opts *opts) 4296 { 4297 struct nvme_probe_skip_entry *entry; 4298 4299 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4300 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4301 return false; 4302 } 4303 } 4304 4305 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4306 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4307 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4308 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4309 opts->disable_read_ana_log_page = true; 4310 4311 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4312 4313 return true; 4314 } 4315 4316 static void 4317 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4318 { 4319 struct nvme_ctrlr *nvme_ctrlr = ctx; 4320 4321 if (spdk_nvme_cpl_is_error(cpl)) { 4322 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4323 cpl->status.sct); 4324 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4325 } else if (cpl->cdw0 & 0x1) { 4326 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4327 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4328 } 4329 } 4330 4331 static void 4332 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4333 struct spdk_nvme_qpair *qpair, uint16_t cid) 4334 { 4335 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4336 union spdk_nvme_csts_register csts; 4337 int rc; 4338 4339 assert(nvme_ctrlr->ctrlr == ctrlr); 4340 4341 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4342 4343 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4344 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) 
Otherwise we 4345 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4346 * completion recursively. 4347 */ 4348 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4349 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4350 if (csts.bits.cfs) { 4351 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4352 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4353 return; 4354 } 4355 } 4356 4357 switch (g_opts.action_on_timeout) { 4358 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4359 if (qpair) { 4360 /* Don't send abort to ctrlr when ctrlr is not available. */ 4361 pthread_mutex_lock(&nvme_ctrlr->mutex); 4362 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4363 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4364 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4365 return; 4366 } 4367 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4368 4369 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4370 nvme_abort_cpl, nvme_ctrlr); 4371 if (rc == 0) { 4372 return; 4373 } 4374 4375 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4376 } 4377 4378 /* FALLTHROUGH */ 4379 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4380 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4381 break; 4382 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4383 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4384 break; 4385 default: 4386 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4387 break; 4388 } 4389 } 4390 4391 static struct nvme_ns * 4392 nvme_ns_alloc(void) 4393 { 4394 struct nvme_ns *nvme_ns; 4395 4396 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4397 if (nvme_ns == NULL) { 4398 return NULL; 4399 } 4400 4401 if (g_opts.io_path_stat) { 4402 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4403 if (nvme_ns->stat == NULL) { 4404 free(nvme_ns); 4405 return NULL; 4406 } 4407 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4408 } 4409 4410 return nvme_ns; 4411 } 4412 4413 static void 4414 nvme_ns_free(struct nvme_ns *nvme_ns) 4415 { 4416 free(nvme_ns->stat); 4417 free(nvme_ns); 4418 } 4419 4420 static void 4421 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4422 { 4423 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4424 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4425 4426 if (rc == 0) { 4427 nvme_ns->probe_ctx = NULL; 4428 pthread_mutex_lock(&nvme_ctrlr->mutex); 4429 nvme_ctrlr->ref++; 4430 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4431 } else { 4432 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4433 nvme_ns_free(nvme_ns); 4434 } 4435 4436 if (ctx) { 4437 ctx->populates_in_progress--; 4438 if (ctx->populates_in_progress == 0) { 4439 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4440 } 4441 } 4442 } 4443 4444 static void 4445 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4446 { 4447 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4448 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4449 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4450 int rc; 4451 4452 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4453 if (rc != 0) { 4454 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4455 } 4456 4457 spdk_for_each_channel_continue(i, rc); 4458 } 4459 4460 static void 4461 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4462 { 4463 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4464 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4465 struct 
nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4466 struct nvme_io_path *io_path; 4467 4468 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4469 if (io_path != NULL) { 4470 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4471 } 4472 4473 spdk_for_each_channel_continue(i, 0); 4474 } 4475 4476 static void 4477 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4478 { 4479 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4480 4481 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4482 } 4483 4484 static void 4485 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4486 { 4487 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4488 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4489 4490 if (status == 0) { 4491 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4492 } else { 4493 /* Delete the added io_paths and fail populating the namespace. */ 4494 spdk_for_each_channel(bdev, 4495 bdev_nvme_delete_io_path, 4496 nvme_ns, 4497 bdev_nvme_add_io_path_failed); 4498 } 4499 } 4500 4501 static int 4502 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4503 { 4504 struct nvme_ns *tmp_ns; 4505 const struct spdk_nvme_ns_data *nsdata; 4506 4507 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4508 if (!nsdata->nmic.can_share) { 4509 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4510 return -EINVAL; 4511 } 4512 4513 pthread_mutex_lock(&bdev->mutex); 4514 4515 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4516 assert(tmp_ns != NULL); 4517 4518 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4519 pthread_mutex_unlock(&bdev->mutex); 4520 SPDK_ERRLOG("Namespaces are not identical.\n"); 4521 return -EINVAL; 4522 } 4523 4524 bdev->ref++; 4525 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4526 nvme_ns->bdev = bdev; 4527 4528 pthread_mutex_unlock(&bdev->mutex); 4529 4530 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
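 * spdk_for_each_channel() visits every existing nvme_bdev_channel on its owning thread
 * and adds an I/O path for the new namespace; bdev_nvme_add_io_path_done() then either
 * completes the namespace population or rolls the added paths back on failure.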
*/ 4531 spdk_for_each_channel(bdev, 4532 bdev_nvme_add_io_path, 4533 nvme_ns, 4534 bdev_nvme_add_io_path_done); 4535 4536 return 0; 4537 } 4538 4539 static void 4540 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4541 { 4542 struct spdk_nvme_ns *ns; 4543 struct nvme_bdev *bdev; 4544 int rc = 0; 4545 4546 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4547 if (!ns) { 4548 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4549 rc = -EINVAL; 4550 goto done; 4551 } 4552 4553 nvme_ns->ns = ns; 4554 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4555 4556 if (nvme_ctrlr->ana_log_page != NULL) { 4557 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4558 } 4559 4560 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4561 if (bdev == NULL) { 4562 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4563 } else { 4564 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4565 if (rc == 0) { 4566 return; 4567 } 4568 } 4569 done: 4570 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4571 } 4572 4573 static void 4574 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4575 { 4576 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4577 4578 assert(nvme_ctrlr != NULL); 4579 4580 pthread_mutex_lock(&nvme_ctrlr->mutex); 4581 4582 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4583 4584 if (nvme_ns->bdev != NULL) { 4585 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4586 return; 4587 } 4588 4589 nvme_ns_free(nvme_ns); 4590 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4591 4592 nvme_ctrlr_release(nvme_ctrlr); 4593 } 4594 4595 static void 4596 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4597 { 4598 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4599 4600 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4601 } 4602 4603 static void 4604 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4605 { 4606 struct nvme_bdev *bdev; 4607 4608 spdk_poller_unregister(&nvme_ns->anatt_timer); 4609 4610 bdev = nvme_ns->bdev; 4611 if (bdev != NULL) { 4612 pthread_mutex_lock(&bdev->mutex); 4613 4614 assert(bdev->ref > 0); 4615 bdev->ref--; 4616 if (bdev->ref == 0) { 4617 pthread_mutex_unlock(&bdev->mutex); 4618 4619 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4620 } else { 4621 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4622 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4623 * and clear nvme_ns->bdev here. 4624 */ 4625 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4626 nvme_ns->bdev = NULL; 4627 4628 pthread_mutex_unlock(&bdev->mutex); 4629 4630 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4631 * we call depopulate_namespace_done() to avoid use-after-free. 4632 */ 4633 spdk_for_each_channel(bdev, 4634 bdev_nvme_delete_io_path, 4635 nvme_ns, 4636 bdev_nvme_delete_io_path_done); 4637 return; 4638 } 4639 } 4640 4641 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4642 } 4643 4644 static void 4645 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4646 struct nvme_async_probe_ctx *ctx) 4647 { 4648 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4649 struct nvme_ns *nvme_ns, *next; 4650 struct spdk_nvme_ns *ns; 4651 struct nvme_bdev *bdev; 4652 uint32_t nsid; 4653 int rc; 4654 uint64_t num_sectors; 4655 4656 if (ctx) { 4657 /* Initialize this count to 1 to handle the populate functions 4658 * calling nvme_ctrlr_populate_namespace_done() immediately. 
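 * The matching decrement at the end of this function releases this initial count, so
 * nvme_ctrlr_populate_namespaces_done() runs exactly once, after every namespace's
 * populate has finished.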
4659 */ 4660 ctx->populates_in_progress = 1; 4661 } 4662 4663 /* First loop over our existing namespaces and see if they have been 4664 * removed. */ 4665 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4666 while (nvme_ns != NULL) { 4667 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4668 4669 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4670 /* NS is still there or added again. Its attributes may have changed. */ 4671 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4672 if (nvme_ns->ns != ns) { 4673 assert(nvme_ns->ns == NULL); 4674 nvme_ns->ns = ns; 4675 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4676 } 4677 4678 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4679 bdev = nvme_ns->bdev; 4680 assert(bdev != NULL); 4681 if (bdev->disk.blockcnt != num_sectors) { 4682 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4683 nvme_ns->id, 4684 bdev->disk.name, 4685 bdev->disk.blockcnt, 4686 num_sectors); 4687 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4688 if (rc != 0) { 4689 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4690 bdev->disk.name, rc); 4691 } 4692 } 4693 } else { 4694 /* Namespace was removed */ 4695 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4696 } 4697 4698 nvme_ns = next; 4699 } 4700 4701 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4702 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4703 while (nsid != 0) { 4704 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4705 4706 if (nvme_ns == NULL) { 4707 /* Found a new one */ 4708 nvme_ns = nvme_ns_alloc(); 4709 if (nvme_ns == NULL) { 4710 SPDK_ERRLOG("Failed to allocate namespace\n"); 4711 /* This just fails to attach the namespace. It may work on a future attempt. */ 4712 continue; 4713 } 4714 4715 nvme_ns->id = nsid; 4716 nvme_ns->ctrlr = nvme_ctrlr; 4717 4718 nvme_ns->bdev = NULL; 4719 4720 if (ctx) { 4721 ctx->populates_in_progress++; 4722 } 4723 nvme_ns->probe_ctx = ctx; 4724 4725 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4726 4727 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4728 } 4729 4730 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4731 } 4732 4733 if (ctx) { 4734 /* Decrement this count now that the loop is over to account 4735 * for the one we started with. If the count is then 0, we 4736 * know any populate_namespace functions completed immediately, 4737 * so we'll kick the callback here. 
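 * Otherwise, the last asynchronous nvme_ctrlr_populate_namespace_done() call that
 * drops the count to zero will invoke it instead.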
4738 */ 4739 ctx->populates_in_progress--; 4740 if (ctx->populates_in_progress == 0) { 4741 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4742 } 4743 } 4744 4745 } 4746 4747 static void 4748 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4749 { 4750 struct nvme_ns *nvme_ns, *tmp; 4751 4752 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4753 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4754 } 4755 } 4756 4757 static uint32_t 4758 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4759 { 4760 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4761 const struct spdk_nvme_ctrlr_data *cdata; 4762 uint32_t nsid, ns_count = 0; 4763 4764 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4765 4766 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4767 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4768 ns_count++; 4769 } 4770 4771 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4772 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4773 sizeof(uint32_t); 4774 } 4775 4776 static int 4777 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4778 void *cb_arg) 4779 { 4780 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4781 struct nvme_ns *nvme_ns; 4782 uint32_t i, nsid; 4783 4784 for (i = 0; i < desc->num_of_nsid; i++) { 4785 nsid = desc->nsid[i]; 4786 if (nsid == 0) { 4787 continue; 4788 } 4789 4790 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4791 4792 assert(nvme_ns != NULL); 4793 if (nvme_ns == NULL) { 4794 /* Target told us that an inactive namespace had an ANA change */ 4795 continue; 4796 } 4797 4798 _nvme_ns_set_ana_state(nvme_ns, desc); 4799 } 4800 4801 return 0; 4802 } 4803 4804 static void 4805 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4806 { 4807 struct nvme_ns *nvme_ns; 4808 4809 spdk_free(nvme_ctrlr->ana_log_page); 4810 nvme_ctrlr->ana_log_page = NULL; 4811 4812 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4813 nvme_ns != NULL; 4814 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4815 nvme_ns->ana_state_updating = false; 4816 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4817 } 4818 } 4819 4820 static void 4821 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4822 { 4823 struct nvme_ctrlr *nvme_ctrlr = ctx; 4824 4825 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4826 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4827 nvme_ctrlr); 4828 } else { 4829 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4830 } 4831 4832 pthread_mutex_lock(&nvme_ctrlr->mutex); 4833 4834 assert(nvme_ctrlr->ana_log_page_updating == true); 4835 nvme_ctrlr->ana_log_page_updating = false; 4836 4837 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4838 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4839 4840 nvme_ctrlr_unregister(nvme_ctrlr); 4841 } else { 4842 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4843 4844 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4845 } 4846 } 4847 4848 static int 4849 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4850 { 4851 uint32_t ana_log_page_size; 4852 int rc; 4853 4854 if (nvme_ctrlr->ana_log_page == NULL) { 4855 return -EINVAL; 4856 } 4857 4858 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4859 4860 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4861 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4862 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4863 
return -EINVAL; 4864 } 4865 4866 pthread_mutex_lock(&nvme_ctrlr->mutex); 4867 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4868 nvme_ctrlr->ana_log_page_updating) { 4869 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4870 return -EBUSY; 4871 } 4872 4873 nvme_ctrlr->ana_log_page_updating = true; 4874 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4875 4876 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4877 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4878 SPDK_NVME_GLOBAL_NS_TAG, 4879 nvme_ctrlr->ana_log_page, 4880 ana_log_page_size, 0, 4881 nvme_ctrlr_read_ana_log_page_done, 4882 nvme_ctrlr); 4883 if (rc != 0) { 4884 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4885 } 4886 4887 return rc; 4888 } 4889 4890 static void 4891 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4892 { 4893 } 4894 4895 struct bdev_nvme_set_preferred_path_ctx { 4896 struct spdk_bdev_desc *desc; 4897 struct nvme_ns *nvme_ns; 4898 bdev_nvme_set_preferred_path_cb cb_fn; 4899 void *cb_arg; 4900 }; 4901 4902 static void 4903 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4904 { 4905 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4906 4907 assert(ctx != NULL); 4908 assert(ctx->desc != NULL); 4909 assert(ctx->cb_fn != NULL); 4910 4911 spdk_bdev_close(ctx->desc); 4912 4913 ctx->cb_fn(ctx->cb_arg, status); 4914 4915 free(ctx); 4916 } 4917 4918 static void 4919 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4920 { 4921 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4922 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4923 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4924 struct nvme_io_path *io_path, *prev; 4925 4926 prev = NULL; 4927 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4928 if (io_path->nvme_ns == ctx->nvme_ns) { 4929 break; 4930 } 4931 prev = io_path; 4932 } 4933 4934 if (io_path != NULL) { 4935 if (prev != NULL) { 4936 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4937 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4938 } 4939 4940 /* We can set io_path to nbdev_ch->current_io_path directly here. 4941 * However, it needs to be conditional. To simplify the code, 4942 * just clear nbdev_ch->current_io_path and let find_io_path() 4943 * fill it. 4944 * 4945 * Automatic failback may be disabled. Hence even if the io_path is 4946 * already at the head, clear nbdev_ch->current_io_path. 4947 */ 4948 bdev_nvme_clear_current_io_path(nbdev_ch); 4949 } 4950 4951 spdk_for_each_channel_continue(i, 0); 4952 } 4953 4954 static struct nvme_ns * 4955 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4956 { 4957 struct nvme_ns *nvme_ns, *prev; 4958 const struct spdk_nvme_ctrlr_data *cdata; 4959 4960 prev = NULL; 4961 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4962 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4963 4964 if (cdata->cntlid == cntlid) { 4965 break; 4966 } 4967 prev = nvme_ns; 4968 } 4969 4970 if (nvme_ns != NULL && prev != NULL) { 4971 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4972 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4973 } 4974 4975 return nvme_ns; 4976 } 4977 4978 /* This function supports only multipath mode. There is only a single I/O path 4979 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4980 * head of the I/O path list for each NVMe bdev channel. 
4981 * 4982 * NVMe bdev channel may be acquired after completing this function. move the 4983 * matched namespace to the head of the namespace list for the NVMe bdev too. 4984 */ 4985 void 4986 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4987 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4988 { 4989 struct bdev_nvme_set_preferred_path_ctx *ctx; 4990 struct spdk_bdev *bdev; 4991 struct nvme_bdev *nbdev; 4992 int rc = 0; 4993 4994 assert(cb_fn != NULL); 4995 4996 ctx = calloc(1, sizeof(*ctx)); 4997 if (ctx == NULL) { 4998 SPDK_ERRLOG("Failed to alloc context.\n"); 4999 rc = -ENOMEM; 5000 goto err_alloc; 5001 } 5002 5003 ctx->cb_fn = cb_fn; 5004 ctx->cb_arg = cb_arg; 5005 5006 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5007 if (rc != 0) { 5008 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5009 goto err_open; 5010 } 5011 5012 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5013 5014 if (bdev->module != &nvme_if) { 5015 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5016 rc = -ENODEV; 5017 goto err_bdev; 5018 } 5019 5020 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5021 5022 pthread_mutex_lock(&nbdev->mutex); 5023 5024 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5025 if (ctx->nvme_ns == NULL) { 5026 pthread_mutex_unlock(&nbdev->mutex); 5027 5028 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5029 rc = -ENODEV; 5030 goto err_bdev; 5031 } 5032 5033 pthread_mutex_unlock(&nbdev->mutex); 5034 5035 spdk_for_each_channel(nbdev, 5036 _bdev_nvme_set_preferred_path, 5037 ctx, 5038 bdev_nvme_set_preferred_path_done); 5039 return; 5040 5041 err_bdev: 5042 spdk_bdev_close(ctx->desc); 5043 err_open: 5044 free(ctx); 5045 err_alloc: 5046 cb_fn(cb_arg, rc); 5047 } 5048 5049 struct bdev_nvme_set_multipath_policy_ctx { 5050 struct spdk_bdev_desc *desc; 5051 bdev_nvme_set_multipath_policy_cb cb_fn; 5052 void *cb_arg; 5053 }; 5054 5055 static void 5056 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5057 { 5058 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5059 5060 assert(ctx != NULL); 5061 assert(ctx->desc != NULL); 5062 assert(ctx->cb_fn != NULL); 5063 5064 spdk_bdev_close(ctx->desc); 5065 5066 ctx->cb_fn(ctx->cb_arg, status); 5067 5068 free(ctx); 5069 } 5070 5071 static void 5072 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5073 { 5074 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5075 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5076 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5077 5078 nbdev_ch->mp_policy = nbdev->mp_policy; 5079 nbdev_ch->mp_selector = nbdev->mp_selector; 5080 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5081 bdev_nvme_clear_current_io_path(nbdev_ch); 5082 5083 spdk_for_each_channel_continue(i, 0); 5084 } 5085 5086 void 5087 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5088 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5089 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5090 { 5091 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5092 struct spdk_bdev *bdev; 5093 struct nvme_bdev *nbdev; 5094 int rc; 5095 5096 assert(cb_fn != NULL); 5097 5098 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5099 if (rr_min_io == UINT32_MAX) { 5100 rr_min_io = 1; 5101 } else if (rr_min_io == 0) { 5102 rc = -EINVAL; 
5103 goto exit; 5104 } 5105 } else if (rr_min_io != UINT32_MAX) { 5106 rc = -EINVAL; 5107 goto exit; 5108 } 5109 5110 ctx = calloc(1, sizeof(*ctx)); 5111 if (ctx == NULL) { 5112 SPDK_ERRLOG("Failed to alloc context.\n"); 5113 rc = -ENOMEM; 5114 goto exit; 5115 } 5116 5117 ctx->cb_fn = cb_fn; 5118 ctx->cb_arg = cb_arg; 5119 5120 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5121 if (rc != 0) { 5122 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5123 rc = -ENODEV; 5124 goto err_open; 5125 } 5126 5127 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5128 if (bdev->module != &nvme_if) { 5129 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5130 rc = -ENODEV; 5131 goto err_module; 5132 } 5133 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5134 5135 pthread_mutex_lock(&nbdev->mutex); 5136 nbdev->mp_policy = policy; 5137 nbdev->mp_selector = selector; 5138 nbdev->rr_min_io = rr_min_io; 5139 pthread_mutex_unlock(&nbdev->mutex); 5140 5141 spdk_for_each_channel(nbdev, 5142 _bdev_nvme_set_multipath_policy, 5143 ctx, 5144 bdev_nvme_set_multipath_policy_done); 5145 return; 5146 5147 err_module: 5148 spdk_bdev_close(ctx->desc); 5149 err_open: 5150 free(ctx); 5151 exit: 5152 cb_fn(cb_arg, rc); 5153 } 5154 5155 static void 5156 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5157 { 5158 struct nvme_ctrlr *nvme_ctrlr = arg; 5159 union spdk_nvme_async_event_completion event; 5160 5161 if (spdk_nvme_cpl_is_error(cpl)) { 5162 SPDK_WARNLOG("AER request execute failed\n"); 5163 return; 5164 } 5165 5166 event.raw = cpl->cdw0; 5167 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5168 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5169 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5170 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5171 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5172 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5173 } 5174 } 5175 5176 static void 5177 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5178 { 5179 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5180 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5181 free(ctx); 5182 } 5183 5184 static void 5185 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5186 { 5187 if (ctx->cb_fn) { 5188 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5189 } 5190 5191 ctx->namespaces_populated = true; 5192 if (ctx->probe_done) { 5193 /* The probe was already completed, so we need to free the context 5194 * here. This can happen for cases like OCSSD, where we need to 5195 * send additional commands to the SSD after attach. 
5196 */ 5197 free_nvme_async_probe_ctx(ctx); 5198 } 5199 } 5200 5201 static void 5202 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5203 struct nvme_async_probe_ctx *ctx) 5204 { 5205 spdk_io_device_register(nvme_ctrlr, 5206 bdev_nvme_create_ctrlr_channel_cb, 5207 bdev_nvme_destroy_ctrlr_channel_cb, 5208 sizeof(struct nvme_ctrlr_channel), 5209 nvme_ctrlr->nbdev_ctrlr->name); 5210 5211 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5212 } 5213 5214 static void 5215 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5216 { 5217 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5218 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5219 5220 nvme_ctrlr->probe_ctx = NULL; 5221 5222 if (spdk_nvme_cpl_is_error(cpl)) { 5223 nvme_ctrlr_delete(nvme_ctrlr); 5224 5225 if (ctx != NULL) { 5226 ctx->reported_bdevs = 0; 5227 populate_namespaces_cb(ctx, -1); 5228 } 5229 return; 5230 } 5231 5232 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5233 } 5234 5235 static int 5236 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5237 struct nvme_async_probe_ctx *ctx) 5238 { 5239 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5240 const struct spdk_nvme_ctrlr_data *cdata; 5241 uint32_t ana_log_page_size; 5242 5243 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5244 5245 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5246 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5247 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5248 sizeof(uint32_t); 5249 5250 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5251 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5252 if (nvme_ctrlr->ana_log_page == NULL) { 5253 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5254 return -ENXIO; 5255 } 5256 5257 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5258 * Hence copy each descriptor to a temporary area when parsing it. 5259 * 5260 * Allocate a buffer whose size is as large as ANA log page buffer because 5261 * we do not know the size of a descriptor until actually reading it. 5262 */ 5263 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5264 if (nvme_ctrlr->copied_ana_desc == NULL) { 5265 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5266 return -ENOMEM; 5267 } 5268 5269 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5270 5271 nvme_ctrlr->probe_ctx = ctx; 5272 5273 /* Then, set the read size only to include the current active namespaces. */ 5274 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5275 5276 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5277 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5278 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5279 return -EINVAL; 5280 } 5281 5282 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5283 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5284 SPDK_NVME_GLOBAL_NS_TAG, 5285 nvme_ctrlr->ana_log_page, 5286 ana_log_page_size, 0, 5287 nvme_ctrlr_init_ana_log_page_done, 5288 nvme_ctrlr); 5289 } 5290 5291 /* hostnqn and subnqn were already verified before attaching a controller. 5292 * Hence check only the multipath capability and cntlid here. 
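 * Every controller grouped under the same nvme_bdev_ctrlr must report
 * CMIC.multi_ctrlr and carry a CNTLID distinct from the others.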
5293 */ 5294 static bool 5295 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5296 { 5297 struct nvme_ctrlr *tmp; 5298 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5299 5300 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5301 5302 if (!cdata->cmic.multi_ctrlr) { 5303 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5304 return false; 5305 } 5306 5307 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5308 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5309 5310 if (!tmp_cdata->cmic.multi_ctrlr) { 5311 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5312 return false; 5313 } 5314 if (cdata->cntlid == tmp_cdata->cntlid) { 5315 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5316 return false; 5317 } 5318 } 5319 5320 return true; 5321 } 5322 5323 static int 5324 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5325 { 5326 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5327 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5328 int rc = 0; 5329 5330 pthread_mutex_lock(&g_bdev_nvme_mutex); 5331 5332 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5333 if (nbdev_ctrlr != NULL) { 5334 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5335 rc = -EINVAL; 5336 goto exit; 5337 } 5338 } else { 5339 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5340 if (nbdev_ctrlr == NULL) { 5341 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5342 rc = -ENOMEM; 5343 goto exit; 5344 } 5345 nbdev_ctrlr->name = strdup(name); 5346 if (nbdev_ctrlr->name == NULL) { 5347 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5348 free(nbdev_ctrlr); 5349 goto exit; 5350 } 5351 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5352 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5353 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5354 } 5355 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5356 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5357 exit: 5358 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5359 return rc; 5360 } 5361 5362 static int 5363 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5364 const char *name, 5365 const struct spdk_nvme_transport_id *trid, 5366 struct nvme_async_probe_ctx *ctx) 5367 { 5368 struct nvme_ctrlr *nvme_ctrlr; 5369 struct nvme_path_id *path_id; 5370 const struct spdk_nvme_ctrlr_data *cdata; 5371 int rc; 5372 5373 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5374 if (nvme_ctrlr == NULL) { 5375 SPDK_ERRLOG("Failed to allocate device struct\n"); 5376 return -ENOMEM; 5377 } 5378 5379 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5380 if (rc != 0) { 5381 free(nvme_ctrlr); 5382 return rc; 5383 } 5384 5385 TAILQ_INIT(&nvme_ctrlr->trids); 5386 RB_INIT(&nvme_ctrlr->namespaces); 5387 5388 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5389 if (ctx != NULL) { 5390 if (ctx->drv_opts.tls_psk != NULL) { 5391 nvme_ctrlr->psk = spdk_keyring_get_key( 5392 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5393 if (nvme_ctrlr->psk == NULL) { 5394 /* Could only happen if the key was removed in the meantime */ 5395 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5396 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5397 rc = -ENOKEY; 5398 goto err; 5399 } 5400 } 5401 5402 if (ctx->drv_opts.dhchap_key != NULL) { 5403 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5404 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5405 if (nvme_ctrlr->dhchap_key == NULL) { 5406 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5407 
spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5408 rc = -ENOKEY; 5409 goto err; 5410 } 5411 } 5412 } 5413 5414 path_id = calloc(1, sizeof(*path_id)); 5415 if (path_id == NULL) { 5416 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5417 rc = -ENOMEM; 5418 goto err; 5419 } 5420 5421 path_id->trid = *trid; 5422 if (ctx != NULL) { 5423 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5424 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5425 } 5426 nvme_ctrlr->active_path_id = path_id; 5427 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5428 5429 nvme_ctrlr->thread = spdk_get_thread(); 5430 nvme_ctrlr->ctrlr = ctrlr; 5431 nvme_ctrlr->ref = 1; 5432 5433 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5434 SPDK_ERRLOG("OCSSDs are not supported"); 5435 rc = -ENOTSUP; 5436 goto err; 5437 } 5438 5439 if (ctx != NULL) { 5440 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5441 } else { 5442 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5443 } 5444 5445 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5446 g_opts.nvme_adminq_poll_period_us); 5447 5448 if (g_opts.timeout_us > 0) { 5449 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5450 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5451 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5452 g_opts.timeout_us : g_opts.timeout_admin_us; 5453 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5454 adm_timeout_us, timeout_cb, nvme_ctrlr); 5455 } 5456 5457 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5458 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5459 5460 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5461 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5462 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5463 } 5464 5465 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5466 if (rc != 0) { 5467 goto err; 5468 } 5469 5470 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5471 5472 if (cdata->cmic.ana_reporting) { 5473 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5474 if (rc == 0) { 5475 return 0; 5476 } 5477 } else { 5478 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5479 return 0; 5480 } 5481 5482 err: 5483 nvme_ctrlr_delete(nvme_ctrlr); 5484 return rc; 5485 } 5486 5487 void 5488 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5489 { 5490 opts->prchk_flags = 0; 5491 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5492 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5493 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5494 } 5495 5496 static void 5497 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5498 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5499 { 5500 char *name; 5501 5502 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5503 if (!name) { 5504 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5505 return; 5506 } 5507 5508 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5509 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5510 } else { 5511 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5512 } 5513 5514 free(name); 5515 } 5516 5517 static void 5518 _nvme_ctrlr_destruct(void *ctx) 5519 { 5520 struct nvme_ctrlr *nvme_ctrlr = ctx; 5521 5522 
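/* Remove every namespace (and its bdev) exposed by this controller, then drop the
 * initial reference taken at controller creation; the controller is torn down once
 * the last reference is released.
 */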
nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5523 nvme_ctrlr_release(nvme_ctrlr); 5524 } 5525 5526 static int 5527 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5528 { 5529 struct nvme_probe_skip_entry *entry; 5530 5531 /* The controller's destruction was already started */ 5532 if (nvme_ctrlr->destruct) { 5533 return -EALREADY; 5534 } 5535 5536 if (!hotplug && 5537 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5538 entry = calloc(1, sizeof(*entry)); 5539 if (!entry) { 5540 return -ENOMEM; 5541 } 5542 entry->trid = nvme_ctrlr->active_path_id->trid; 5543 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5544 } 5545 5546 nvme_ctrlr->destruct = true; 5547 return 0; 5548 } 5549 5550 static int 5551 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5552 { 5553 int rc; 5554 5555 pthread_mutex_lock(&nvme_ctrlr->mutex); 5556 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5557 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5558 5559 if (rc == 0) { 5560 _nvme_ctrlr_destruct(nvme_ctrlr); 5561 } else if (rc == -EALREADY) { 5562 rc = 0; 5563 } 5564 5565 return rc; 5566 } 5567 5568 static void 5569 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5570 { 5571 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5572 5573 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5574 } 5575 5576 static int 5577 bdev_nvme_hotplug_probe(void *arg) 5578 { 5579 if (g_hotplug_probe_ctx == NULL) { 5580 spdk_poller_unregister(&g_hotplug_probe_poller); 5581 return SPDK_POLLER_IDLE; 5582 } 5583 5584 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5585 g_hotplug_probe_ctx = NULL; 5586 spdk_poller_unregister(&g_hotplug_probe_poller); 5587 } 5588 5589 return SPDK_POLLER_BUSY; 5590 } 5591 5592 static int 5593 bdev_nvme_hotplug(void *arg) 5594 { 5595 struct spdk_nvme_transport_id trid_pcie; 5596 5597 if (g_hotplug_probe_ctx) { 5598 return SPDK_POLLER_BUSY; 5599 } 5600 5601 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5602 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5603 5604 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5605 hotplug_probe_cb, attach_cb, NULL); 5606 5607 if (g_hotplug_probe_ctx) { 5608 assert(g_hotplug_probe_poller == NULL); 5609 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5610 } 5611 5612 return SPDK_POLLER_BUSY; 5613 } 5614 5615 void 5616 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5617 { 5618 *opts = g_opts; 5619 } 5620 5621 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5622 uint32_t reconnect_delay_sec, 5623 uint32_t fast_io_fail_timeout_sec); 5624 5625 static int 5626 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5627 { 5628 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5629 /* Can't set timeout_admin_us without also setting timeout_us */ 5630 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5631 return -EINVAL; 5632 } 5633 5634 if (opts->bdev_retry_count < -1) { 5635 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5636 return -EINVAL; 5637 } 5638 5639 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5640 opts->reconnect_delay_sec, 5641 opts->fast_io_fail_timeout_sec)) { 5642 return -EINVAL; 5643 } 5644 5645 return 0; 5646 } 5647 5648 int 5649 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5650 { 5651 int ret; 5652 5653 ret = 
bdev_nvme_validate_opts(opts); 5654 if (ret) { 5655 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5656 return ret; 5657 } 5658 5659 if (g_bdev_nvme_init_thread != NULL) { 5660 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5661 return -EPERM; 5662 } 5663 } 5664 5665 if (opts->rdma_srq_size != 0 || 5666 opts->rdma_max_cq_size != 0 || 5667 opts->rdma_cm_event_timeout_ms != 0) { 5668 struct spdk_nvme_transport_opts drv_opts; 5669 5670 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5671 if (opts->rdma_srq_size != 0) { 5672 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5673 } 5674 if (opts->rdma_max_cq_size != 0) { 5675 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5676 } 5677 if (opts->rdma_cm_event_timeout_ms != 0) { 5678 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5679 } 5680 5681 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5682 if (ret) { 5683 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5684 return ret; 5685 } 5686 } 5687 5688 g_opts = *opts; 5689 5690 return 0; 5691 } 5692 5693 struct set_nvme_hotplug_ctx { 5694 uint64_t period_us; 5695 bool enabled; 5696 spdk_msg_fn fn; 5697 void *fn_ctx; 5698 }; 5699 5700 static void 5701 set_nvme_hotplug_period_cb(void *_ctx) 5702 { 5703 struct set_nvme_hotplug_ctx *ctx = _ctx; 5704 5705 spdk_poller_unregister(&g_hotplug_poller); 5706 if (ctx->enabled) { 5707 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5708 } 5709 5710 g_nvme_hotplug_poll_period_us = ctx->period_us; 5711 g_nvme_hotplug_enabled = ctx->enabled; 5712 if (ctx->fn) { 5713 ctx->fn(ctx->fn_ctx); 5714 } 5715 5716 free(ctx); 5717 } 5718 5719 int 5720 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5721 { 5722 struct set_nvme_hotplug_ctx *ctx; 5723 5724 if (enabled == true && !spdk_process_is_primary()) { 5725 return -EPERM; 5726 } 5727 5728 ctx = calloc(1, sizeof(*ctx)); 5729 if (ctx == NULL) { 5730 return -ENOMEM; 5731 } 5732 5733 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5734 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5735 ctx->enabled = enabled; 5736 ctx->fn = cb; 5737 ctx->fn_ctx = cb_ctx; 5738 5739 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5740 return 0; 5741 } 5742 5743 static void 5744 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5745 struct nvme_async_probe_ctx *ctx) 5746 { 5747 struct nvme_ns *nvme_ns; 5748 struct nvme_bdev *nvme_bdev; 5749 size_t j; 5750 5751 assert(nvme_ctrlr != NULL); 5752 5753 if (ctx->names == NULL) { 5754 ctx->reported_bdevs = 0; 5755 populate_namespaces_cb(ctx, 0); 5756 return; 5757 } 5758 5759 /* 5760 * Report the new bdevs that were created in this call. 5761 * There can be more than one bdev per NVMe controller. 5762 */ 5763 j = 0; 5764 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5765 while (nvme_ns != NULL) { 5766 nvme_bdev = nvme_ns->bdev; 5767 if (j < ctx->max_bdevs) { 5768 ctx->names[j] = nvme_bdev->disk.name; 5769 j++; 5770 } else { 5771 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. 
Unable to return all names of created bdevs\n", 5772 ctx->max_bdevs); 5773 ctx->reported_bdevs = 0; 5774 populate_namespaces_cb(ctx, -ERANGE); 5775 return; 5776 } 5777 5778 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5779 } 5780 5781 ctx->reported_bdevs = j; 5782 populate_namespaces_cb(ctx, 0); 5783 } 5784 5785 static int 5786 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5787 struct spdk_nvme_ctrlr *new_ctrlr, 5788 struct spdk_nvme_transport_id *trid) 5789 { 5790 struct nvme_path_id *tmp_trid; 5791 5792 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5793 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5794 return -ENOTSUP; 5795 } 5796 5797 /* Currently we only support failover to the same transport type. */ 5798 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5799 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5800 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5801 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5802 return -EINVAL; 5803 } 5804 5805 5806 /* Currently we only support failover to the same NQN. */ 5807 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5808 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5809 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5810 return -EINVAL; 5811 } 5812 5813 /* Skip all the other checks if we've already registered this path. */ 5814 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5815 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5816 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5817 trid->subnqn); 5818 return -EEXIST; 5819 } 5820 } 5821 5822 return 0; 5823 } 5824 5825 static int 5826 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5827 struct spdk_nvme_ctrlr *new_ctrlr) 5828 { 5829 struct nvme_ns *nvme_ns; 5830 struct spdk_nvme_ns *new_ns; 5831 5832 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5833 while (nvme_ns != NULL) { 5834 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5835 assert(new_ns != NULL); 5836 5837 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5838 return -EINVAL; 5839 } 5840 5841 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5842 } 5843 5844 return 0; 5845 } 5846 5847 static int 5848 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5849 struct spdk_nvme_transport_id *trid) 5850 { 5851 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5852 5853 new_trid = calloc(1, sizeof(*new_trid)); 5854 if (new_trid == NULL) { 5855 return -ENOMEM; 5856 } 5857 new_trid->trid = *trid; 5858 5859 active_id = nvme_ctrlr->active_path_id; 5860 assert(active_id != NULL); 5861 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5862 5863 /* Skip the active trid not to replace it until it is failed. */ 5864 tmp_trid = TAILQ_NEXT(active_id, link); 5865 if (tmp_trid == NULL) { 5866 goto add_tail; 5867 } 5868 5869 /* It means the trid is faled if its last failed time is non-zero. 5870 * Insert the new alternate trid before any failed trid. 
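 * That way, when failover walks the trid list in order, healthy alternate paths are
 * tried before paths that have already failed.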
5871 */ 5872 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5873 if (tmp_trid->last_failed_tsc != 0) { 5874 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5875 return 0; 5876 } 5877 } 5878 5879 add_tail: 5880 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5881 return 0; 5882 } 5883 5884 /* This is the case that a secondary path is added to an existing 5885 * nvme_ctrlr for failover. After checking if it can access the same 5886 * namespaces as the primary path, it is disconnected until failover occurs. 5887 */ 5888 static int 5889 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5890 struct spdk_nvme_ctrlr *new_ctrlr, 5891 struct spdk_nvme_transport_id *trid) 5892 { 5893 int rc; 5894 5895 assert(nvme_ctrlr != NULL); 5896 5897 pthread_mutex_lock(&nvme_ctrlr->mutex); 5898 5899 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5900 if (rc != 0) { 5901 goto exit; 5902 } 5903 5904 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5905 if (rc != 0) { 5906 goto exit; 5907 } 5908 5909 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5910 5911 exit: 5912 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5913 5914 spdk_nvme_detach(new_ctrlr); 5915 5916 return rc; 5917 } 5918 5919 static void 5920 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5921 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5922 { 5923 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5924 struct nvme_async_probe_ctx *ctx; 5925 int rc; 5926 5927 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5928 ctx->ctrlr_attached = true; 5929 5930 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5931 if (rc != 0) { 5932 ctx->reported_bdevs = 0; 5933 populate_namespaces_cb(ctx, rc); 5934 } 5935 } 5936 5937 static void 5938 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5939 struct spdk_nvme_ctrlr *ctrlr, 5940 const struct spdk_nvme_ctrlr_opts *opts) 5941 { 5942 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5943 struct nvme_ctrlr *nvme_ctrlr; 5944 struct nvme_async_probe_ctx *ctx; 5945 int rc; 5946 5947 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5948 ctx->ctrlr_attached = true; 5949 5950 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5951 if (nvme_ctrlr) { 5952 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5953 } else { 5954 rc = -ENODEV; 5955 } 5956 5957 ctx->reported_bdevs = 0; 5958 populate_namespaces_cb(ctx, rc); 5959 } 5960 5961 static int 5962 bdev_nvme_async_poll(void *arg) 5963 { 5964 struct nvme_async_probe_ctx *ctx = arg; 5965 int rc; 5966 5967 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5968 if (spdk_unlikely(rc != -EAGAIN)) { 5969 ctx->probe_done = true; 5970 spdk_poller_unregister(&ctx->poller); 5971 if (!ctx->ctrlr_attached) { 5972 /* The probe is done, but no controller was attached. 5973 * That means we had a failure, so report -EIO back to 5974 * the caller (usually the RPC). populate_namespaces_cb() 5975 * will take care of freeing the nvme_async_probe_ctx. 5976 */ 5977 ctx->reported_bdevs = 0; 5978 populate_namespaces_cb(ctx, -EIO); 5979 } else if (ctx->namespaces_populated) { 5980 /* The namespaces for the attached controller were all 5981 * populated and the response was already sent to the 5982 * caller (usually the RPC). So free the context here. 
	 */
			free_nvme_async_probe_ctx(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

static int
bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz)
{
	FILE *psk_file;
	struct stat statbuf;
	int rc;
#define TCP_PSK_INVALID_PERMISSIONS 0177

	if (stat(fname, &statbuf) != 0) {
		SPDK_ERRLOG("Could not read permissions for PSK file\n");
		return -EACCES;
	}

	if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) {
		SPDK_ERRLOG("Incorrect permissions for PSK file\n");
		return -EPERM;
	}
	if ((size_t)statbuf.st_size >= bufsz) {
		SPDK_ERRLOG("Invalid PSK: too long\n");
		return -EINVAL;
	}
	psk_file = fopen(fname, "r");
	if (psk_file == NULL) {
		SPDK_ERRLOG("Could not open PSK file\n");
		return -EINVAL;
	}

	memset(buf, 0, bufsz);
	rc = fread(buf, 1, statbuf.st_size, psk_file);
	if (rc != statbuf.st_size) {
		SPDK_ERRLOG("Failed to read PSK\n");
		fclose(psk_file);
		return -EINVAL;
	}

	fclose(psk_file);
	return 0;
}

int
bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		 const char *base_name,
		 const char **names,
		 uint32_t count,
		 spdk_bdev_create_nvme_fn cb_fn,
		 void *cb_ctx,
		 struct spdk_nvme_ctrlr_opts *drv_opts,
		 struct nvme_ctrlr_opts *bdev_opts,
		 bool multipath)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	int rc, len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
6089 */ 6090 if (nvme_ctrlr_get(trid) != NULL) { 6091 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6092 return -EEXIST; 6093 } 6094 6095 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6096 6097 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6098 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6099 return -EINVAL; 6100 } 6101 6102 if (bdev_opts != NULL && 6103 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6104 bdev_opts->reconnect_delay_sec, 6105 bdev_opts->fast_io_fail_timeout_sec)) { 6106 return -EINVAL; 6107 } 6108 6109 ctx = calloc(1, sizeof(*ctx)); 6110 if (!ctx) { 6111 return -ENOMEM; 6112 } 6113 ctx->base_name = base_name; 6114 ctx->names = names; 6115 ctx->max_bdevs = count; 6116 ctx->cb_fn = cb_fn; 6117 ctx->cb_ctx = cb_ctx; 6118 ctx->trid = *trid; 6119 6120 if (bdev_opts) { 6121 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6122 } else { 6123 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6124 } 6125 6126 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6127 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6128 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6129 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6130 free(entry); 6131 break; 6132 } 6133 } 6134 } 6135 6136 if (drv_opts) { 6137 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6138 } else { 6139 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6140 } 6141 6142 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6143 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6144 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6145 ctx->drv_opts.disable_read_ana_log_page = true; 6146 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6147 6148 if (ctx->bdev_opts.psk[0] != '\0') { 6149 /* Try to use the keyring first */ 6150 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6151 if (ctx->drv_opts.tls_psk == NULL) { 6152 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6153 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6154 if (rc != 0) { 6155 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6156 free_nvme_async_probe_ctx(ctx); 6157 return rc; 6158 } 6159 } 6160 } 6161 6162 if (ctx->bdev_opts.dhchap_key != NULL) { 6163 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6164 if (ctx->drv_opts.dhchap_key == NULL) { 6165 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6166 ctx->bdev_opts.dhchap_key); 6167 free_nvme_async_probe_ctx(ctx); 6168 return -ENOKEY; 6169 } 6170 6171 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6172 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6173 } 6174 6175 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6176 attach_cb = connect_attach_cb; 6177 } else { 6178 attach_cb = connect_set_failover_cb; 6179 } 6180 6181 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6182 if (ctx->probe_ctx == NULL) { 6183 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6184 free_nvme_async_probe_ctx(ctx); 6185 return -ENODEV; 6186 } 6187 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6188 6189 return 0; 6190 } 6191 6192 struct bdev_nvme_delete_ctx { 6193 char *name; 6194 struct nvme_path_id path_id; 6195 bdev_nvme_delete_done_fn delete_done; 6196 void *delete_done_ctx; 
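	/* Deadline, in ticks, by which the deleted path is expected to disappear,
	 * and the poller (bdev_nvme_delete_complete_poll) that checks for its removal.
	 */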
6197 uint64_t timeout_ticks; 6198 struct spdk_poller *poller; 6199 }; 6200 6201 static void 6202 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6203 { 6204 if (ctx != NULL) { 6205 free(ctx->name); 6206 free(ctx); 6207 } 6208 } 6209 6210 static bool 6211 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6212 { 6213 if (path_id->trid.trtype != 0) { 6214 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6215 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6216 return false; 6217 } 6218 } else { 6219 if (path_id->trid.trtype != p->trid.trtype) { 6220 return false; 6221 } 6222 } 6223 } 6224 6225 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6226 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6227 return false; 6228 } 6229 } 6230 6231 if (path_id->trid.adrfam != 0) { 6232 if (path_id->trid.adrfam != p->trid.adrfam) { 6233 return false; 6234 } 6235 } 6236 6237 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6238 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6239 return false; 6240 } 6241 } 6242 6243 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6244 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6245 return false; 6246 } 6247 } 6248 6249 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6250 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6251 return false; 6252 } 6253 } 6254 6255 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6256 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6257 return false; 6258 } 6259 } 6260 6261 return true; 6262 } 6263 6264 static bool 6265 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6266 { 6267 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6268 struct nvme_ctrlr *ctrlr; 6269 struct nvme_path_id *p; 6270 6271 pthread_mutex_lock(&g_bdev_nvme_mutex); 6272 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6273 if (!nbdev_ctrlr) { 6274 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6275 return false; 6276 } 6277 6278 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6279 pthread_mutex_lock(&ctrlr->mutex); 6280 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6281 if (nvme_path_id_compare(p, path_id)) { 6282 pthread_mutex_unlock(&ctrlr->mutex); 6283 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6284 return true; 6285 } 6286 } 6287 pthread_mutex_unlock(&ctrlr->mutex); 6288 } 6289 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6290 6291 return false; 6292 } 6293 6294 static int 6295 bdev_nvme_delete_complete_poll(void *arg) 6296 { 6297 struct bdev_nvme_delete_ctx *ctx = arg; 6298 int rc = 0; 6299 6300 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6301 if (ctx->timeout_ticks > spdk_get_ticks()) { 6302 return SPDK_POLLER_BUSY; 6303 } 6304 6305 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6306 rc = -ETIMEDOUT; 6307 } 6308 6309 spdk_poller_unregister(&ctx->poller); 6310 6311 ctx->delete_done(ctx->delete_done_ctx, rc); 6312 free_bdev_nvme_delete_ctx(ctx); 6313 6314 return SPDK_POLLER_BUSY; 6315 } 6316 6317 static int 6318 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6319 { 6320 struct nvme_path_id *p, *t; 6321 spdk_msg_fn msg_fn; 6322 int rc = -ENXIO; 6323 6324 pthread_mutex_lock(&nvme_ctrlr->mutex); 6325 6326 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6327 if (p == 
TAILQ_FIRST(&nvme_ctrlr->trids)) { 6328 break; 6329 } 6330 6331 if (!nvme_path_id_compare(p, path_id)) { 6332 continue; 6333 } 6334 6335 /* We are not using the specified path. */ 6336 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6337 free(p); 6338 rc = 0; 6339 } 6340 6341 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6342 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6343 return rc; 6344 } 6345 6346 /* If we made it here, then this path is a match! Now we need to remove it. */ 6347 6348 /* This is the active path in use right now. The active path is always the first in the list. */ 6349 assert(p == nvme_ctrlr->active_path_id); 6350 6351 if (!TAILQ_NEXT(p, link)) { 6352 /* The current path is the only path. */ 6353 msg_fn = _nvme_ctrlr_destruct; 6354 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6355 } else { 6356 /* There is an alternative path. */ 6357 msg_fn = _bdev_nvme_reset_ctrlr; 6358 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6359 } 6360 6361 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6362 6363 if (rc == 0) { 6364 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6365 } else if (rc == -EALREADY) { 6366 rc = 0; 6367 } 6368 6369 return rc; 6370 } 6371 6372 int 6373 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6374 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6375 { 6376 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6377 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6378 struct bdev_nvme_delete_ctx *ctx = NULL; 6379 int rc = -ENXIO, _rc; 6380 6381 if (name == NULL || path_id == NULL) { 6382 rc = -EINVAL; 6383 goto exit; 6384 } 6385 6386 pthread_mutex_lock(&g_bdev_nvme_mutex); 6387 6388 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6389 if (nbdev_ctrlr == NULL) { 6390 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6391 6392 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6393 rc = -ENODEV; 6394 goto exit; 6395 } 6396 6397 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6398 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6399 if (_rc < 0 && _rc != -ENXIO) { 6400 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6401 rc = _rc; 6402 goto exit; 6403 } else if (_rc == 0) { 6404 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6405 * was deleted successfully. To remember the successful deletion, 6406 * overwrite rc only if _rc is zero. 6407 */ 6408 rc = 0; 6409 } 6410 } 6411 6412 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6413 6414 if (rc != 0 || delete_done == NULL) { 6415 goto exit; 6416 } 6417 6418 ctx = calloc(1, sizeof(*ctx)); 6419 if (ctx == NULL) { 6420 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6421 rc = -ENOMEM; 6422 goto exit; 6423 } 6424 6425 ctx->name = strdup(name); 6426 if (ctx->name == NULL) { 6427 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6428 rc = -ENOMEM; 6429 goto exit; 6430 } 6431 6432 ctx->delete_done = delete_done; 6433 ctx->delete_done_ctx = delete_done_ctx; 6434 ctx->path_id = *path_id; 6435 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6436 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6437 if (ctx->poller == NULL) { 6438 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6439 rc = -ENOMEM; 6440 goto exit; 6441 } 6442 6443 exit: 6444 if (rc != 0) { 6445 free_bdev_nvme_delete_ctx(ctx); 6446 } 6447 6448 return rc; 6449 } 6450 6451 #define DISCOVERY_INFOLOG(ctx, format, ...) 
\ 6452 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6453 6454 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6455 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6456 6457 struct discovery_entry_ctx { 6458 char name[128]; 6459 struct spdk_nvme_transport_id trid; 6460 struct spdk_nvme_ctrlr_opts drv_opts; 6461 struct spdk_nvmf_discovery_log_page_entry entry; 6462 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6463 struct discovery_ctx *ctx; 6464 }; 6465 6466 struct discovery_ctx { 6467 char *name; 6468 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6469 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6470 void *cb_ctx; 6471 struct spdk_nvme_probe_ctx *probe_ctx; 6472 struct spdk_nvme_detach_ctx *detach_ctx; 6473 struct spdk_nvme_ctrlr *ctrlr; 6474 struct spdk_nvme_transport_id trid; 6475 struct discovery_entry_ctx *entry_ctx_in_use; 6476 struct spdk_poller *poller; 6477 struct spdk_nvme_ctrlr_opts drv_opts; 6478 struct nvme_ctrlr_opts bdev_opts; 6479 struct spdk_nvmf_discovery_log_page *log_page; 6480 TAILQ_ENTRY(discovery_ctx) tailq; 6481 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6482 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6483 int rc; 6484 bool wait_for_attach; 6485 uint64_t timeout_ticks; 6486 /* Denotes that the discovery service is being started. We're waiting 6487 * for the initial connection to the discovery controller to be 6488 * established and attach discovered NVM ctrlrs. 6489 */ 6490 bool initializing; 6491 /* Denotes if a discovery is currently in progress for this context. 6492 * That includes connecting to newly discovered subsystems. Used to 6493 * ensure we do not start a new discovery until an existing one is 6494 * complete. 6495 */ 6496 bool in_progress; 6497 6498 /* Denotes if another discovery is needed after the one in progress 6499 * completes. Set when we receive an AER completion while a discovery 6500 * is already in progress. 6501 */ 6502 bool pending; 6503 6504 /* Signal to the discovery context poller that it should stop the 6505 * discovery service, including detaching from the current discovery 6506 * controller. 6507 */ 6508 bool stop; 6509 6510 struct spdk_thread *calling_thread; 6511 uint32_t index; 6512 uint32_t attach_in_progress; 6513 char *hostnqn; 6514 6515 /* Denotes if the discovery service was started by the mdns discovery. 
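	 * The flag is simply recorded from the from_mdns argument passed to
	 * bdev_nvme_start_discovery().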
6516 */ 6517 bool from_mdns_discovery_service; 6518 }; 6519 6520 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6521 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6522 6523 static void get_discovery_log_page(struct discovery_ctx *ctx); 6524 6525 static void 6526 free_discovery_ctx(struct discovery_ctx *ctx) 6527 { 6528 free(ctx->log_page); 6529 free(ctx->hostnqn); 6530 free(ctx->name); 6531 free(ctx); 6532 } 6533 6534 static void 6535 discovery_complete(struct discovery_ctx *ctx) 6536 { 6537 ctx->initializing = false; 6538 ctx->in_progress = false; 6539 if (ctx->pending) { 6540 ctx->pending = false; 6541 get_discovery_log_page(ctx); 6542 } 6543 } 6544 6545 static void 6546 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6547 struct spdk_nvmf_discovery_log_page_entry *entry) 6548 { 6549 char *space; 6550 6551 trid->trtype = entry->trtype; 6552 trid->adrfam = entry->adrfam; 6553 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6554 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6555 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6556 * before call to this function trid->subnqn is zeroed out, we need 6557 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6558 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6559 */ 6560 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6561 6562 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6563 * But the log page entries typically pad them with spaces, not zeroes. 6564 * So add a NULL terminator to each of these fields at the appropriate 6565 * location. 6566 */ 6567 space = strchr(trid->traddr, ' '); 6568 if (space) { 6569 *space = 0; 6570 } 6571 space = strchr(trid->trsvcid, ' '); 6572 if (space) { 6573 *space = 0; 6574 } 6575 space = strchr(trid->subnqn, ' '); 6576 if (space) { 6577 *space = 0; 6578 } 6579 } 6580 6581 static void 6582 _stop_discovery(void *_ctx) 6583 { 6584 struct discovery_ctx *ctx = _ctx; 6585 6586 if (ctx->attach_in_progress > 0) { 6587 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6588 return; 6589 } 6590 6591 ctx->stop = true; 6592 6593 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6594 struct discovery_entry_ctx *entry_ctx; 6595 struct nvme_path_id path = {}; 6596 6597 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6598 path.trid = entry_ctx->trid; 6599 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6600 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6601 free(entry_ctx); 6602 } 6603 6604 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6605 struct discovery_entry_ctx *entry_ctx; 6606 6607 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6608 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6609 free(entry_ctx); 6610 } 6611 6612 free(ctx->entry_ctx_in_use); 6613 ctx->entry_ctx_in_use = NULL; 6614 } 6615 6616 static void 6617 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6618 { 6619 ctx->stop_cb_fn = cb_fn; 6620 ctx->cb_ctx = cb_ctx; 6621 6622 if (ctx->attach_in_progress > 0) { 6623 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6624 ctx->attach_in_progress); 6625 } 6626 6627 _stop_discovery(ctx); 6628 } 6629 6630 static void 6631 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6632 { 6633 struct discovery_ctx *d_ctx; 6634 struct nvme_path_id *path_id; 6635 struct spdk_nvme_transport_id 
trid = {}; 6636 struct discovery_entry_ctx *entry_ctx, *tmp; 6637 6638 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6639 6640 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6641 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6642 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6643 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6644 continue; 6645 } 6646 6647 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6648 free(entry_ctx); 6649 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6650 trid.subnqn, trid.traddr, trid.trsvcid); 6651 6652 /* Fail discovery ctrlr to force reattach attempt */ 6653 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6654 } 6655 } 6656 } 6657 6658 static void 6659 discovery_remove_controllers(struct discovery_ctx *ctx) 6660 { 6661 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6662 struct discovery_entry_ctx *entry_ctx, *tmp; 6663 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6664 struct spdk_nvme_transport_id old_trid = {}; 6665 uint64_t numrec, i; 6666 bool found; 6667 6668 numrec = from_le64(&log_page->numrec); 6669 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6670 found = false; 6671 old_entry = &entry_ctx->entry; 6672 build_trid_from_log_page_entry(&old_trid, old_entry); 6673 for (i = 0; i < numrec; i++) { 6674 new_entry = &log_page->entries[i]; 6675 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6676 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6677 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6678 found = true; 6679 break; 6680 } 6681 } 6682 if (!found) { 6683 struct nvme_path_id path = {}; 6684 6685 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6686 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6687 6688 path.trid = entry_ctx->trid; 6689 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6690 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6691 free(entry_ctx); 6692 } 6693 } 6694 free(log_page); 6695 ctx->log_page = NULL; 6696 discovery_complete(ctx); 6697 } 6698 6699 static void 6700 complete_discovery_start(struct discovery_ctx *ctx, int status) 6701 { 6702 ctx->timeout_ticks = 0; 6703 ctx->rc = status; 6704 if (ctx->start_cb_fn) { 6705 ctx->start_cb_fn(ctx->cb_ctx, status); 6706 ctx->start_cb_fn = NULL; 6707 ctx->cb_ctx = NULL; 6708 } 6709 } 6710 6711 static void 6712 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6713 { 6714 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6715 struct discovery_ctx *ctx = entry_ctx->ctx; 6716 6717 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6718 ctx->attach_in_progress--; 6719 if (ctx->attach_in_progress == 0) { 6720 complete_discovery_start(ctx, ctx->rc); 6721 if (ctx->initializing && ctx->rc != 0) { 6722 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6723 stop_discovery(ctx, NULL, ctx->cb_ctx); 6724 } else { 6725 discovery_remove_controllers(ctx); 6726 } 6727 } 6728 } 6729 6730 static struct discovery_entry_ctx * 6731 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6732 { 6733 struct discovery_entry_ctx *new_ctx; 6734 6735 new_ctx = calloc(1, sizeof(*new_ctx)); 6736 if (new_ctx == NULL) { 6737 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6738 return NULL; 6739 } 6740 6741 new_ctx->ctx = ctx; 6742 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6743 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
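	/* Each entry inherits the discovery service's hostnqn so that controllers
	 * attached for this entry present the same host identity as the discovery
	 * controller.
	 */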
6744 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6745 return new_ctx; 6746 } 6747 6748 static void 6749 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6750 struct spdk_nvmf_discovery_log_page *log_page) 6751 { 6752 struct discovery_ctx *ctx = cb_arg; 6753 struct discovery_entry_ctx *entry_ctx, *tmp; 6754 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6755 uint64_t numrec, i; 6756 bool found; 6757 6758 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6759 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6760 return; 6761 } 6762 6763 ctx->log_page = log_page; 6764 assert(ctx->attach_in_progress == 0); 6765 numrec = from_le64(&log_page->numrec); 6766 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6767 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6768 free(entry_ctx); 6769 } 6770 for (i = 0; i < numrec; i++) { 6771 found = false; 6772 new_entry = &log_page->entries[i]; 6773 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6774 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6775 struct discovery_entry_ctx *new_ctx; 6776 struct spdk_nvme_transport_id trid = {}; 6777 6778 build_trid_from_log_page_entry(&trid, new_entry); 6779 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6780 if (new_ctx == NULL) { 6781 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6782 break; 6783 } 6784 6785 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6786 continue; 6787 } 6788 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6789 old_entry = &entry_ctx->entry; 6790 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6791 found = true; 6792 break; 6793 } 6794 } 6795 if (!found) { 6796 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6797 struct discovery_ctx *d_ctx; 6798 6799 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6800 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6801 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6802 sizeof(new_entry->subnqn))) { 6803 break; 6804 } 6805 } 6806 if (subnqn_ctx) { 6807 break; 6808 } 6809 } 6810 6811 new_ctx = calloc(1, sizeof(*new_ctx)); 6812 if (new_ctx == NULL) { 6813 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6814 break; 6815 } 6816 6817 new_ctx->ctx = ctx; 6818 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6819 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6820 if (subnqn_ctx) { 6821 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6822 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6823 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6824 new_ctx->name); 6825 } else { 6826 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6827 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6828 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6829 new_ctx->name); 6830 } 6831 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6832 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6833 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6834 discovery_attach_controller_done, new_ctx, 6835 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6836 if (rc == 0) { 6837 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6838 ctx->attach_in_progress++; 6839 } else { 6840 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6841 } 6842 } 6843 } 6844 6845 if (ctx->attach_in_progress == 0) { 6846 discovery_remove_controllers(ctx); 6847 } 6848 } 6849 6850 static void 6851 get_discovery_log_page(struct discovery_ctx *ctx) 6852 { 6853 int rc; 6854 6855 assert(ctx->in_progress == false); 6856 ctx->in_progress = true; 6857 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6858 if (rc != 0) { 6859 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6860 } 6861 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6862 } 6863 6864 static void 6865 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6866 { 6867 struct discovery_ctx *ctx = arg; 6868 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6869 6870 if (spdk_nvme_cpl_is_error(cpl)) { 6871 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6872 return; 6873 } 6874 6875 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6876 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6877 return; 6878 } 6879 6880 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6881 if (ctx->in_progress) { 6882 ctx->pending = true; 6883 return; 6884 } 6885 6886 get_discovery_log_page(ctx); 6887 } 6888 6889 static void 6890 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6891 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6892 { 6893 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6894 struct discovery_ctx *ctx; 6895 6896 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6897 6898 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6899 ctx->probe_ctx = NULL; 6900 ctx->ctrlr = ctrlr; 6901 6902 if (ctx->rc != 0) { 6903 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6904 ctx->rc); 6905 return; 6906 } 6907 6908 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6909 } 6910 6911 static int 6912 discovery_poller(void *arg) 6913 { 6914 struct discovery_ctx *ctx = arg; 6915 struct spdk_nvme_transport_id *trid; 6916 int rc; 6917 6918 if (ctx->detach_ctx) { 6919 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6920 if (rc != -EAGAIN) { 6921 ctx->detach_ctx = NULL; 6922 ctx->ctrlr = NULL; 6923 } 6924 } else if (ctx->stop) { 6925 if (ctx->ctrlr != NULL) { 6926 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6927 if (rc == 0) { 6928 return SPDK_POLLER_BUSY; 6929 } 6930 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6931 } 6932 spdk_poller_unregister(&ctx->poller); 6933 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6934 assert(ctx->start_cb_fn == NULL); 6935 if (ctx->stop_cb_fn != NULL) { 6936 ctx->stop_cb_fn(ctx->cb_ctx); 6937 } 6938 free_discovery_ctx(ctx); 6939 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6940 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6941 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6942 assert(ctx->initializing); 6943 spdk_poller_unregister(&ctx->poller); 6944 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6945 complete_discovery_start(ctx, -ETIMEDOUT); 6946 stop_discovery(ctx, NULL, NULL); 6947 free_discovery_ctx(ctx); 6948 return SPDK_POLLER_BUSY; 6949 } 6950 6951 assert(ctx->entry_ctx_in_use == NULL); 6952 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6953 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6954 trid = &ctx->entry_ctx_in_use->trid; 6955 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6956 if 
(ctx->probe_ctx) { 6957 spdk_poller_unregister(&ctx->poller); 6958 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6959 } else { 6960 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6961 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6962 ctx->entry_ctx_in_use = NULL; 6963 } 6964 } else if (ctx->probe_ctx) { 6965 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6966 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6967 complete_discovery_start(ctx, -ETIMEDOUT); 6968 return SPDK_POLLER_BUSY; 6969 } 6970 6971 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6972 if (rc != -EAGAIN) { 6973 if (ctx->rc != 0) { 6974 assert(ctx->initializing); 6975 stop_discovery(ctx, NULL, ctx->cb_ctx); 6976 } else { 6977 assert(rc == 0); 6978 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6979 ctx->rc = rc; 6980 get_discovery_log_page(ctx); 6981 } 6982 } 6983 } else { 6984 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6985 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6986 complete_discovery_start(ctx, -ETIMEDOUT); 6987 /* We need to wait until all NVM ctrlrs are attached before we stop the 6988 * discovery service to make sure we don't detach a ctrlr that is still 6989 * being attached. 6990 */ 6991 if (ctx->attach_in_progress == 0) { 6992 stop_discovery(ctx, NULL, ctx->cb_ctx); 6993 return SPDK_POLLER_BUSY; 6994 } 6995 } 6996 6997 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6998 if (rc < 0) { 6999 spdk_poller_unregister(&ctx->poller); 7000 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7001 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7002 ctx->entry_ctx_in_use = NULL; 7003 7004 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7005 if (rc != 0) { 7006 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7007 ctx->ctrlr = NULL; 7008 } 7009 } 7010 } 7011 7012 return SPDK_POLLER_BUSY; 7013 } 7014 7015 static void 7016 start_discovery_poller(void *arg) 7017 { 7018 struct discovery_ctx *ctx = arg; 7019 7020 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7021 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7022 } 7023 7024 int 7025 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7026 const char *base_name, 7027 struct spdk_nvme_ctrlr_opts *drv_opts, 7028 struct nvme_ctrlr_opts *bdev_opts, 7029 uint64_t attach_timeout, 7030 bool from_mdns, 7031 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7032 { 7033 struct discovery_ctx *ctx; 7034 struct discovery_entry_ctx *discovery_entry_ctx; 7035 7036 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7037 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7038 if (strcmp(ctx->name, base_name) == 0) { 7039 return -EEXIST; 7040 } 7041 7042 if (ctx->entry_ctx_in_use != NULL) { 7043 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7044 return -EEXIST; 7045 } 7046 } 7047 7048 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7049 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7050 return -EEXIST; 7051 } 7052 } 7053 } 7054 7055 ctx = calloc(1, sizeof(*ctx)); 7056 if (ctx == NULL) { 7057 return -ENOMEM; 7058 } 7059 7060 ctx->name = strdup(base_name); 7061 if (ctx->name == NULL) { 7062 free_discovery_ctx(ctx); 7063 return -ENOMEM; 7064 } 7065 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
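	/* The bdev options are copied as well, so the discovery context owns
	 * private copies of both option structs and the caller's structures
	 * do not need to outlive this call.
	 */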
7066 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7067 ctx->from_mdns_discovery_service = from_mdns; 7068 ctx->bdev_opts.from_discovery_service = true; 7069 ctx->calling_thread = spdk_get_thread(); 7070 ctx->start_cb_fn = cb_fn; 7071 ctx->cb_ctx = cb_ctx; 7072 ctx->initializing = true; 7073 if (ctx->start_cb_fn) { 7074 /* We can use this when dumping json to denote if this RPC parameter 7075 * was specified or not. 7076 */ 7077 ctx->wait_for_attach = true; 7078 } 7079 if (attach_timeout != 0) { 7080 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7081 spdk_get_ticks_hz() / 1000ull; 7082 } 7083 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7084 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7085 memcpy(&ctx->trid, trid, sizeof(*trid)); 7086 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7087 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7088 if (ctx->hostnqn == NULL) { 7089 free_discovery_ctx(ctx); 7090 return -ENOMEM; 7091 } 7092 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7093 if (discovery_entry_ctx == NULL) { 7094 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7095 free_discovery_ctx(ctx); 7096 return -ENOMEM; 7097 } 7098 7099 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7100 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7101 return 0; 7102 } 7103 7104 int 7105 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7106 { 7107 struct discovery_ctx *ctx; 7108 7109 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7110 if (strcmp(name, ctx->name) == 0) { 7111 if (ctx->stop) { 7112 return -EALREADY; 7113 } 7114 /* If we're still starting the discovery service and ->rc is non-zero, we're 7115 * going to stop it as soon as we can 7116 */ 7117 if (ctx->initializing && ctx->rc != 0) { 7118 return -EALREADY; 7119 } 7120 stop_discovery(ctx, cb_fn, cb_ctx); 7121 return 0; 7122 } 7123 } 7124 7125 return -ENOENT; 7126 } 7127 7128 static int 7129 bdev_nvme_library_init(void) 7130 { 7131 g_bdev_nvme_init_thread = spdk_get_thread(); 7132 7133 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7134 bdev_nvme_destroy_poll_group_cb, 7135 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7136 7137 return 0; 7138 } 7139 7140 static void 7141 bdev_nvme_fini_destruct_ctrlrs(void) 7142 { 7143 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7144 struct nvme_ctrlr *nvme_ctrlr; 7145 7146 pthread_mutex_lock(&g_bdev_nvme_mutex); 7147 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7148 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7149 pthread_mutex_lock(&nvme_ctrlr->mutex); 7150 if (nvme_ctrlr->destruct) { 7151 /* This controller's destruction was already started 7152 * before the application started shutting down 7153 */ 7154 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7155 continue; 7156 } 7157 nvme_ctrlr->destruct = true; 7158 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7159 7160 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7161 nvme_ctrlr); 7162 } 7163 } 7164 7165 g_bdev_nvme_module_finish = true; 7166 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7167 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7168 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7169 spdk_bdev_module_fini_done(); 7170 return; 7171 } 7172 7173 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7174 } 7175 7176 static void 7177 check_discovery_fini(void *arg) 7178 { 7179 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7180 bdev_nvme_fini_destruct_ctrlrs(); 
7181 } 7182 } 7183 7184 static void 7185 bdev_nvme_library_fini(void) 7186 { 7187 struct nvme_probe_skip_entry *entry, *entry_tmp; 7188 struct discovery_ctx *ctx; 7189 7190 spdk_poller_unregister(&g_hotplug_poller); 7191 free(g_hotplug_probe_ctx); 7192 g_hotplug_probe_ctx = NULL; 7193 7194 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7195 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7196 free(entry); 7197 } 7198 7199 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7200 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7201 bdev_nvme_fini_destruct_ctrlrs(); 7202 } else { 7203 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7204 stop_discovery(ctx, check_discovery_fini, NULL); 7205 } 7206 } 7207 } 7208 7209 static void 7210 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7211 { 7212 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7213 struct spdk_bdev *bdev = bdev_io->bdev; 7214 struct spdk_dif_ctx dif_ctx; 7215 struct spdk_dif_error err_blk = {}; 7216 int rc; 7217 struct spdk_dif_ctx_init_ext_opts dif_opts; 7218 7219 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7220 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7221 rc = spdk_dif_ctx_init(&dif_ctx, 7222 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7223 bdev->dif_is_head_of_md, bdev->dif_type, 7224 bdev_io->u.bdev.dif_check_flags, 7225 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7226 if (rc != 0) { 7227 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7228 return; 7229 } 7230 7231 if (bdev->md_interleave) { 7232 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7233 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7234 } else { 7235 struct iovec md_iov = { 7236 .iov_base = bdev_io->u.bdev.md_buf, 7237 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7238 }; 7239 7240 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7241 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7242 } 7243 7244 if (rc != 0) { 7245 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7246 err_blk.err_type, err_blk.err_offset); 7247 } else { 7248 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7249 } 7250 } 7251 7252 static void 7253 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7254 { 7255 struct nvme_bdev_io *bio = ref; 7256 7257 if (spdk_nvme_cpl_is_success(cpl)) { 7258 /* Run PI verification for read data buffer. */ 7259 bdev_nvme_verify_pi_error(bio); 7260 } 7261 7262 /* Return original completion status */ 7263 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7264 } 7265 7266 static void 7267 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7268 { 7269 struct nvme_bdev_io *bio = ref; 7270 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7271 int ret; 7272 7273 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7274 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7275 cpl->status.sct, cpl->status.sc); 7276 7277 /* Save completion status to use after verifying PI error. */ 7278 bio->cpl = *cpl; 7279 7280 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7281 /* Read without PI checking to verify PI error. 
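			 * The re-read fetches the same blocks with protection checks disabled
			 * so that bdev_nvme_verify_pi_error() can locate the failing block in
			 * software once the read completes.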
*/ 7282 ret = bdev_nvme_no_pi_readv(bio, 7283 bdev_io->u.bdev.iovs, 7284 bdev_io->u.bdev.iovcnt, 7285 bdev_io->u.bdev.md_buf, 7286 bdev_io->u.bdev.num_blocks, 7287 bdev_io->u.bdev.offset_blocks); 7288 if (ret == 0) { 7289 return; 7290 } 7291 } 7292 } 7293 7294 bdev_nvme_io_complete_nvme_status(bio, cpl); 7295 } 7296 7297 static void 7298 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7299 { 7300 struct nvme_bdev_io *bio = ref; 7301 7302 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7303 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7304 cpl->status.sct, cpl->status.sc); 7305 /* Run PI verification for write data buffer if PI error is detected. */ 7306 bdev_nvme_verify_pi_error(bio); 7307 } 7308 7309 bdev_nvme_io_complete_nvme_status(bio, cpl); 7310 } 7311 7312 static void 7313 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7314 { 7315 struct nvme_bdev_io *bio = ref; 7316 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7317 7318 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7319 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7320 */ 7321 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7322 7323 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7324 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7325 cpl->status.sct, cpl->status.sc); 7326 /* Run PI verification for zone append data buffer if PI error is detected. */ 7327 bdev_nvme_verify_pi_error(bio); 7328 } 7329 7330 bdev_nvme_io_complete_nvme_status(bio, cpl); 7331 } 7332 7333 static void 7334 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7335 { 7336 struct nvme_bdev_io *bio = ref; 7337 7338 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7339 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7340 cpl->status.sct, cpl->status.sc); 7341 /* Run PI verification for compare data buffer if PI error is detected. */ 7342 bdev_nvme_verify_pi_error(bio); 7343 } 7344 7345 bdev_nvme_io_complete_nvme_status(bio, cpl); 7346 } 7347 7348 static void 7349 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7350 { 7351 struct nvme_bdev_io *bio = ref; 7352 7353 /* Compare operation completion */ 7354 if (!bio->first_fused_completed) { 7355 /* Save compare result for write callback */ 7356 bio->cpl = *cpl; 7357 bio->first_fused_completed = true; 7358 return; 7359 } 7360 7361 /* Write operation completion */ 7362 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7363 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7364 * complete the IO with the compare operation's status. 
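	 * A write that succeeds even though its fused compare failed is
	 * unexpected, so that case is logged below before the compare status
	 * is returned.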
7365 */ 7366 if (!spdk_nvme_cpl_is_error(cpl)) { 7367 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7368 } 7369 7370 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7371 } else { 7372 bdev_nvme_io_complete_nvme_status(bio, cpl); 7373 } 7374 } 7375 7376 static void 7377 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7378 { 7379 struct nvme_bdev_io *bio = ref; 7380 7381 bdev_nvme_io_complete_nvme_status(bio, cpl); 7382 } 7383 7384 static int 7385 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7386 { 7387 switch (desc->zt) { 7388 case SPDK_NVME_ZONE_TYPE_SEQWR: 7389 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7390 break; 7391 default: 7392 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7393 return -EIO; 7394 } 7395 7396 switch (desc->zs) { 7397 case SPDK_NVME_ZONE_STATE_EMPTY: 7398 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7399 break; 7400 case SPDK_NVME_ZONE_STATE_IOPEN: 7401 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7402 break; 7403 case SPDK_NVME_ZONE_STATE_EOPEN: 7404 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7405 break; 7406 case SPDK_NVME_ZONE_STATE_CLOSED: 7407 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7408 break; 7409 case SPDK_NVME_ZONE_STATE_RONLY: 7410 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7411 break; 7412 case SPDK_NVME_ZONE_STATE_FULL: 7413 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7414 break; 7415 case SPDK_NVME_ZONE_STATE_OFFLINE: 7416 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7417 break; 7418 default: 7419 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7420 return -EIO; 7421 } 7422 7423 info->zone_id = desc->zslba; 7424 info->write_pointer = desc->wp; 7425 info->capacity = desc->zcap; 7426 7427 return 0; 7428 } 7429 7430 static void 7431 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7432 { 7433 struct nvme_bdev_io *bio = ref; 7434 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7435 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7436 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7437 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7438 uint64_t max_zones_per_buf, i; 7439 uint32_t zone_report_bufsize; 7440 struct spdk_nvme_ns *ns; 7441 struct spdk_nvme_qpair *qpair; 7442 int ret; 7443 7444 if (spdk_nvme_cpl_is_error(cpl)) { 7445 goto out_complete_io_nvme_cpl; 7446 } 7447 7448 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7449 ret = -ENXIO; 7450 goto out_complete_io_ret; 7451 } 7452 7453 ns = bio->io_path->nvme_ns->ns; 7454 qpair = bio->io_path->qpair->qpair; 7455 7456 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7457 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7458 sizeof(bio->zone_report_buf->descs[0]); 7459 7460 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7461 ret = -EINVAL; 7462 goto out_complete_io_ret; 7463 } 7464 7465 if (!bio->zone_report_buf->nr_zones) { 7466 ret = -EINVAL; 7467 goto out_complete_io_ret; 7468 } 7469 7470 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7471 ret = fill_zone_from_report(&info[bio->handled_zones], 7472 &bio->zone_report_buf->descs[i]); 7473 if (ret) { 7474 goto out_complete_io_ret; 7475 } 7476 bio->handled_zones++; 7477 } 7478 7479 if (bio->handled_zones < zones_to_copy) { 7480 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7481 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7482 
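		/* The previous report filled fewer zones than requested, so clear the
		 * buffer and issue another report starting at the first zone that has
		 * not been copied yet.
		 */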
7483 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7484 ret = spdk_nvme_zns_report_zones(ns, qpair, 7485 bio->zone_report_buf, zone_report_bufsize, 7486 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7487 bdev_nvme_get_zone_info_done, bio); 7488 if (!ret) { 7489 return; 7490 } else { 7491 goto out_complete_io_ret; 7492 } 7493 } 7494 7495 out_complete_io_nvme_cpl: 7496 free(bio->zone_report_buf); 7497 bio->zone_report_buf = NULL; 7498 bdev_nvme_io_complete_nvme_status(bio, cpl); 7499 return; 7500 7501 out_complete_io_ret: 7502 free(bio->zone_report_buf); 7503 bio->zone_report_buf = NULL; 7504 bdev_nvme_io_complete(bio, ret); 7505 } 7506 7507 static void 7508 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7509 { 7510 struct nvme_bdev_io *bio = ref; 7511 7512 bdev_nvme_io_complete_nvme_status(bio, cpl); 7513 } 7514 7515 static void 7516 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7517 { 7518 struct nvme_bdev_io *bio = ctx; 7519 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7520 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7521 7522 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7523 7524 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7525 } 7526 7527 static void 7528 bdev_nvme_abort_complete(void *ctx) 7529 { 7530 struct nvme_bdev_io *bio = ctx; 7531 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7532 7533 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7534 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7535 } else { 7536 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7537 } 7538 } 7539 7540 static void 7541 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7542 { 7543 struct nvme_bdev_io *bio = ref; 7544 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7545 7546 bio->cpl = *cpl; 7547 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7548 } 7549 7550 static void 7551 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7552 { 7553 struct nvme_bdev_io *bio = ref; 7554 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7555 7556 bio->cpl = *cpl; 7557 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7558 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7559 } 7560 7561 static void 7562 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7563 { 7564 struct nvme_bdev_io *bio = ref; 7565 struct iovec *iov; 7566 7567 bio->iov_offset = sgl_offset; 7568 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7569 iov = &bio->iovs[bio->iovpos]; 7570 if (bio->iov_offset < iov->iov_len) { 7571 break; 7572 } 7573 7574 bio->iov_offset -= iov->iov_len; 7575 } 7576 } 7577 7578 static int 7579 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7580 { 7581 struct nvme_bdev_io *bio = ref; 7582 struct iovec *iov; 7583 7584 assert(bio->iovpos < bio->iovcnt); 7585 7586 iov = &bio->iovs[bio->iovpos]; 7587 7588 *address = iov->iov_base; 7589 *length = iov->iov_len; 7590 7591 if (bio->iov_offset) { 7592 assert(bio->iov_offset <= iov->iov_len); 7593 *address += bio->iov_offset; 7594 *length -= bio->iov_offset; 7595 } 7596 7597 bio->iov_offset += *length; 7598 if (bio->iov_offset == iov->iov_len) { 7599 bio->iovpos++; 7600 bio->iov_offset = 0; 7601 } 7602 7603 return 0; 7604 } 7605 7606 static void 7607 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7608 { 7609 struct nvme_bdev_io *bio = ref; 7610 struct iovec *iov; 7611 7612 bio->fused_iov_offset = sgl_offset; 
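	/* Walk the fused (write) iovec array until reaching the iovec that contains
	 * sgl_offset, mirroring what bdev_nvme_queued_reset_sgl() does for the
	 * primary iovec array.
	 */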
7613 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7614 iov = &bio->fused_iovs[bio->fused_iovpos]; 7615 if (bio->fused_iov_offset < iov->iov_len) { 7616 break; 7617 } 7618 7619 bio->fused_iov_offset -= iov->iov_len; 7620 } 7621 } 7622 7623 static int 7624 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7625 { 7626 struct nvme_bdev_io *bio = ref; 7627 struct iovec *iov; 7628 7629 assert(bio->fused_iovpos < bio->fused_iovcnt); 7630 7631 iov = &bio->fused_iovs[bio->fused_iovpos]; 7632 7633 *address = iov->iov_base; 7634 *length = iov->iov_len; 7635 7636 if (bio->fused_iov_offset) { 7637 assert(bio->fused_iov_offset <= iov->iov_len); 7638 *address += bio->fused_iov_offset; 7639 *length -= bio->fused_iov_offset; 7640 } 7641 7642 bio->fused_iov_offset += *length; 7643 if (bio->fused_iov_offset == iov->iov_len) { 7644 bio->fused_iovpos++; 7645 bio->fused_iov_offset = 0; 7646 } 7647 7648 return 0; 7649 } 7650 7651 static int 7652 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7653 void *md, uint64_t lba_count, uint64_t lba) 7654 { 7655 int rc; 7656 7657 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7658 lba_count, lba); 7659 7660 bio->iovs = iov; 7661 bio->iovcnt = iovcnt; 7662 bio->iovpos = 0; 7663 bio->iov_offset = 0; 7664 7665 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7666 bio->io_path->qpair->qpair, 7667 lba, lba_count, 7668 bdev_nvme_no_pi_readv_done, bio, 0, 7669 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7670 md, 0, 0); 7671 7672 if (rc != 0 && rc != -ENOMEM) { 7673 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7674 } 7675 return rc; 7676 } 7677 7678 static int 7679 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7680 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7681 struct spdk_memory_domain *domain, void *domain_ctx, 7682 struct spdk_accel_sequence *seq) 7683 { 7684 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7685 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7686 int rc; 7687 7688 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7689 lba_count, lba); 7690 7691 bio->iovs = iov; 7692 bio->iovcnt = iovcnt; 7693 bio->iovpos = 0; 7694 bio->iov_offset = 0; 7695 7696 if (domain != NULL || seq != NULL) { 7697 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7698 bio->ext_opts.memory_domain = domain; 7699 bio->ext_opts.memory_domain_ctx = domain_ctx; 7700 bio->ext_opts.io_flags = flags; 7701 bio->ext_opts.metadata = md; 7702 bio->ext_opts.accel_sequence = seq; 7703 7704 if (iovcnt == 1) { 7705 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7706 bio, &bio->ext_opts); 7707 } else { 7708 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7709 bdev_nvme_readv_done, bio, 7710 bdev_nvme_queued_reset_sgl, 7711 bdev_nvme_queued_next_sge, 7712 &bio->ext_opts); 7713 } 7714 } else if (iovcnt == 1) { 7715 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7716 md, lba, lba_count, bdev_nvme_readv_done, 7717 bio, flags, 0, 0); 7718 } else { 7719 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7720 bdev_nvme_readv_done, bio, flags, 7721 bdev_nvme_queued_reset_sgl, 7722 bdev_nvme_queued_next_sge, md, 0, 0); 7723 } 7724 7725 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7726 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7727 } 7728 
return rc; 7729 } 7730 7731 static int 7732 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7733 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7734 struct spdk_memory_domain *domain, void *domain_ctx, 7735 struct spdk_accel_sequence *seq) 7736 { 7737 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7738 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7739 int rc; 7740 7741 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7742 lba_count, lba); 7743 7744 bio->iovs = iov; 7745 bio->iovcnt = iovcnt; 7746 bio->iovpos = 0; 7747 bio->iov_offset = 0; 7748 7749 if (domain != NULL || seq != NULL) { 7750 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7751 bio->ext_opts.memory_domain = domain; 7752 bio->ext_opts.memory_domain_ctx = domain_ctx; 7753 bio->ext_opts.io_flags = flags; 7754 bio->ext_opts.metadata = md; 7755 bio->ext_opts.accel_sequence = seq; 7756 7757 if (iovcnt == 1) { 7758 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7759 bio, &bio->ext_opts); 7760 } else { 7761 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7762 bdev_nvme_writev_done, bio, 7763 bdev_nvme_queued_reset_sgl, 7764 bdev_nvme_queued_next_sge, 7765 &bio->ext_opts); 7766 } 7767 } else if (iovcnt == 1) { 7768 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7769 md, lba, lba_count, bdev_nvme_writev_done, 7770 bio, flags, 0, 0); 7771 } else { 7772 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7773 bdev_nvme_writev_done, bio, flags, 7774 bdev_nvme_queued_reset_sgl, 7775 bdev_nvme_queued_next_sge, md, 0, 0); 7776 } 7777 7778 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7779 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7780 } 7781 return rc; 7782 } 7783 7784 static int 7785 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7786 void *md, uint64_t lba_count, uint64_t zslba, 7787 uint32_t flags) 7788 { 7789 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7790 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7791 int rc; 7792 7793 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7794 lba_count, zslba); 7795 7796 bio->iovs = iov; 7797 bio->iovcnt = iovcnt; 7798 bio->iovpos = 0; 7799 bio->iov_offset = 0; 7800 7801 if (iovcnt == 1) { 7802 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7803 lba_count, 7804 bdev_nvme_zone_appendv_done, bio, 7805 flags, 7806 0, 0); 7807 } else { 7808 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7809 bdev_nvme_zone_appendv_done, bio, flags, 7810 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7811 md, 0, 0); 7812 } 7813 7814 if (rc != 0 && rc != -ENOMEM) { 7815 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7816 } 7817 return rc; 7818 } 7819 7820 static int 7821 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7822 void *md, uint64_t lba_count, uint64_t lba, 7823 uint32_t flags) 7824 { 7825 int rc; 7826 7827 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7828 lba_count, lba); 7829 7830 bio->iovs = iov; 7831 bio->iovcnt = iovcnt; 7832 bio->iovpos = 0; 7833 bio->iov_offset = 0; 7834 7835 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7836 bio->io_path->qpair->qpair, 7837 lba, lba_count, 7838 bdev_nvme_comparev_done, bio, flags, 7839 bdev_nvme_queued_reset_sgl, 
bdev_nvme_queued_next_sge, 7840 md, 0, 0); 7841 7842 if (rc != 0 && rc != -ENOMEM) { 7843 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7844 } 7845 return rc; 7846 } 7847 7848 static int 7849 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7850 struct iovec *write_iov, int write_iovcnt, 7851 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7852 { 7853 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7854 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7855 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7856 int rc; 7857 7858 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7859 lba_count, lba); 7860 7861 bio->iovs = cmp_iov; 7862 bio->iovcnt = cmp_iovcnt; 7863 bio->iovpos = 0; 7864 bio->iov_offset = 0; 7865 bio->fused_iovs = write_iov; 7866 bio->fused_iovcnt = write_iovcnt; 7867 bio->fused_iovpos = 0; 7868 bio->fused_iov_offset = 0; 7869 7870 if (bdev_io->num_retries == 0) { 7871 bio->first_fused_submitted = false; 7872 bio->first_fused_completed = false; 7873 } 7874 7875 if (!bio->first_fused_submitted) { 7876 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7877 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7878 7879 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7880 bdev_nvme_comparev_and_writev_done, bio, flags, 7881 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7882 if (rc == 0) { 7883 bio->first_fused_submitted = true; 7884 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7885 } else { 7886 if (rc != -ENOMEM) { 7887 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7888 } 7889 return rc; 7890 } 7891 } 7892 7893 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7894 7895 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7896 bdev_nvme_comparev_and_writev_done, bio, flags, 7897 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7898 if (rc != 0 && rc != -ENOMEM) { 7899 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7900 rc = 0; 7901 } 7902 7903 return rc; 7904 } 7905 7906 static int 7907 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7908 { 7909 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7910 struct spdk_nvme_dsm_range *range; 7911 uint64_t offset, remaining; 7912 uint64_t num_ranges_u64; 7913 uint16_t num_ranges; 7914 int rc; 7915 7916 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7917 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7918 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7919 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7920 return -EINVAL; 7921 } 7922 num_ranges = (uint16_t)num_ranges_u64; 7923 7924 offset = offset_blocks; 7925 remaining = num_blocks; 7926 range = &dsm_ranges[0]; 7927 7928 /* Fill max-size ranges until the remaining blocks fit into one range */ 7929 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7930 range->attributes.raw = 0; 7931 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7932 range->starting_lba = offset; 7933 7934 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7935 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7936 range++; 7937 } 7938 7939 /* Final range describes the remaining blocks */ 7940 range->attributes.raw = 0; 7941 range->length = remaining; 7942 range->starting_lba = offset; 7943 7944 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7945 
bio->io_path->qpair->qpair, 7946 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7947 dsm_ranges, num_ranges, 7948 bdev_nvme_queued_done, bio); 7949 7950 return rc; 7951 } 7952 7953 static int 7954 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7955 { 7956 if (num_blocks > UINT16_MAX + 1) { 7957 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7958 return -EINVAL; 7959 } 7960 7961 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7962 bio->io_path->qpair->qpair, 7963 offset_blocks, num_blocks, 7964 bdev_nvme_queued_done, bio, 7965 0); 7966 } 7967 7968 static int 7969 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7970 struct spdk_bdev_zone_info *info) 7971 { 7972 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7973 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7974 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7975 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7976 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7977 7978 if (zone_id % zone_size != 0) { 7979 return -EINVAL; 7980 } 7981 7982 if (num_zones > total_zones || !num_zones) { 7983 return -EINVAL; 7984 } 7985 7986 assert(!bio->zone_report_buf); 7987 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7988 if (!bio->zone_report_buf) { 7989 return -ENOMEM; 7990 } 7991 7992 bio->handled_zones = 0; 7993 7994 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7995 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7996 bdev_nvme_get_zone_info_done, bio); 7997 } 7998 7999 static int 8000 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8001 enum spdk_bdev_zone_action action) 8002 { 8003 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8004 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8005 8006 switch (action) { 8007 case SPDK_BDEV_ZONE_CLOSE: 8008 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8009 bdev_nvme_zone_management_done, bio); 8010 case SPDK_BDEV_ZONE_FINISH: 8011 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8012 bdev_nvme_zone_management_done, bio); 8013 case SPDK_BDEV_ZONE_OPEN: 8014 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8015 bdev_nvme_zone_management_done, bio); 8016 case SPDK_BDEV_ZONE_RESET: 8017 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8018 bdev_nvme_zone_management_done, bio); 8019 case SPDK_BDEV_ZONE_OFFLINE: 8020 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8021 bdev_nvme_zone_management_done, bio); 8022 default: 8023 return -EINVAL; 8024 } 8025 } 8026 8027 static void 8028 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8029 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8030 { 8031 struct nvme_io_path *io_path; 8032 struct nvme_ctrlr *nvme_ctrlr; 8033 uint32_t max_xfer_size; 8034 int rc = -ENXIO; 8035 8036 /* Choose the first ctrlr which is not failed. */ 8037 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8038 nvme_ctrlr = io_path->qpair->ctrlr; 8039 8040 /* We should skip any unavailable nvme_ctrlr rather than checking 8041 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
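		 * Checking availability up front lets this loop simply fall through to
		 * the next path, instead of interpreting per-controller error codes
		 * after a wasted submission attempt.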
8042 */ 8043 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8044 continue; 8045 } 8046 8047 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8048 8049 if (nbytes > max_xfer_size) { 8050 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8051 rc = -EINVAL; 8052 goto err; 8053 } 8054 8055 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8056 bdev_nvme_admin_passthru_done, bio); 8057 if (rc == 0) { 8058 return; 8059 } 8060 } 8061 8062 err: 8063 bdev_nvme_admin_complete(bio, rc); 8064 } 8065 8066 static int 8067 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8068 void *buf, size_t nbytes) 8069 { 8070 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8071 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8072 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8073 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8074 8075 if (nbytes > max_xfer_size) { 8076 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8077 return -EINVAL; 8078 } 8079 8080 /* 8081 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8082 * so fill it out automatically. 8083 */ 8084 cmd->nsid = spdk_nvme_ns_get_id(ns); 8085 8086 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8087 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8088 } 8089 8090 static int 8091 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8092 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8093 { 8094 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8095 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8096 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8097 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8098 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8099 8100 if (nbytes > max_xfer_size) { 8101 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8102 return -EINVAL; 8103 } 8104 8105 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8106 SPDK_ERRLOG("invalid meta data buffer size\n"); 8107 return -EINVAL; 8108 } 8109 8110 /* 8111 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8112 * so fill it out automatically. 
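	 * Any nsid the caller placed in the passthrough command is overwritten with
	 * the ID of the namespace that backs this bdev.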
8113 */ 8114 cmd->nsid = spdk_nvme_ns_get_id(ns); 8115 8116 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8117 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8118 } 8119 8120 static int 8121 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8122 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8123 size_t nbytes, void *md_buf, size_t md_len) 8124 { 8125 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8126 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8127 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8128 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8129 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8130 8131 bio->iovs = iov; 8132 bio->iovcnt = iovcnt; 8133 bio->iovpos = 0; 8134 bio->iov_offset = 0; 8135 8136 if (nbytes > max_xfer_size) { 8137 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8138 return -EINVAL; 8139 } 8140 8141 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8142 SPDK_ERRLOG("invalid meta data buffer size\n"); 8143 return -EINVAL; 8144 } 8145 8146 /* 8147 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8148 * require a nsid, so fill it out automatically. 8149 */ 8150 cmd->nsid = spdk_nvme_ns_get_id(ns); 8151 8152 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8153 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8154 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8155 } 8156 8157 static void 8158 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8159 struct nvme_bdev_io *bio_to_abort) 8160 { 8161 struct nvme_io_path *io_path; 8162 int rc = 0; 8163 8164 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8165 if (rc == 0) { 8166 bdev_nvme_admin_complete(bio, 0); 8167 return; 8168 } 8169 8170 io_path = bio_to_abort->io_path; 8171 if (io_path != NULL) { 8172 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8173 io_path->qpair->qpair, 8174 bio_to_abort, 8175 bdev_nvme_abort_done, bio); 8176 } else { 8177 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8178 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8179 NULL, 8180 bio_to_abort, 8181 bdev_nvme_abort_done, bio); 8182 8183 if (rc != -ENOENT) { 8184 break; 8185 } 8186 } 8187 } 8188 8189 if (rc != 0) { 8190 /* If no command was found or there was any error, complete the abort 8191 * request with failure. 
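		 * rc is a negative errno from the abort submission, or -ENOENT if no
		 * path had the command outstanding; either way the abort is reported
		 * to the bdev layer as failed.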
8192 */ 8193 bdev_nvme_admin_complete(bio, rc); 8194 } 8195 } 8196 8197 static int 8198 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8199 uint64_t num_blocks) 8200 { 8201 struct spdk_nvme_scc_source_range range = { 8202 .slba = src_offset_blocks, 8203 .nlb = num_blocks - 1 8204 }; 8205 8206 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8207 bio->io_path->qpair->qpair, 8208 &range, 1, dst_offset_blocks, 8209 bdev_nvme_queued_done, bio); 8210 } 8211 8212 static void 8213 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8214 { 8215 const char *action; 8216 uint32_t i; 8217 8218 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8219 action = "reset"; 8220 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8221 action = "abort"; 8222 } else { 8223 action = "none"; 8224 } 8225 8226 spdk_json_write_object_begin(w); 8227 8228 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8229 8230 spdk_json_write_named_object_begin(w, "params"); 8231 spdk_json_write_named_string(w, "action_on_timeout", action); 8232 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8233 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8234 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8235 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8236 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8237 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8238 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8239 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8240 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8241 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8242 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8243 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8244 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8245 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8246 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8247 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8248 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8249 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8250 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8251 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8252 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8253 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8254 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8255 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8256 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8257 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8258 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8259 for (i = 0; i < 32; ++i) { 8260 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8261 
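			/* dhchap_digests is a 32-bit bitmask; emit the name of every digest
			 * whose bit is set so the option can be replayed from the saved JSON.
			 */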
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8262 } 8263 } 8264 spdk_json_write_array_end(w); 8265 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8266 for (i = 0; i < 32; ++i) { 8267 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8268 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8269 } 8270 } 8271 8272 spdk_json_write_array_end(w); 8273 spdk_json_write_object_end(w); 8274 8275 spdk_json_write_object_end(w); 8276 } 8277 8278 static void 8279 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8280 { 8281 struct spdk_nvme_transport_id trid; 8282 8283 spdk_json_write_object_begin(w); 8284 8285 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8286 8287 spdk_json_write_named_object_begin(w, "params"); 8288 spdk_json_write_named_string(w, "name", ctx->name); 8289 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8290 8291 trid = ctx->trid; 8292 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8293 nvme_bdev_dump_trid_json(&trid, w); 8294 8295 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8296 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8297 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8298 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8299 ctx->bdev_opts.fast_io_fail_timeout_sec); 8300 spdk_json_write_object_end(w); 8301 8302 spdk_json_write_object_end(w); 8303 } 8304 8305 #ifdef SPDK_CONFIG_NVME_CUSE 8306 static void 8307 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8308 struct nvme_ctrlr *nvme_ctrlr) 8309 { 8310 size_t cuse_name_size = 128; 8311 char cuse_name[cuse_name_size]; 8312 8313 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8314 cuse_name, &cuse_name_size) != 0) { 8315 return; 8316 } 8317 8318 spdk_json_write_object_begin(w); 8319 8320 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8321 8322 spdk_json_write_named_object_begin(w, "params"); 8323 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8324 spdk_json_write_object_end(w); 8325 8326 spdk_json_write_object_end(w); 8327 } 8328 #endif 8329 8330 static void 8331 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8332 struct nvme_ctrlr *nvme_ctrlr) 8333 { 8334 struct spdk_nvme_transport_id *trid; 8335 const struct spdk_nvme_ctrlr_opts *opts; 8336 8337 if (nvme_ctrlr->opts.from_discovery_service) { 8338 /* Do not emit an RPC for this - it will be implicitly 8339 * covered by a separate bdev_nvme_start_discovery or 8340 * bdev_nvme_start_mdns_discovery RPC. 
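		 * Replaying that discovery RPC re-attaches this controller, so emitting a
		 * bdev_nvme_attach_controller call here would create a duplicate.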
8341 */ 8342 return; 8343 } 8344 8345 trid = &nvme_ctrlr->active_path_id->trid; 8346 8347 spdk_json_write_object_begin(w); 8348 8349 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8350 8351 spdk_json_write_named_object_begin(w, "params"); 8352 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8353 nvme_bdev_dump_trid_json(trid, w); 8354 spdk_json_write_named_bool(w, "prchk_reftag", 8355 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8356 spdk_json_write_named_bool(w, "prchk_guard", 8357 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8358 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8359 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8360 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8361 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8362 if (nvme_ctrlr->psk != NULL) { 8363 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8364 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8365 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8366 } 8367 8368 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8369 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8370 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8371 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8372 if (opts->src_addr[0] != '\0') { 8373 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8374 } 8375 if (opts->src_svcid[0] != '\0') { 8376 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8377 } 8378 8379 spdk_json_write_object_end(w); 8380 8381 spdk_json_write_object_end(w); 8382 } 8383 8384 static void 8385 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8386 { 8387 spdk_json_write_object_begin(w); 8388 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8389 8390 spdk_json_write_named_object_begin(w, "params"); 8391 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8392 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8393 spdk_json_write_object_end(w); 8394 8395 spdk_json_write_object_end(w); 8396 } 8397 8398 static int 8399 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8400 { 8401 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8402 struct nvme_ctrlr *nvme_ctrlr; 8403 struct discovery_ctx *ctx; 8404 8405 bdev_nvme_opts_config_json(w); 8406 8407 pthread_mutex_lock(&g_bdev_nvme_mutex); 8408 8409 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8410 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8411 nvme_ctrlr_config_json(w, nvme_ctrlr); 8412 8413 #ifdef SPDK_CONFIG_NVME_CUSE 8414 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8415 #endif 8416 } 8417 } 8418 8419 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8420 if (!ctx->from_mdns_discovery_service) { 8421 bdev_nvme_discovery_config_json(w, ctx); 8422 } 8423 } 8424 8425 bdev_nvme_mdns_discovery_config_json(w); 8426 8427 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8428 * before enabling hotplug poller. 
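	 * The saved JSON is replayed in order at load time, so placing
	 * bdev_nvme_set_hotplug last keeps the hotplug poller from racing with the
	 * attach and discovery RPCs emitted above.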
8429 */ 8430 bdev_nvme_hotplug_config_json(w); 8431 8432 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8433 return 0; 8434 } 8435 8436 struct spdk_nvme_ctrlr * 8437 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8438 { 8439 struct nvme_bdev *nbdev; 8440 struct nvme_ns *nvme_ns; 8441 8442 if (!bdev || bdev->module != &nvme_if) { 8443 return NULL; 8444 } 8445 8446 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8447 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8448 assert(nvme_ns != NULL); 8449 8450 return nvme_ns->ctrlr->ctrlr; 8451 } 8452 8453 void 8454 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8455 { 8456 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8457 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8458 const struct spdk_nvme_ctrlr_data *cdata; 8459 const struct spdk_nvme_transport_id *trid; 8460 const char *adrfam_str; 8461 8462 spdk_json_write_object_begin(w); 8463 8464 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8465 8466 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8467 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8468 8469 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8470 spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL && 8471 io_path == io_path->nbdev_ch->current_io_path); 8472 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8473 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8474 8475 spdk_json_write_named_object_begin(w, "transport"); 8476 spdk_json_write_named_string(w, "trtype", trid->trstring); 8477 spdk_json_write_named_string(w, "traddr", trid->traddr); 8478 if (trid->trsvcid[0] != '\0') { 8479 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8480 } 8481 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8482 if (adrfam_str) { 8483 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8484 } 8485 spdk_json_write_object_end(w); 8486 8487 spdk_json_write_object_end(w); 8488 } 8489 8490 void 8491 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8492 { 8493 struct discovery_ctx *ctx; 8494 struct discovery_entry_ctx *entry_ctx; 8495 8496 spdk_json_write_array_begin(w); 8497 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8498 spdk_json_write_object_begin(w); 8499 spdk_json_write_named_string(w, "name", ctx->name); 8500 8501 spdk_json_write_named_object_begin(w, "trid"); 8502 nvme_bdev_dump_trid_json(&ctx->trid, w); 8503 spdk_json_write_object_end(w); 8504 8505 spdk_json_write_named_array_begin(w, "referrals"); 8506 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8507 spdk_json_write_object_begin(w); 8508 spdk_json_write_named_object_begin(w, "trid"); 8509 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8510 spdk_json_write_object_end(w); 8511 spdk_json_write_object_end(w); 8512 } 8513 spdk_json_write_array_end(w); 8514 8515 spdk_json_write_object_end(w); 8516 } 8517 spdk_json_write_array_end(w); 8518 } 8519 8520 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8521 8522 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8523 { 8524 struct spdk_trace_tpoint_opts opts[] = { 8525 { 8526 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8527 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 8528 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8529 }, 8530 { 8531 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8532 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 8533 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8534 } 8535 }; 8536 8537 8538 
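	/* Register the bdev_nvme I/O object and its start/done tracepoints, then
	 * relate the NVMe driver's transport-level submit/complete tracepoints to
	 * the same object so trace tooling can follow one I/O across both layers.
	 */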
spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8539 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8540 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8541 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8542 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8543 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8544 } 8545