/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVMe format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Tracks whether the first of the fused commands was submitted */
	bool first_fused_submitted;

	/** Tracks whether the first of the fused commands was completed */
	bool first_fused_completed;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keeps track of how many zones have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/* Current tsc at submit time.
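	 * Used by bdev_nvme_update_io_path_stat() to compute per-path latency statistics.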
*/ 99 uint64_t submit_tsc; 100 101 /* Used to put nvme_bdev_io into the list */ 102 TAILQ_ENTRY(nvme_bdev_io) retry_link; 103 }; 104 105 struct nvme_probe_skip_entry { 106 struct spdk_nvme_transport_id trid; 107 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 108 }; 109 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 110 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 111 g_skipped_nvme_ctrlrs); 112 113 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 114 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 116 117 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 120 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 121 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 122 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 123 124 static struct spdk_bdev_nvme_opts g_opts = { 125 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 126 .timeout_us = 0, 127 .timeout_admin_us = 0, 128 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 129 .transport_retry_count = 4, 130 .arbitration_burst = 0, 131 .low_priority_weight = 0, 132 .medium_priority_weight = 0, 133 .high_priority_weight = 0, 134 .nvme_adminq_poll_period_us = 10000ULL, 135 .nvme_ioq_poll_period_us = 0, 136 .io_queue_requests = 0, 137 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 138 .bdev_retry_count = 3, 139 .transport_ack_timeout = 0, 140 .ctrlr_loss_timeout_sec = 0, 141 .reconnect_delay_sec = 0, 142 .fast_io_fail_timeout_sec = 0, 143 .disable_auto_failback = false, 144 .generate_uuids = false, 145 .transport_tos = 0, 146 .nvme_error_stat = false, 147 .io_path_stat = false, 148 .allow_accel_sequence = false, 149 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 150 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 151 }; 152 153 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 154 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 155 156 static int g_hot_insert_nvme_controller_index = 0; 157 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 158 static bool g_nvme_hotplug_enabled = false; 159 struct spdk_thread *g_bdev_nvme_init_thread; 160 static struct spdk_poller *g_hotplug_poller; 161 static struct spdk_poller *g_hotplug_probe_poller; 162 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 163 164 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 165 struct nvme_async_probe_ctx *ctx); 166 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 167 struct nvme_async_probe_ctx *ctx); 168 static int bdev_nvme_library_init(void); 169 static void bdev_nvme_library_fini(void); 170 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 171 struct spdk_bdev_io *bdev_io); 172 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 173 struct spdk_bdev_io *bdev_io); 174 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 175 void *md, uint64_t lba_count, uint64_t lba, 176 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 177 struct spdk_accel_sequence *seq); 178 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 179 void *md, uint64_t lba_count, uint64_t lba); 180 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 181 void *md, uint64_t 
lba_count, uint64_t lba, 182 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 183 struct spdk_accel_sequence *seq); 184 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 185 void *md, uint64_t lba_count, 186 uint64_t zslba, uint32_t flags); 187 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 188 void *md, uint64_t lba_count, uint64_t lba, 189 uint32_t flags); 190 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 191 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 192 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 193 uint32_t flags); 194 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 195 uint32_t num_zones, struct spdk_bdev_zone_info *info); 196 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 197 enum spdk_bdev_zone_action action); 198 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 199 struct nvme_bdev_io *bio, 200 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 201 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 202 void *buf, size_t nbytes); 203 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 204 void *buf, size_t nbytes, void *md_buf, size_t md_len); 205 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 206 struct iovec *iov, int iovcnt, size_t nbytes, 207 void *md_buf, size_t md_len); 208 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 209 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 210 static void bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio); 211 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 212 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 213 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 214 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 215 216 static struct nvme_ns *nvme_ns_alloc(void); 217 static void nvme_ns_free(struct nvme_ns *ns); 218 219 static int 220 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 221 { 222 return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; 223 } 224 225 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 226 227 struct spdk_nvme_qpair * 228 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 229 { 230 struct nvme_ctrlr_channel *ctrlr_ch; 231 232 assert(ctrlr_io_ch != NULL); 233 234 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 235 236 return ctrlr_ch->qpair->qpair; 237 } 238 239 static int 240 bdev_nvme_get_ctx_size(void) 241 { 242 return sizeof(struct nvme_bdev_io); 243 } 244 245 static struct spdk_bdev_module nvme_if = { 246 .name = "nvme", 247 .async_fini = true, 248 .module_init = bdev_nvme_library_init, 249 .module_fini = bdev_nvme_library_fini, 250 .config_json = bdev_nvme_config_json, 251 .get_ctx_size = bdev_nvme_get_ctx_size, 252 253 }; 254 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 255 256 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 257 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 258 bool g_bdev_nvme_module_finish; 259 260 struct nvme_bdev_ctrlr * 261 nvme_bdev_ctrlr_get_by_name(const char *name) 262 { 263 struct nvme_bdev_ctrlr *nbdev_ctrlr; 264 265 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 266 if (strcmp(name, nbdev_ctrlr->name) == 0) { 267 break; 268 } 269 } 270 271 return nbdev_ctrlr; 272 } 273 274 static struct nvme_ctrlr * 275 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 276 const struct spdk_nvme_transport_id *trid) 277 { 278 struct nvme_ctrlr *nvme_ctrlr; 279 280 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 281 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0) { 282 break; 283 } 284 } 285 286 return nvme_ctrlr; 287 } 288 289 struct nvme_ctrlr * 290 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 291 uint16_t cntlid) 292 { 293 struct nvme_ctrlr *nvme_ctrlr; 294 const struct spdk_nvme_ctrlr_data *cdata; 295 296 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 297 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 298 if (cdata->cntlid == cntlid) { 299 break; 300 } 301 } 302 303 return nvme_ctrlr; 304 } 305 306 static struct nvme_bdev * 307 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 308 { 309 struct nvme_bdev *bdev; 310 311 pthread_mutex_lock(&g_bdev_nvme_mutex); 312 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 313 if (bdev->nsid == nsid) { 314 break; 315 } 316 } 317 pthread_mutex_unlock(&g_bdev_nvme_mutex); 318 319 return bdev; 320 } 321 322 struct nvme_ns * 323 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 324 { 325 struct nvme_ns ns; 326 327 assert(nsid > 0); 328 329 ns.id = nsid; 330 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 331 } 332 333 struct nvme_ns * 334 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 335 { 336 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 337 } 338 339 struct nvme_ns * 340 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 341 { 342 if (ns == NULL) { 343 return NULL; 344 } 345 346 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 347 } 348 349 static struct nvme_ctrlr * 350 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) 351 { 352 struct nvme_bdev_ctrlr *nbdev_ctrlr; 353 struct nvme_ctrlr *nvme_ctrlr = NULL; 354 355 pthread_mutex_lock(&g_bdev_nvme_mutex); 356 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 357 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid); 358 if (nvme_ctrlr != NULL) { 359 break; 360 } 361 } 
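	/* nvme_ctrlr is NULL here if no registered controller has the given trid as its active path. */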
362 pthread_mutex_unlock(&g_bdev_nvme_mutex); 363 364 return nvme_ctrlr; 365 } 366 367 struct nvme_ctrlr * 368 nvme_ctrlr_get_by_name(const char *name) 369 { 370 struct nvme_bdev_ctrlr *nbdev_ctrlr; 371 struct nvme_ctrlr *nvme_ctrlr = NULL; 372 373 if (name == NULL) { 374 return NULL; 375 } 376 377 pthread_mutex_lock(&g_bdev_nvme_mutex); 378 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 379 if (nbdev_ctrlr != NULL) { 380 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 381 } 382 pthread_mutex_unlock(&g_bdev_nvme_mutex); 383 384 return nvme_ctrlr; 385 } 386 387 void 388 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 389 { 390 struct nvme_bdev_ctrlr *nbdev_ctrlr; 391 392 pthread_mutex_lock(&g_bdev_nvme_mutex); 393 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 394 fn(nbdev_ctrlr, ctx); 395 } 396 pthread_mutex_unlock(&g_bdev_nvme_mutex); 397 } 398 399 void 400 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 401 { 402 const char *trtype_str; 403 const char *adrfam_str; 404 405 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 406 if (trtype_str) { 407 spdk_json_write_named_string(w, "trtype", trtype_str); 408 } 409 410 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 411 if (adrfam_str) { 412 spdk_json_write_named_string(w, "adrfam", adrfam_str); 413 } 414 415 if (trid->traddr[0] != '\0') { 416 spdk_json_write_named_string(w, "traddr", trid->traddr); 417 } 418 419 if (trid->trsvcid[0] != '\0') { 420 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 421 } 422 423 if (trid->subnqn[0] != '\0') { 424 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 425 } 426 } 427 428 static void 429 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 430 struct nvme_ctrlr *nvme_ctrlr) 431 { 432 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 433 pthread_mutex_lock(&g_bdev_nvme_mutex); 434 435 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 436 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 437 pthread_mutex_unlock(&g_bdev_nvme_mutex); 438 439 return; 440 } 441 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 442 443 pthread_mutex_unlock(&g_bdev_nvme_mutex); 444 445 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 446 447 free(nbdev_ctrlr->name); 448 free(nbdev_ctrlr); 449 } 450 451 static void 452 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 453 { 454 struct nvme_path_id *path_id, *tmp_path; 455 struct nvme_ns *ns, *tmp_ns; 456 457 free(nvme_ctrlr->copied_ana_desc); 458 spdk_free(nvme_ctrlr->ana_log_page); 459 460 if (nvme_ctrlr->opal_dev) { 461 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 462 nvme_ctrlr->opal_dev = NULL; 463 } 464 465 if (nvme_ctrlr->nbdev_ctrlr) { 466 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 467 } 468 469 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 470 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 471 nvme_ns_free(ns); 472 } 473 474 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 475 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 476 free(path_id); 477 } 478 479 pthread_mutex_destroy(&nvme_ctrlr->mutex); 480 spdk_keyring_put_key(nvme_ctrlr->psk); 481 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 482 free(nvme_ctrlr); 483 484 pthread_mutex_lock(&g_bdev_nvme_mutex); 485 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 486 pthread_mutex_unlock(&g_bdev_nvme_mutex); 487 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 488 
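		/* This was the last controller and module finish was already requested, so let the bdev layer complete module fini. */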
spdk_bdev_module_fini_done(); 489 return; 490 } 491 pthread_mutex_unlock(&g_bdev_nvme_mutex); 492 } 493 494 static int 495 nvme_detach_poller(void *arg) 496 { 497 struct nvme_ctrlr *nvme_ctrlr = arg; 498 int rc; 499 500 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 501 if (rc != -EAGAIN) { 502 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 503 _nvme_ctrlr_delete(nvme_ctrlr); 504 } 505 506 return SPDK_POLLER_BUSY; 507 } 508 509 static void 510 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 511 { 512 int rc; 513 514 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 515 516 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 517 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 518 519 /* If we got here, the reset/detach poller cannot be active */ 520 assert(nvme_ctrlr->reset_detach_poller == NULL); 521 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 522 nvme_ctrlr, 1000); 523 if (nvme_ctrlr->reset_detach_poller == NULL) { 524 SPDK_ERRLOG("Failed to register detach poller\n"); 525 goto error; 526 } 527 528 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 529 if (rc != 0) { 530 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 531 goto error; 532 } 533 534 return; 535 error: 536 /* We don't have a good way to handle errors here, so just do what we can and delete the 537 * controller without detaching the underlying NVMe device. 538 */ 539 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 540 _nvme_ctrlr_delete(nvme_ctrlr); 541 } 542 543 static void 544 nvme_ctrlr_unregister_cb(void *io_device) 545 { 546 struct nvme_ctrlr *nvme_ctrlr = io_device; 547 548 nvme_ctrlr_delete(nvme_ctrlr); 549 } 550 551 static void 552 nvme_ctrlr_unregister(void *ctx) 553 { 554 struct nvme_ctrlr *nvme_ctrlr = ctx; 555 556 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 557 } 558 559 static bool 560 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 561 { 562 if (!nvme_ctrlr->destruct) { 563 return false; 564 } 565 566 if (nvme_ctrlr->ref > 0) { 567 return false; 568 } 569 570 if (nvme_ctrlr->resetting) { 571 return false; 572 } 573 574 if (nvme_ctrlr->ana_log_page_updating) { 575 return false; 576 } 577 578 if (nvme_ctrlr->io_path_cache_clearing) { 579 return false; 580 } 581 582 return true; 583 } 584 585 static void 586 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 587 { 588 pthread_mutex_lock(&nvme_ctrlr->mutex); 589 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 590 591 assert(nvme_ctrlr->ref > 0); 592 nvme_ctrlr->ref--; 593 594 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 595 pthread_mutex_unlock(&nvme_ctrlr->mutex); 596 return; 597 } 598 599 pthread_mutex_unlock(&nvme_ctrlr->mutex); 600 601 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 602 } 603 604 static void 605 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 606 { 607 nbdev_ch->current_io_path = NULL; 608 nbdev_ch->rr_counter = 0; 609 } 610 611 static struct nvme_io_path * 612 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 613 { 614 struct nvme_io_path *io_path; 615 616 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 617 if (io_path->nvme_ns == nvme_ns) { 618 break; 619 } 620 } 621 622 return io_path; 623 } 624 625 static struct nvme_io_path * 626 nvme_io_path_alloc(void) 627 { 628 struct nvme_io_path *io_path; 629 630 io_path = 
calloc(1, sizeof(*io_path)); 631 if (io_path == NULL) { 632 SPDK_ERRLOG("Failed to alloc io_path.\n"); 633 return NULL; 634 } 635 636 if (g_opts.io_path_stat) { 637 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 638 if (io_path->stat == NULL) { 639 free(io_path); 640 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 641 return NULL; 642 } 643 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 644 } 645 646 return io_path; 647 } 648 649 static void 650 nvme_io_path_free(struct nvme_io_path *io_path) 651 { 652 free(io_path->stat); 653 free(io_path); 654 } 655 656 static int 657 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 658 { 659 struct nvme_io_path *io_path; 660 struct spdk_io_channel *ch; 661 struct nvme_ctrlr_channel *ctrlr_ch; 662 struct nvme_qpair *nvme_qpair; 663 664 io_path = nvme_io_path_alloc(); 665 if (io_path == NULL) { 666 return -ENOMEM; 667 } 668 669 io_path->nvme_ns = nvme_ns; 670 671 ch = spdk_get_io_channel(nvme_ns->ctrlr); 672 if (ch == NULL) { 673 nvme_io_path_free(io_path); 674 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 675 return -ENOMEM; 676 } 677 678 ctrlr_ch = spdk_io_channel_get_ctx(ch); 679 680 nvme_qpair = ctrlr_ch->qpair; 681 assert(nvme_qpair != NULL); 682 683 io_path->qpair = nvme_qpair; 684 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 685 686 io_path->nbdev_ch = nbdev_ch; 687 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 688 689 bdev_nvme_clear_current_io_path(nbdev_ch); 690 691 return 0; 692 } 693 694 static void 695 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 696 struct nvme_io_path *io_path) 697 { 698 struct nvme_bdev_io *bio; 699 700 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 701 if (bio->io_path == io_path) { 702 bio->io_path = NULL; 703 } 704 } 705 } 706 707 static void 708 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 709 { 710 struct spdk_io_channel *ch; 711 struct nvme_qpair *nvme_qpair; 712 struct nvme_ctrlr_channel *ctrlr_ch; 713 struct nvme_bdev *nbdev; 714 715 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 716 717 /* Add the statistics to nvme_ns before this path is destroyed. */ 718 pthread_mutex_lock(&nbdev->mutex); 719 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 720 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 721 } 722 pthread_mutex_unlock(&nbdev->mutex); 723 724 bdev_nvme_clear_current_io_path(nbdev_ch); 725 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 726 727 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 728 io_path->nbdev_ch = NULL; 729 730 nvme_qpair = io_path->qpair; 731 assert(nvme_qpair != NULL); 732 733 ctrlr_ch = nvme_qpair->ctrlr_ch; 734 assert(ctrlr_ch != NULL); 735 736 ch = spdk_io_channel_from_ctx(ctrlr_ch); 737 spdk_put_io_channel(ch); 738 739 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 740 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 741 * io_path here but free the io_path when the associated qpair is freed. It is ensured 742 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
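	 * (The io_path is expected to be released on the qpair teardown path, e.g. in nvme_qpair_delete().)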
743 */ 744 } 745 746 static void 747 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 748 { 749 struct nvme_io_path *io_path, *tmp_io_path; 750 751 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 752 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 753 } 754 } 755 756 static int 757 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 758 { 759 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 760 struct nvme_bdev *nbdev = io_device; 761 struct nvme_ns *nvme_ns; 762 int rc; 763 764 STAILQ_INIT(&nbdev_ch->io_path_list); 765 TAILQ_INIT(&nbdev_ch->retry_io_list); 766 767 pthread_mutex_lock(&nbdev->mutex); 768 769 nbdev_ch->mp_policy = nbdev->mp_policy; 770 nbdev_ch->mp_selector = nbdev->mp_selector; 771 nbdev_ch->rr_min_io = nbdev->rr_min_io; 772 773 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 774 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 775 if (rc != 0) { 776 pthread_mutex_unlock(&nbdev->mutex); 777 778 _bdev_nvme_delete_io_paths(nbdev_ch); 779 return rc; 780 } 781 } 782 pthread_mutex_unlock(&nbdev->mutex); 783 784 return 0; 785 } 786 787 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 788 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 789 */ 790 static inline void 791 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 792 const struct spdk_nvme_cpl *cpl) 793 { 794 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 795 (uintptr_t)bdev_io); 796 if (cpl) { 797 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 798 } else { 799 spdk_bdev_io_complete(bdev_io, status); 800 } 801 } 802 803 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 804 805 static void 806 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 807 { 808 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 809 810 bdev_nvme_abort_retry_ios(nbdev_ch); 811 _bdev_nvme_delete_io_paths(nbdev_ch); 812 } 813 814 static inline bool 815 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 816 { 817 switch (io_type) { 818 case SPDK_BDEV_IO_TYPE_RESET: 819 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 820 case SPDK_BDEV_IO_TYPE_ABORT: 821 return true; 822 default: 823 break; 824 } 825 826 return false; 827 } 828 829 static inline bool 830 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 831 { 832 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 833 return false; 834 } 835 836 switch (nvme_ns->ana_state) { 837 case SPDK_NVME_ANA_OPTIMIZED_STATE: 838 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 839 return true; 840 default: 841 break; 842 } 843 844 return false; 845 } 846 847 static inline bool 848 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 849 { 850 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 851 return false; 852 } 853 854 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 855 SPDK_NVME_QPAIR_FAILURE_NONE)) { 856 return false; 857 } 858 859 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 860 return false; 861 } 862 863 return true; 864 } 865 866 static inline bool 867 nvme_io_path_is_available(struct nvme_io_path *io_path) 868 { 869 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 870 return false; 871 } 872 873 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 874 return false; 875 } 876 877 return true; 878 } 879 880 static inline bool 881 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 882 { 883 if 
(nvme_ctrlr->destruct) { 884 return true; 885 } 886 887 if (nvme_ctrlr->fast_io_fail_timedout) { 888 return true; 889 } 890 891 if (nvme_ctrlr->resetting) { 892 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 893 return false; 894 } else { 895 return true; 896 } 897 } 898 899 if (nvme_ctrlr->reconnect_is_delayed) { 900 return false; 901 } 902 903 if (nvme_ctrlr->disabled) { 904 return true; 905 } 906 907 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 908 return true; 909 } else { 910 return false; 911 } 912 } 913 914 static bool 915 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 916 { 917 if (nvme_ctrlr->destruct) { 918 return false; 919 } 920 921 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 922 return false; 923 } 924 925 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 926 return false; 927 } 928 929 if (nvme_ctrlr->disabled) { 930 return false; 931 } 932 933 return true; 934 } 935 936 /* Simulate circular linked list. */ 937 static inline struct nvme_io_path * 938 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 939 { 940 struct nvme_io_path *next_path; 941 942 if (prev_path != NULL) { 943 next_path = STAILQ_NEXT(prev_path, stailq); 944 if (next_path != NULL) { 945 return next_path; 946 } 947 } 948 949 return STAILQ_FIRST(&nbdev_ch->io_path_list); 950 } 951 952 static struct nvme_io_path * 953 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 954 { 955 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 956 957 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 958 959 io_path = start; 960 do { 961 if (spdk_likely(nvme_qpair_is_connected(io_path->qpair) && 962 !io_path->nvme_ns->ana_state_updating)) { 963 switch (io_path->nvme_ns->ana_state) { 964 case SPDK_NVME_ANA_OPTIMIZED_STATE: 965 nbdev_ch->current_io_path = io_path; 966 return io_path; 967 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 968 if (non_optimized == NULL) { 969 non_optimized = io_path; 970 } 971 break; 972 default: 973 break; 974 } 975 } 976 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 977 } while (io_path != start); 978 979 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 980 /* We come here only if there is no optimized path. Cache even non_optimized 981 * path for load balance across multiple non_optimized paths. 982 */ 983 nbdev_ch->current_io_path = non_optimized; 984 } 985 986 return non_optimized; 987 } 988 989 static struct nvme_io_path * 990 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 991 { 992 struct nvme_io_path *io_path; 993 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 994 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 995 uint32_t num_outstanding_reqs; 996 997 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 998 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 999 /* The device is currently resetting. 
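			 * Skip this path; another connected path, if any, is preferred.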
*/ 1000 continue; 1001 } 1002 1003 if (spdk_unlikely(io_path->nvme_ns->ana_state_updating)) { 1004 continue; 1005 } 1006 1007 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1008 switch (io_path->nvme_ns->ana_state) { 1009 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1010 if (num_outstanding_reqs < opt_min_qd) { 1011 opt_min_qd = num_outstanding_reqs; 1012 optimized = io_path; 1013 } 1014 break; 1015 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1016 if (num_outstanding_reqs < non_opt_min_qd) { 1017 non_opt_min_qd = num_outstanding_reqs; 1018 non_optimized = io_path; 1019 } 1020 break; 1021 default: 1022 break; 1023 } 1024 } 1025 1026 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1027 if (optimized != NULL) { 1028 return optimized; 1029 } 1030 1031 return non_optimized; 1032 } 1033 1034 static inline struct nvme_io_path * 1035 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1036 { 1037 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1038 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1039 return nbdev_ch->current_io_path; 1040 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1041 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1042 return nbdev_ch->current_io_path; 1043 } 1044 nbdev_ch->rr_counter = 0; 1045 } 1046 } 1047 1048 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1049 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1050 return _bdev_nvme_find_io_path(nbdev_ch); 1051 } else { 1052 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1053 } 1054 } 1055 1056 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1057 * or false otherwise. 1058 * 1059 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1060 * is likely to be non-accessible now but may become accessible. 1061 * 1062 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1063 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1064 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1065 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
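 * In either case it is reasonable to queue the I/O for retry rather than fail it immediately.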
1066 */ 1067 static bool 1068 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1069 { 1070 struct nvme_io_path *io_path; 1071 1072 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1073 if (io_path->nvme_ns->ana_transition_timedout) { 1074 continue; 1075 } 1076 1077 if (nvme_qpair_is_connected(io_path->qpair) || 1078 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1079 return true; 1080 } 1081 } 1082 1083 return false; 1084 } 1085 1086 static void 1087 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1088 { 1089 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1090 struct spdk_io_channel *ch; 1091 1092 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1093 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1094 } else { 1095 ch = spdk_io_channel_from_ctx(nbdev_ch); 1096 bdev_nvme_submit_request(ch, bdev_io); 1097 } 1098 } 1099 1100 static int 1101 bdev_nvme_retry_ios(void *arg) 1102 { 1103 struct nvme_bdev_channel *nbdev_ch = arg; 1104 struct nvme_bdev_io *bio, *tmp_bio; 1105 uint64_t now, delay_us; 1106 1107 now = spdk_get_ticks(); 1108 1109 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1110 if (bio->retry_ticks > now) { 1111 break; 1112 } 1113 1114 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1115 1116 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1117 } 1118 1119 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1120 1121 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1122 if (bio != NULL) { 1123 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1124 1125 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1126 delay_us); 1127 } 1128 1129 return SPDK_POLLER_BUSY; 1130 } 1131 1132 static void 1133 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1134 struct nvme_bdev_io *bio, uint64_t delay_ms) 1135 { 1136 struct nvme_bdev_io *tmp_bio; 1137 1138 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1139 1140 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1141 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1142 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1143 retry_link); 1144 return; 1145 } 1146 } 1147 1148 /* No earlier I/Os were found. This I/O must be the new head. 
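	 * Since the list is kept sorted by retry_ticks, re-arm the retry poller for this I/O's deadline.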
*/ 1149 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1150 1151 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1152 1153 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1154 delay_ms * 1000ULL); 1155 } 1156 1157 static void 1158 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1159 { 1160 struct nvme_bdev_io *bio, *tmp_bio; 1161 1162 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1163 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1164 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1165 } 1166 1167 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1168 } 1169 1170 static int 1171 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1172 struct nvme_bdev_io *bio_to_abort) 1173 { 1174 struct nvme_bdev_io *bio; 1175 1176 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1177 if (bio == bio_to_abort) { 1178 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1179 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1180 return 0; 1181 } 1182 } 1183 1184 return -ENOENT; 1185 } 1186 1187 static void 1188 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1189 { 1190 struct nvme_bdev *nbdev; 1191 uint16_t sct, sc; 1192 1193 assert(spdk_nvme_cpl_is_error(cpl)); 1194 1195 nbdev = bdev_io->bdev->ctxt; 1196 1197 if (nbdev->err_stat == NULL) { 1198 return; 1199 } 1200 1201 sct = cpl->status.sct; 1202 sc = cpl->status.sc; 1203 1204 pthread_mutex_lock(&nbdev->mutex); 1205 1206 nbdev->err_stat->status_type[sct]++; 1207 switch (sct) { 1208 case SPDK_NVME_SCT_GENERIC: 1209 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1210 case SPDK_NVME_SCT_MEDIA_ERROR: 1211 case SPDK_NVME_SCT_PATH: 1212 nbdev->err_stat->status[sct][sc]++; 1213 break; 1214 default: 1215 break; 1216 } 1217 1218 pthread_mutex_unlock(&nbdev->mutex); 1219 } 1220 1221 static inline void 1222 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1223 { 1224 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1225 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1226 uint32_t blocklen = bdev_io->bdev->blocklen; 1227 struct spdk_bdev_io_stat *stat; 1228 uint64_t tsc_diff; 1229 1230 if (bio->io_path->stat == NULL) { 1231 return; 1232 } 1233 1234 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1235 stat = bio->io_path->stat; 1236 1237 switch (bdev_io->type) { 1238 case SPDK_BDEV_IO_TYPE_READ: 1239 stat->bytes_read += num_blocks * blocklen; 1240 stat->num_read_ops++; 1241 stat->read_latency_ticks += tsc_diff; 1242 if (stat->max_read_latency_ticks < tsc_diff) { 1243 stat->max_read_latency_ticks = tsc_diff; 1244 } 1245 if (stat->min_read_latency_ticks > tsc_diff) { 1246 stat->min_read_latency_ticks = tsc_diff; 1247 } 1248 break; 1249 case SPDK_BDEV_IO_TYPE_WRITE: 1250 stat->bytes_written += num_blocks * blocklen; 1251 stat->num_write_ops++; 1252 stat->write_latency_ticks += tsc_diff; 1253 if (stat->max_write_latency_ticks < tsc_diff) { 1254 stat->max_write_latency_ticks = tsc_diff; 1255 } 1256 if (stat->min_write_latency_ticks > tsc_diff) { 1257 stat->min_write_latency_ticks = tsc_diff; 1258 } 1259 break; 1260 case SPDK_BDEV_IO_TYPE_UNMAP: 1261 stat->bytes_unmapped += num_blocks * blocklen; 1262 stat->num_unmap_ops++; 1263 stat->unmap_latency_ticks += tsc_diff; 1264 if (stat->max_unmap_latency_ticks < tsc_diff) { 1265 stat->max_unmap_latency_ticks = tsc_diff; 1266 } 1267 if (stat->min_unmap_latency_ticks > 
tsc_diff) { 1268 stat->min_unmap_latency_ticks = tsc_diff; 1269 } 1270 break; 1271 case SPDK_BDEV_IO_TYPE_ZCOPY: 1272 /* Track the data in the start phase only */ 1273 if (!bdev_io->u.bdev.zcopy.start) { 1274 break; 1275 } 1276 if (bdev_io->u.bdev.zcopy.populate) { 1277 stat->bytes_read += num_blocks * blocklen; 1278 stat->num_read_ops++; 1279 stat->read_latency_ticks += tsc_diff; 1280 if (stat->max_read_latency_ticks < tsc_diff) { 1281 stat->max_read_latency_ticks = tsc_diff; 1282 } 1283 if (stat->min_read_latency_ticks > tsc_diff) { 1284 stat->min_read_latency_ticks = tsc_diff; 1285 } 1286 } else { 1287 stat->bytes_written += num_blocks * blocklen; 1288 stat->num_write_ops++; 1289 stat->write_latency_ticks += tsc_diff; 1290 if (stat->max_write_latency_ticks < tsc_diff) { 1291 stat->max_write_latency_ticks = tsc_diff; 1292 } 1293 if (stat->min_write_latency_ticks > tsc_diff) { 1294 stat->min_write_latency_ticks = tsc_diff; 1295 } 1296 } 1297 break; 1298 case SPDK_BDEV_IO_TYPE_COPY: 1299 stat->bytes_copied += num_blocks * blocklen; 1300 stat->num_copy_ops++; 1301 stat->copy_latency_ticks += tsc_diff; 1302 if (stat->max_copy_latency_ticks < tsc_diff) { 1303 stat->max_copy_latency_ticks = tsc_diff; 1304 } 1305 if (stat->min_copy_latency_ticks > tsc_diff) { 1306 stat->min_copy_latency_ticks = tsc_diff; 1307 } 1308 break; 1309 default: 1310 break; 1311 } 1312 } 1313 1314 static bool 1315 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1316 const struct spdk_nvme_cpl *cpl, 1317 struct nvme_bdev_channel *nbdev_ch, 1318 uint64_t *_delay_ms) 1319 { 1320 struct nvme_io_path *io_path = bio->io_path; 1321 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1322 const struct spdk_nvme_ctrlr_data *cdata; 1323 1324 if (spdk_nvme_cpl_is_path_error(cpl) || 1325 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1326 !nvme_io_path_is_available(io_path) || 1327 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1328 bdev_nvme_clear_current_io_path(nbdev_ch); 1329 bio->io_path = NULL; 1330 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1331 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1332 io_path->nvme_ns->ana_state_updating = true; 1333 } 1334 } 1335 if (!any_io_path_may_become_available(nbdev_ch)) { 1336 return false; 1337 } 1338 *_delay_ms = 0; 1339 } else { 1340 bio->retry_count++; 1341 1342 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1343 1344 if (cpl->status.crd != 0) { 1345 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1346 } else { 1347 *_delay_ms = 0; 1348 } 1349 } 1350 1351 return true; 1352 } 1353 1354 static inline void 1355 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1356 const struct spdk_nvme_cpl *cpl) 1357 { 1358 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1359 struct nvme_bdev_channel *nbdev_ch; 1360 uint64_t delay_ms; 1361 1362 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1363 1364 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1365 bdev_nvme_update_io_path_stat(bio); 1366 goto complete; 1367 } 1368 1369 /* Update error counts before deciding if retry is needed. 1370 * Hence, error counts may be more than the number of I/O errors. 
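	 * (An I/O that is retried and fails again is counted once per failed attempt.)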
1371 */ 1372 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1373 1374 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1375 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1376 goto complete; 1377 } 1378 1379 /* At this point we don't know whether the sequence was successfully executed or not, so we 1380 * cannot retry the IO */ 1381 if (bdev_io->u.bdev.accel_sequence != NULL) { 1382 goto complete; 1383 } 1384 1385 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1386 1387 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1388 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1389 return; 1390 } 1391 1392 complete: 1393 bio->retry_count = 0; 1394 bio->submit_tsc = 0; 1395 bdev_io->u.bdev.accel_sequence = NULL; 1396 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1397 } 1398 1399 static inline void 1400 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1401 { 1402 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1403 struct nvme_bdev_channel *nbdev_ch; 1404 enum spdk_bdev_io_status io_status; 1405 1406 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1407 1408 switch (rc) { 1409 case 0: 1410 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1411 break; 1412 case -ENOMEM: 1413 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1414 break; 1415 case -ENXIO: 1416 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1417 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1418 1419 bdev_nvme_clear_current_io_path(nbdev_ch); 1420 bio->io_path = NULL; 1421 1422 if (any_io_path_may_become_available(nbdev_ch)) { 1423 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1424 return; 1425 } 1426 } 1427 1428 /* fallthrough */ 1429 default: 1430 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1431 bdev_io->u.bdev.accel_sequence = NULL; 1432 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1433 break; 1434 } 1435 1436 bio->retry_count = 0; 1437 bio->submit_tsc = 0; 1438 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1439 } 1440 1441 static inline void 1442 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1443 { 1444 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1445 enum spdk_bdev_io_status io_status; 1446 1447 switch (rc) { 1448 case 0: 1449 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1450 break; 1451 case -ENOMEM: 1452 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1453 break; 1454 case -ENXIO: 1455 /* fallthrough */ 1456 default: 1457 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1458 break; 1459 } 1460 1461 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1462 } 1463 1464 static void 1465 bdev_nvme_clear_io_path_caches_done(struct spdk_io_channel_iter *i, int status) 1466 { 1467 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 1468 1469 pthread_mutex_lock(&nvme_ctrlr->mutex); 1470 1471 assert(nvme_ctrlr->io_path_cache_clearing == true); 1472 nvme_ctrlr->io_path_cache_clearing = false; 1473 1474 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1475 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1476 return; 1477 } 1478 1479 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1480 1481 nvme_ctrlr_unregister(nvme_ctrlr); 1482 } 1483 1484 static void 1485 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1486 { 1487 struct nvme_io_path *io_path; 1488 1489 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1490 if (io_path->nbdev_ch == NULL) { 1491 continue; 1492 } 1493 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1494 
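		/* The next I/O submitted on this channel will re-select a path in bdev_nvme_find_io_path(). */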
} 1495 } 1496 1497 static void 1498 bdev_nvme_clear_io_path_cache(struct spdk_io_channel_iter *i) 1499 { 1500 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1501 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1502 1503 assert(ctrlr_ch->qpair != NULL); 1504 1505 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1506 1507 spdk_for_each_channel_continue(i, 0); 1508 } 1509 1510 static void 1511 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1512 { 1513 pthread_mutex_lock(&nvme_ctrlr->mutex); 1514 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1515 nvme_ctrlr->io_path_cache_clearing) { 1516 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1517 return; 1518 } 1519 1520 nvme_ctrlr->io_path_cache_clearing = true; 1521 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1522 1523 spdk_for_each_channel(nvme_ctrlr, 1524 bdev_nvme_clear_io_path_cache, 1525 NULL, 1526 bdev_nvme_clear_io_path_caches_done); 1527 } 1528 1529 static struct nvme_qpair * 1530 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1531 { 1532 struct nvme_qpair *nvme_qpair; 1533 1534 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1535 if (nvme_qpair->qpair == qpair) { 1536 break; 1537 } 1538 } 1539 1540 return nvme_qpair; 1541 } 1542 1543 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1544 1545 static void 1546 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1547 { 1548 struct nvme_poll_group *group = poll_group_ctx; 1549 struct nvme_qpair *nvme_qpair; 1550 struct nvme_ctrlr_channel *ctrlr_ch; 1551 int status; 1552 1553 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1554 if (nvme_qpair == NULL) { 1555 return; 1556 } 1557 1558 if (nvme_qpair->qpair != NULL) { 1559 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1560 nvme_qpair->qpair = NULL; 1561 } 1562 1563 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1564 1565 ctrlr_ch = nvme_qpair->ctrlr_ch; 1566 1567 if (ctrlr_ch != NULL) { 1568 if (ctrlr_ch->reset_iter != NULL) { 1569 /* We are in a full reset sequence. */ 1570 if (ctrlr_ch->connect_poller != NULL) { 1571 /* qpair was failed to connect. Abort the reset sequence. */ 1572 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1573 qpair); 1574 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1575 status = -1; 1576 } else { 1577 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1578 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1579 qpair); 1580 status = 0; 1581 } 1582 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1583 ctrlr_ch->reset_iter = NULL; 1584 } else { 1585 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1586 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1587 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1588 } 1589 } else { 1590 /* In this case, ctrlr_channel is already deleted. */ 1591 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1592 nvme_qpair_delete(nvme_qpair); 1593 } 1594 } 1595 1596 static void 1597 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1598 { 1599 struct nvme_qpair *nvme_qpair; 1600 1601 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1602 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1603 continue; 1604 } 1605 1606 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1607 SPDK_NVME_QPAIR_FAILURE_NONE) { 1608 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1609 } 1610 } 1611 } 1612 1613 static int 1614 bdev_nvme_poll(void *arg) 1615 { 1616 struct nvme_poll_group *group = arg; 1617 int64_t num_completions; 1618 1619 if (group->collect_spin_stat && group->start_ticks == 0) { 1620 group->start_ticks = spdk_get_ticks(); 1621 } 1622 1623 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1624 bdev_nvme_disconnected_qpair_cb); 1625 if (group->collect_spin_stat) { 1626 if (num_completions > 0) { 1627 if (group->end_ticks != 0) { 1628 group->spin_ticks += (group->end_ticks - group->start_ticks); 1629 group->end_ticks = 0; 1630 } 1631 group->start_ticks = 0; 1632 } else { 1633 group->end_ticks = spdk_get_ticks(); 1634 } 1635 } 1636 1637 if (spdk_unlikely(num_completions < 0)) { 1638 bdev_nvme_check_io_qpairs(group); 1639 } 1640 1641 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1642 } 1643 1644 static int bdev_nvme_poll_adminq(void *arg); 1645 1646 static void 1647 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1648 { 1649 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1650 1651 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1652 nvme_ctrlr, new_period_us); 1653 } 1654 1655 static int 1656 bdev_nvme_poll_adminq(void *arg) 1657 { 1658 int32_t rc; 1659 struct nvme_ctrlr *nvme_ctrlr = arg; 1660 nvme_ctrlr_disconnected_cb disconnected_cb; 1661 1662 assert(nvme_ctrlr != NULL); 1663 1664 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1665 if (rc < 0) { 1666 disconnected_cb = nvme_ctrlr->disconnected_cb; 1667 nvme_ctrlr->disconnected_cb = NULL; 1668 1669 if (disconnected_cb != NULL) { 1670 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1671 g_opts.nvme_adminq_poll_period_us); 1672 disconnected_cb(nvme_ctrlr); 1673 } else { 1674 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1675 } 1676 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1677 SPDK_NVME_QPAIR_FAILURE_NONE) { 1678 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1679 } 1680 1681 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1682 } 1683 1684 static void 1685 nvme_bdev_free(void *io_device) 1686 { 1687 struct nvme_bdev *nvme_disk = io_device; 1688 1689 pthread_mutex_destroy(&nvme_disk->mutex); 1690 free(nvme_disk->disk.name); 1691 free(nvme_disk->err_stat); 1692 free(nvme_disk); 1693 } 1694 1695 static int 1696 bdev_nvme_destruct(void *ctx) 1697 { 1698 struct nvme_bdev *nvme_disk = ctx; 1699 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1700 1701 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1702 1703 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1704 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1705 1706 nvme_ns->bdev = NULL; 1707 1708 assert(nvme_ns->id > 0); 1709 1710 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1711 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1712 1713 nvme_ctrlr_release(nvme_ns->ctrlr); 1714 nvme_ns_free(nvme_ns); 1715 } else { 1716 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1717 } 1718 } 1719 1720 pthread_mutex_lock(&g_bdev_nvme_mutex); 1721 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1722 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1723 1724 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1725 1726 return 0; 1727 } 1728 1729 static int 1730 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1731 { 1732 struct nvme_ctrlr *nvme_ctrlr; 1733 struct spdk_nvme_io_qpair_opts opts; 1734 struct spdk_nvme_qpair *qpair; 1735 int rc; 1736 1737 nvme_ctrlr = nvme_qpair->ctrlr; 1738 1739 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1740 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1741 opts.create_only = true; 1742 opts.async_mode = true; 1743 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1744 g_opts.io_queue_requests = opts.io_queue_requests; 1745 1746 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1747 if (qpair == NULL) { 1748 return -1; 1749 } 1750 1751 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1752 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1753 1754 assert(nvme_qpair->group != NULL); 1755 1756 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1757 if (rc != 0) { 1758 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1759 goto err; 1760 } 1761 1762 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1763 if (rc != 0) { 1764 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1765 goto err; 1766 } 1767 1768 nvme_qpair->qpair = qpair; 1769 1770 if (!g_opts.disable_auto_failback) { 1771 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1772 } 1773 1774 return 0; 1775 1776 err: 1777 spdk_nvme_ctrlr_free_io_qpair(qpair); 1778 1779 return rc; 1780 } 1781 1782 static void 1783 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 1784 { 1785 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 1786 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch); 1787 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 1788 struct nvme_bdev_io *bio; 1789 1790 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 1791 status = SPDK_BDEV_IO_STATUS_FAILED; 1792 } 1793 1794 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1795 bio = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1796 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link); 1797 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), status, NULL); 1798 } 1799 1800 
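	/* Always continue the iteration with status 0; the status of each pending reset was decided above. */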
spdk_for_each_channel_continue(i, 0); 1801 } 1802 1803 /* This function marks the current trid as failed by storing the current ticks 1804 * and then sets the next trid to the active trid within a controller if exists. 1805 * 1806 * The purpose of the boolean return value is to request the caller to disconnect 1807 * the current trid now to try connecting the next trid. 1808 */ 1809 static bool 1810 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1811 { 1812 struct nvme_path_id *path_id, *next_path; 1813 int rc __attribute__((unused)); 1814 1815 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1816 assert(path_id); 1817 assert(path_id == nvme_ctrlr->active_path_id); 1818 next_path = TAILQ_NEXT(path_id, link); 1819 1820 /* Update the last failed time. It means the trid is failed if its last 1821 * failed time is non-zero. 1822 */ 1823 path_id->last_failed_tsc = spdk_get_ticks(); 1824 1825 if (next_path == NULL) { 1826 /* There is no alternate trid within a controller. */ 1827 return false; 1828 } 1829 1830 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1831 /* Connect is not retried in a controller reset sequence. Connecting 1832 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1833 */ 1834 return false; 1835 } 1836 1837 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1838 1839 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1840 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1841 1842 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1843 nvme_ctrlr->active_path_id = next_path; 1844 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1845 assert(rc == 0); 1846 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1847 if (!remove) { 1848 /** Shuffle the old trid to the end of the list and use the new one. 1849 * Allows for round robin through multiple connections. 1850 */ 1851 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 1852 } else { 1853 free(path_id); 1854 } 1855 1856 if (start || next_path->last_failed_tsc == 0) { 1857 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 1858 * or used yet. Try the next trid now. 1859 */ 1860 return true; 1861 } 1862 1863 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 1864 nvme_ctrlr->opts.reconnect_delay_sec) { 1865 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 1866 return true; 1867 } 1868 1869 /* The next trid will be tried after reconnect_delay_sec seconds. 
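	 * Returning false tells the caller not to disconnect the current trid yet.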
*/ 1870 return false; 1871 } 1872 1873 static bool 1874 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 1875 { 1876 int32_t elapsed; 1877 1878 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 1879 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 1880 return false; 1881 } 1882 1883 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1884 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 1885 return true; 1886 } else { 1887 return false; 1888 } 1889 } 1890 1891 static bool 1892 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 1893 { 1894 uint32_t elapsed; 1895 1896 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 1897 return false; 1898 } 1899 1900 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 1901 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 1902 return true; 1903 } else { 1904 return false; 1905 } 1906 } 1907 1908 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 1909 1910 static void 1911 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 1912 { 1913 int rc; 1914 1915 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 1916 if (rc != 0) { 1917 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 1918 * fail the reset sequence immediately. 1919 */ 1920 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 1921 return; 1922 } 1923 1924 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 1925 * Set callback here to execute the specified operation after ctrlr is really disconnected. 1926 */ 1927 assert(nvme_ctrlr->disconnected_cb == NULL); 1928 nvme_ctrlr->disconnected_cb = cb_fn; 1929 1930 /* During disconnection, reduce the period to poll adminq more often. */ 1931 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 1932 } 1933 1934 enum bdev_nvme_op_after_reset { 1935 OP_NONE, 1936 OP_COMPLETE_PENDING_DESTRUCT, 1937 OP_DESTRUCT, 1938 OP_DELAYED_RECONNECT, 1939 OP_FAILOVER, 1940 }; 1941 1942 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 1943 1944 static _bdev_nvme_op_after_reset 1945 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 1946 { 1947 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1948 /* Complete pending destruct after reset completes. 
*/ 1949 return OP_COMPLETE_PENDING_DESTRUCT; 1950 } else if (nvme_ctrlr->pending_failover) { 1951 nvme_ctrlr->pending_failover = false; 1952 nvme_ctrlr->reset_start_tsc = 0; 1953 return OP_FAILOVER; 1954 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1955 nvme_ctrlr->reset_start_tsc = 0; 1956 return OP_NONE; 1957 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 1958 return OP_DESTRUCT; 1959 } else { 1960 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 1961 nvme_ctrlr->fast_io_fail_timedout = true; 1962 } 1963 return OP_DELAYED_RECONNECT; 1964 } 1965 } 1966 1967 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 1968 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 1969 1970 static int 1971 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 1972 { 1973 struct nvme_ctrlr *nvme_ctrlr = ctx; 1974 1975 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 1976 pthread_mutex_lock(&nvme_ctrlr->mutex); 1977 1978 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 1979 1980 if (!nvme_ctrlr->reconnect_is_delayed) { 1981 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1982 return SPDK_POLLER_BUSY; 1983 } 1984 1985 nvme_ctrlr->reconnect_is_delayed = false; 1986 1987 if (nvme_ctrlr->destruct) { 1988 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1989 return SPDK_POLLER_BUSY; 1990 } 1991 1992 assert(nvme_ctrlr->resetting == false); 1993 nvme_ctrlr->resetting = true; 1994 1995 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1996 1997 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 1998 1999 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2000 return SPDK_POLLER_BUSY; 2001 } 2002 2003 static void 2004 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2005 { 2006 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2007 2008 assert(nvme_ctrlr->reconnect_is_delayed == false); 2009 nvme_ctrlr->reconnect_is_delayed = true; 2010 2011 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2012 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2013 nvme_ctrlr, 2014 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2015 } 2016 2017 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2018 2019 static void 2020 _bdev_nvme_reset_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2021 { 2022 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2023 bool success = spdk_io_channel_iter_get_ctx(i) == NULL; 2024 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2025 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2026 enum bdev_nvme_op_after_reset op_after_reset; 2027 2028 assert(nvme_ctrlr->thread == spdk_get_thread()); 2029 2030 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2031 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2032 2033 if (!success) { 2034 SPDK_ERRLOG("Resetting controller failed.\n"); 2035 } else { 2036 SPDK_NOTICELOG("Resetting controller successful.\n"); 2037 } 2038 2039 pthread_mutex_lock(&nvme_ctrlr->mutex); 2040 nvme_ctrlr->resetting = false; 2041 nvme_ctrlr->dont_retry = false; 2042 nvme_ctrlr->in_failover = false; 2043 2044 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2045 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2046 2047 /* Delay callbacks when the next operation is a failover. */ 2048 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2049 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2050 } 2051 2052 switch (op_after_reset) { 2053 case OP_COMPLETE_PENDING_DESTRUCT: 2054 nvme_ctrlr_unregister(nvme_ctrlr); 2055 break; 2056 case OP_DESTRUCT: 2057 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2058 remove_discovery_entry(nvme_ctrlr); 2059 break; 2060 case OP_DELAYED_RECONNECT: 2061 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2062 break; 2063 case OP_FAILOVER: 2064 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2065 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2066 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2067 break; 2068 default: 2069 break; 2070 } 2071 } 2072 2073 static void 2074 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2075 { 2076 pthread_mutex_lock(&nvme_ctrlr->mutex); 2077 if (!success) { 2078 /* Connecting the active trid failed. Set the next alternate trid to the 2079 * active trid if it exists. 2080 */ 2081 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2082 /* The next alternate trid exists and is ready to try. Try it now. */ 2083 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2084 2085 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2086 return; 2087 } 2088 2089 /* We came here if there is no alternate trid or if the next trid exists but 2090 * is not ready to try. We will try the active trid after reconnect_delay_sec 2091 * seconds if it is non-zero or at the next reset call otherwise. 2092 */ 2093 } else { 2094 /* Connecting the active trid succeeded. Clear the last failed time because it 2095 * means the trid is failed if its last failed time is non-zero. 2096 */ 2097 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2098 } 2099 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2100 2101 /* Make sure we clear any pending resets before returning. */ 2102 spdk_for_each_channel(nvme_ctrlr, 2103 bdev_nvme_complete_pending_resets, 2104 success ? NULL : (void *)0x1, 2105 _bdev_nvme_reset_ctrlr_complete); 2106 } 2107 2108 static void 2109 bdev_nvme_reset_create_qpairs_failed(struct spdk_io_channel_iter *i, int status) 2110 { 2111 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2112 2113 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2114 } 2115 2116 static void 2117 bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 2118 { 2119 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 2120 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 2121 struct nvme_qpair *nvme_qpair; 2122 2123 nvme_qpair = ctrlr_ch->qpair; 2124 assert(nvme_qpair != NULL); 2125 2126 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2127 2128 if (nvme_qpair->qpair != NULL) { 2129 if (nvme_qpair->ctrlr->dont_retry) { 2130 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2131 } 2132 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2133 2134 /* The current full reset sequence will move to the next 2135 * ctrlr_channel after the qpair is actually disconnected. 2136 */ 2137 assert(ctrlr_ch->reset_iter == NULL); 2138 ctrlr_ch->reset_iter = i; 2139 } else { 2140 spdk_for_each_channel_continue(i, 0); 2141 } 2142 } 2143 2144 static void 2145 bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 2146 { 2147 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2148 2149 if (status == 0) { 2150 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2151 } else { 2152 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
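 * bdev_nvme_reset_create_qpairs_failed() then completes the reset sequence as failed.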
*/
2153 spdk_for_each_channel(nvme_ctrlr,
2154 bdev_nvme_reset_destroy_qpair,
2155 NULL,
2156 bdev_nvme_reset_create_qpairs_failed);
2157 }
2158 }
2159
2160 static int
2161 bdev_nvme_reset_check_qpair_connected(void *ctx)
2162 {
2163 struct nvme_ctrlr_channel *ctrlr_ch = ctx;
2164
2165 if (ctrlr_ch->reset_iter == NULL) {
2166 /* The qpair already failed to connect and the reset sequence is being aborted. */
2167 assert(ctrlr_ch->connect_poller == NULL);
2168 assert(ctrlr_ch->qpair->qpair == NULL);
2169 return SPDK_POLLER_BUSY;
2170 }
2171
2172 assert(ctrlr_ch->qpair->qpair != NULL);
2173
2174 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) {
2175 return SPDK_POLLER_BUSY;
2176 }
2177
2178 spdk_poller_unregister(&ctrlr_ch->connect_poller);
2179
2180 /* The qpair finished connecting. Move to the next ctrlr_channel. */
2181 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
2182 ctrlr_ch->reset_iter = NULL;
2183
2184 if (!g_opts.disable_auto_failback) {
2185 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);
2186 }
2187
2188 return SPDK_POLLER_BUSY;
2189 }
2190
2191 static void
2192 bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
2193 {
2194 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
2195 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
2196 int rc;
2197
2198 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair);
2199 if (rc == 0) {
2200 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
2201 ctrlr_ch, 0);
2202
2203 /* The current full reset sequence will move to the next
2204 * ctrlr_channel after the qpair is actually connected.
2205 */
2206 assert(ctrlr_ch->reset_iter == NULL);
2207 ctrlr_ch->reset_iter = i;
2208 } else {
2209 spdk_for_each_channel_continue(i, rc);
2210 }
2211 }
2212
2213 static int
2214 bdev_nvme_reconnect_ctrlr_poll(void *arg)
2215 {
2216 struct nvme_ctrlr *nvme_ctrlr = arg;
2217 int rc = -ETIMEDOUT;
2218
2219 if (!bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2220 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
2221 if (rc == -EAGAIN) {
2222 return SPDK_POLLER_BUSY;
2223 }
2224 }
2225
2226 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
2227 if (rc == 0) {
2228 /* Recreate all of the I/O queue pairs */
2229 spdk_for_each_channel(nvme_ctrlr,
2230 bdev_nvme_reset_create_qpair,
2231 NULL,
2232 bdev_nvme_reset_create_qpairs_done);
2233 } else {
2234 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2235 }
2236 return SPDK_POLLER_BUSY;
2237 }
2238
2239 static void
2240 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2241 {
2242 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
2243
2244 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
2245 assert(nvme_ctrlr->reset_detach_poller == NULL);
2246 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
2247 nvme_ctrlr, 0);
2248 }
2249
2250 static void
2251 bdev_nvme_reset_destroy_qpair_done(struct spdk_io_channel_iter *i, int status)
2252 {
2253 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
2254
2255 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
2256 assert(status == 0);
2257
2258 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2259 bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2260 } else {
2261 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2262 }
2263 }
2264
2265 static void
2266 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2267 {
2268
spdk_for_each_channel(nvme_ctrlr, 2269 bdev_nvme_reset_destroy_qpair, 2270 NULL, 2271 bdev_nvme_reset_destroy_qpair_done); 2272 } 2273 2274 static void 2275 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2276 { 2277 struct nvme_ctrlr *nvme_ctrlr = ctx; 2278 2279 assert(nvme_ctrlr->resetting == true); 2280 assert(nvme_ctrlr->thread == spdk_get_thread()); 2281 2282 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2283 2284 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2285 2286 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2287 } 2288 2289 static void 2290 _bdev_nvme_reset_ctrlr(void *ctx) 2291 { 2292 struct nvme_ctrlr *nvme_ctrlr = ctx; 2293 2294 assert(nvme_ctrlr->resetting == true); 2295 assert(nvme_ctrlr->thread == spdk_get_thread()); 2296 2297 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2298 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2299 } else { 2300 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2301 } 2302 } 2303 2304 static int 2305 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2306 { 2307 spdk_msg_fn msg_fn; 2308 2309 pthread_mutex_lock(&nvme_ctrlr->mutex); 2310 if (nvme_ctrlr->destruct) { 2311 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2312 return -ENXIO; 2313 } 2314 2315 if (nvme_ctrlr->resetting) { 2316 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2317 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2318 return -EBUSY; 2319 } 2320 2321 if (nvme_ctrlr->disabled) { 2322 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2323 SPDK_NOTICELOG("Unable to perform reset. Controller is disabled.\n"); 2324 return -EALREADY; 2325 } 2326 2327 nvme_ctrlr->resetting = true; 2328 nvme_ctrlr->dont_retry = true; 2329 2330 if (nvme_ctrlr->reconnect_is_delayed) { 2331 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2332 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2333 nvme_ctrlr->reconnect_is_delayed = false; 2334 } else { 2335 msg_fn = _bdev_nvme_reset_ctrlr; 2336 assert(nvme_ctrlr->reset_start_tsc == 0); 2337 } 2338 2339 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2340 2341 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2342 2343 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2344 return 0; 2345 } 2346 2347 static int 2348 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2349 { 2350 pthread_mutex_lock(&nvme_ctrlr->mutex); 2351 if (nvme_ctrlr->destruct) { 2352 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2353 return -ENXIO; 2354 } 2355 2356 if (nvme_ctrlr->resetting) { 2357 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2358 return -EBUSY; 2359 } 2360 2361 if (!nvme_ctrlr->disabled) { 2362 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2363 return -EALREADY; 2364 } 2365 2366 nvme_ctrlr->disabled = false; 2367 nvme_ctrlr->resetting = true; 2368 2369 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2370 2371 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2372 2373 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2374 return 0; 2375 } 2376 2377 static void 2378 _bdev_nvme_disable_ctrlr_complete(struct spdk_io_channel_iter *i, int status) 2379 { 2380 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2381 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2382 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2383 enum bdev_nvme_op_after_reset op_after_disable; 2384 2385 assert(nvme_ctrlr->thread == spdk_get_thread()); 2386 2387 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2388 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2389 2390 pthread_mutex_lock(&nvme_ctrlr->mutex); 2391 2392 
nvme_ctrlr->resetting = false; 2393 nvme_ctrlr->dont_retry = false; 2394 2395 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2396 2397 nvme_ctrlr->disabled = true; 2398 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2399 2400 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2401 2402 if (ctrlr_op_cb_fn) { 2403 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2404 } 2405 2406 switch (op_after_disable) { 2407 case OP_COMPLETE_PENDING_DESTRUCT: 2408 nvme_ctrlr_unregister(nvme_ctrlr); 2409 break; 2410 default: 2411 break; 2412 } 2413 2414 } 2415 2416 static void 2417 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2418 { 2419 /* Make sure we clear any pending resets before returning. */ 2420 spdk_for_each_channel(nvme_ctrlr, 2421 bdev_nvme_complete_pending_resets, 2422 NULL, 2423 _bdev_nvme_disable_ctrlr_complete); 2424 } 2425 2426 static void 2427 bdev_nvme_disable_destroy_qpairs_done(struct spdk_io_channel_iter *i, int status) 2428 { 2429 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 2430 2431 assert(status == 0); 2432 2433 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2434 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2435 } else { 2436 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2437 } 2438 } 2439 2440 static void 2441 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2442 { 2443 spdk_for_each_channel(nvme_ctrlr, 2444 bdev_nvme_reset_destroy_qpair, 2445 NULL, 2446 bdev_nvme_disable_destroy_qpairs_done); 2447 } 2448 2449 static void 2450 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2451 { 2452 struct nvme_ctrlr *nvme_ctrlr = ctx; 2453 2454 assert(nvme_ctrlr->resetting == true); 2455 assert(nvme_ctrlr->thread == spdk_get_thread()); 2456 2457 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2458 2459 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2460 } 2461 2462 static void 2463 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2464 { 2465 struct nvme_ctrlr *nvme_ctrlr = ctx; 2466 2467 assert(nvme_ctrlr->resetting == true); 2468 assert(nvme_ctrlr->thread == spdk_get_thread()); 2469 2470 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2471 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2472 } else { 2473 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2474 } 2475 } 2476 2477 static int 2478 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2479 { 2480 spdk_msg_fn msg_fn; 2481 2482 pthread_mutex_lock(&nvme_ctrlr->mutex); 2483 if (nvme_ctrlr->destruct) { 2484 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2485 return -ENXIO; 2486 } 2487 2488 if (nvme_ctrlr->resetting) { 2489 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2490 return -EBUSY; 2491 } 2492 2493 if (nvme_ctrlr->disabled) { 2494 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2495 return -EALREADY; 2496 } 2497 2498 nvme_ctrlr->resetting = true; 2499 nvme_ctrlr->dont_retry = true; 2500 2501 if (nvme_ctrlr->reconnect_is_delayed) { 2502 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2503 nvme_ctrlr->reconnect_is_delayed = false; 2504 } else { 2505 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2506 } 2507 2508 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2509 2510 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2511 2512 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2513 return 0; 2514 } 2515 2516 static int 2517 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2518 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2519 { 2520 int rc; 2521 2522 switch (op) { 2523 case 
NVME_CTRLR_OP_RESET: 2524 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2525 break; 2526 case NVME_CTRLR_OP_ENABLE: 2527 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2528 break; 2529 case NVME_CTRLR_OP_DISABLE: 2530 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2531 break; 2532 default: 2533 rc = -EINVAL; 2534 break; 2535 } 2536 2537 if (rc == 0) { 2538 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2539 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2540 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2541 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2542 } 2543 return rc; 2544 } 2545 2546 struct nvme_ctrlr_op_rpc_ctx { 2547 struct nvme_ctrlr *nvme_ctrlr; 2548 struct spdk_thread *orig_thread; 2549 enum nvme_ctrlr_op op; 2550 int rc; 2551 bdev_nvme_ctrlr_op_cb cb_fn; 2552 void *cb_arg; 2553 }; 2554 2555 static void 2556 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2557 { 2558 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2559 2560 assert(ctx != NULL); 2561 assert(ctx->cb_fn != NULL); 2562 2563 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2564 2565 free(ctx); 2566 } 2567 2568 static void 2569 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2570 { 2571 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2572 2573 ctx->rc = rc; 2574 2575 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2576 } 2577 2578 void 2579 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2580 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2581 { 2582 struct nvme_ctrlr_op_rpc_ctx *ctx; 2583 int rc; 2584 2585 assert(cb_fn != NULL); 2586 2587 ctx = calloc(1, sizeof(*ctx)); 2588 if (ctx == NULL) { 2589 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2590 cb_fn(cb_arg, -ENOMEM); 2591 return; 2592 } 2593 2594 ctx->orig_thread = spdk_get_thread(); 2595 ctx->cb_fn = cb_fn; 2596 ctx->cb_arg = cb_arg; 2597 2598 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2599 if (rc == 0) { 2600 return; 2601 } else if (rc == -EALREADY) { 2602 rc = 0; 2603 } 2604 2605 nvme_ctrlr_op_rpc_complete(ctx, rc); 2606 } 2607 2608 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2609 2610 static void 2611 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2612 { 2613 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2614 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2615 int rc; 2616 2617 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2618 ctx->nvme_ctrlr = NULL; 2619 2620 if (ctx->rc != 0) { 2621 goto complete; 2622 } 2623 2624 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2625 if (next_nvme_ctrlr == NULL) { 2626 goto complete; 2627 } 2628 2629 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2630 if (rc == 0) { 2631 ctx->nvme_ctrlr = next_nvme_ctrlr; 2632 return; 2633 } else if (rc == -EALREADY) { 2634 ctx->nvme_ctrlr = next_nvme_ctrlr; 2635 rc = 0; 2636 } 2637 2638 ctx->rc = rc; 2639 2640 complete: 2641 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2642 free(ctx); 2643 } 2644 2645 static void 2646 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2647 { 2648 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2649 2650 ctx->rc = rc; 2651 2652 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2653 } 2654 2655 void 2656 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2657 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2658 { 2659 struct nvme_ctrlr_op_rpc_ctx *ctx; 2660 struct nvme_ctrlr *nvme_ctrlr; 2661 int rc; 2662 2663 assert(cb_fn != NULL); 2664 2665 ctx = calloc(1, sizeof(*ctx)); 2666 if (ctx == NULL) { 2667 SPDK_ERRLOG("Failed to allocate 
nvme_ctrlr_op_rpc_ctx.\n"); 2668 cb_fn(cb_arg, -ENOMEM); 2669 return; 2670 } 2671 2672 ctx->orig_thread = spdk_get_thread(); 2673 ctx->op = op; 2674 ctx->cb_fn = cb_fn; 2675 ctx->cb_arg = cb_arg; 2676 2677 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2678 assert(nvme_ctrlr != NULL); 2679 2680 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2681 if (rc == 0) { 2682 ctx->nvme_ctrlr = nvme_ctrlr; 2683 return; 2684 } else if (rc == -EALREADY) { 2685 ctx->nvme_ctrlr = nvme_ctrlr; 2686 rc = 0; 2687 } 2688 2689 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2690 } 2691 2692 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2693 2694 static void 2695 _bdev_nvme_reset_io_complete(struct spdk_io_channel_iter *i, int status) 2696 { 2697 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 2698 enum spdk_bdev_io_status io_status; 2699 2700 if (bio->cpl.cdw0 == 0) { 2701 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2702 } else { 2703 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2704 } 2705 2706 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2707 } 2708 2709 static void 2710 bdev_nvme_abort_bdev_channel(struct spdk_io_channel_iter *i) 2711 { 2712 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 2713 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 2714 2715 bdev_nvme_abort_retry_ios(nbdev_ch); 2716 2717 spdk_for_each_channel_continue(i, 0); 2718 } 2719 2720 static void 2721 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2722 { 2723 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2724 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2725 2726 /* Abort all queued I/Os for retry. */ 2727 spdk_for_each_channel(nbdev, 2728 bdev_nvme_abort_bdev_channel, 2729 bio, 2730 _bdev_nvme_reset_io_complete); 2731 } 2732 2733 static void 2734 _bdev_nvme_reset_io_continue(void *ctx) 2735 { 2736 struct nvme_bdev_io *bio = ctx; 2737 struct nvme_io_path *prev_io_path, *next_io_path; 2738 int rc; 2739 2740 prev_io_path = bio->io_path; 2741 bio->io_path = NULL; 2742 2743 if (bio->cpl.cdw0 != 0) { 2744 goto complete; 2745 } 2746 2747 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2748 if (next_io_path == NULL) { 2749 goto complete; 2750 } 2751 2752 rc = _bdev_nvme_reset_io(next_io_path, bio); 2753 if (rc == 0) { 2754 return; 2755 } 2756 2757 bio->cpl.cdw0 = 1; 2758 2759 complete: 2760 bdev_nvme_reset_io_complete(bio); 2761 } 2762 2763 static void 2764 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2765 { 2766 struct nvme_bdev_io *bio = cb_arg; 2767 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2768 2769 bio->cpl.cdw0 = (rc == 0) ? 0 : 1; 2770 2771 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2772 } 2773 2774 static int 2775 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2776 { 2777 struct nvme_ctrlr_channel *ctrlr_ch; 2778 int rc; 2779 2780 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2781 bdev_nvme_reset_io_continue, bio); 2782 if (rc == 0) { 2783 assert(bio->io_path == NULL); 2784 bio->io_path = io_path; 2785 } else if (rc == -EBUSY) { 2786 ctrlr_ch = io_path->qpair->ctrlr_ch; 2787 assert(ctrlr_ch != NULL); 2788 /* 2789 * Reset call is queued only if it is from the app framework. This is on purpose so that 2790 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2791 * upper level. 
If they are in the middle of a reset, we won't try to schedule another one. 2792 */ 2793 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2794 rc = 0; 2795 } 2796 2797 return rc; 2798 } 2799 2800 static void 2801 bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio) 2802 { 2803 struct nvme_io_path *io_path; 2804 int rc; 2805 2806 bio->cpl.cdw0 = 0; 2807 2808 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2809 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2810 assert(io_path != NULL); 2811 2812 rc = _bdev_nvme_reset_io(io_path, bio); 2813 if (rc != 0) { 2814 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2815 bdev_nvme_reset_io_continue(bio, rc == -EALREADY); 2816 } 2817 } 2818 2819 static int 2820 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 2821 { 2822 if (nvme_ctrlr->destruct) { 2823 /* Don't bother resetting if the controller is in the process of being destructed. */ 2824 return -ENXIO; 2825 } 2826 2827 if (nvme_ctrlr->resetting) { 2828 if (!nvme_ctrlr->in_failover) { 2829 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 2830 2831 /* Defer failover until reset completes. */ 2832 nvme_ctrlr->pending_failover = true; 2833 return -EINPROGRESS; 2834 } else { 2835 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 2836 return -EBUSY; 2837 } 2838 } 2839 2840 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 2841 2842 if (nvme_ctrlr->reconnect_is_delayed) { 2843 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 2844 2845 /* We rely on the next reconnect for the failover. */ 2846 return -EALREADY; 2847 } 2848 2849 if (nvme_ctrlr->disabled) { 2850 SPDK_NOTICELOG("Controller is disabled.\n"); 2851 2852 /* We rely on the enablement for the failover. 
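 * bdev_nvme_enable_ctrlr() will reconnect using whichever trid bdev_nvme_failover_trid()
 * left as the active one above.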
*/ 2853 return -EALREADY; 2854 } 2855 2856 nvme_ctrlr->resetting = true; 2857 nvme_ctrlr->in_failover = true; 2858 2859 assert(nvme_ctrlr->reset_start_tsc == 0); 2860 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2861 2862 return 0; 2863 } 2864 2865 static int 2866 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2867 { 2868 int rc; 2869 2870 pthread_mutex_lock(&nvme_ctrlr->mutex); 2871 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 2872 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2873 2874 if (rc == 0) { 2875 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 2876 } else if (rc == -EALREADY) { 2877 rc = 0; 2878 } 2879 2880 return rc; 2881 } 2882 2883 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2884 uint64_t num_blocks); 2885 2886 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 2887 uint64_t num_blocks); 2888 2889 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 2890 uint64_t src_offset_blocks, 2891 uint64_t num_blocks); 2892 2893 static void 2894 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2895 bool success) 2896 { 2897 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2898 int ret; 2899 2900 if (!success) { 2901 ret = -EINVAL; 2902 goto exit; 2903 } 2904 2905 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 2906 ret = -ENXIO; 2907 goto exit; 2908 } 2909 2910 ret = bdev_nvme_readv(bio, 2911 bdev_io->u.bdev.iovs, 2912 bdev_io->u.bdev.iovcnt, 2913 bdev_io->u.bdev.md_buf, 2914 bdev_io->u.bdev.num_blocks, 2915 bdev_io->u.bdev.offset_blocks, 2916 bdev_io->u.bdev.dif_check_flags, 2917 bdev_io->u.bdev.memory_domain, 2918 bdev_io->u.bdev.memory_domain_ctx, 2919 bdev_io->u.bdev.accel_sequence); 2920 2921 exit: 2922 if (spdk_unlikely(ret != 0)) { 2923 bdev_nvme_io_complete(bio, ret); 2924 } 2925 } 2926 2927 static inline void 2928 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 2929 { 2930 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 2931 struct spdk_bdev *bdev = bdev_io->bdev; 2932 struct nvme_bdev_io *nbdev_io_to_abort; 2933 int rc = 0; 2934 2935 switch (bdev_io->type) { 2936 case SPDK_BDEV_IO_TYPE_READ: 2937 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 2938 2939 rc = bdev_nvme_readv(nbdev_io, 2940 bdev_io->u.bdev.iovs, 2941 bdev_io->u.bdev.iovcnt, 2942 bdev_io->u.bdev.md_buf, 2943 bdev_io->u.bdev.num_blocks, 2944 bdev_io->u.bdev.offset_blocks, 2945 bdev_io->u.bdev.dif_check_flags, 2946 bdev_io->u.bdev.memory_domain, 2947 bdev_io->u.bdev.memory_domain_ctx, 2948 bdev_io->u.bdev.accel_sequence); 2949 } else { 2950 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 2951 bdev_io->u.bdev.num_blocks * bdev->blocklen); 2952 rc = 0; 2953 } 2954 break; 2955 case SPDK_BDEV_IO_TYPE_WRITE: 2956 rc = bdev_nvme_writev(nbdev_io, 2957 bdev_io->u.bdev.iovs, 2958 bdev_io->u.bdev.iovcnt, 2959 bdev_io->u.bdev.md_buf, 2960 bdev_io->u.bdev.num_blocks, 2961 bdev_io->u.bdev.offset_blocks, 2962 bdev_io->u.bdev.dif_check_flags, 2963 bdev_io->u.bdev.memory_domain, 2964 bdev_io->u.bdev.memory_domain_ctx, 2965 bdev_io->u.bdev.accel_sequence); 2966 break; 2967 case SPDK_BDEV_IO_TYPE_COMPARE: 2968 rc = bdev_nvme_comparev(nbdev_io, 2969 bdev_io->u.bdev.iovs, 2970 bdev_io->u.bdev.iovcnt, 2971 bdev_io->u.bdev.md_buf, 2972 bdev_io->u.bdev.num_blocks, 2973 bdev_io->u.bdev.offset_blocks, 2974 bdev_io->u.bdev.dif_check_flags); 
2975 break; 2976 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 2977 rc = bdev_nvme_comparev_and_writev(nbdev_io, 2978 bdev_io->u.bdev.iovs, 2979 bdev_io->u.bdev.iovcnt, 2980 bdev_io->u.bdev.fused_iovs, 2981 bdev_io->u.bdev.fused_iovcnt, 2982 bdev_io->u.bdev.md_buf, 2983 bdev_io->u.bdev.num_blocks, 2984 bdev_io->u.bdev.offset_blocks, 2985 bdev_io->u.bdev.dif_check_flags); 2986 break; 2987 case SPDK_BDEV_IO_TYPE_UNMAP: 2988 rc = bdev_nvme_unmap(nbdev_io, 2989 bdev_io->u.bdev.offset_blocks, 2990 bdev_io->u.bdev.num_blocks); 2991 break; 2992 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2993 rc = bdev_nvme_write_zeroes(nbdev_io, 2994 bdev_io->u.bdev.offset_blocks, 2995 bdev_io->u.bdev.num_blocks); 2996 break; 2997 case SPDK_BDEV_IO_TYPE_RESET: 2998 nbdev_io->io_path = NULL; 2999 bdev_nvme_reset_io(nbdev_ch, nbdev_io); 3000 return; 3001 3002 case SPDK_BDEV_IO_TYPE_FLUSH: 3003 bdev_nvme_io_complete(nbdev_io, 0); 3004 return; 3005 3006 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3007 rc = bdev_nvme_zone_appendv(nbdev_io, 3008 bdev_io->u.bdev.iovs, 3009 bdev_io->u.bdev.iovcnt, 3010 bdev_io->u.bdev.md_buf, 3011 bdev_io->u.bdev.num_blocks, 3012 bdev_io->u.bdev.offset_blocks, 3013 bdev_io->u.bdev.dif_check_flags); 3014 break; 3015 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3016 rc = bdev_nvme_get_zone_info(nbdev_io, 3017 bdev_io->u.zone_mgmt.zone_id, 3018 bdev_io->u.zone_mgmt.num_zones, 3019 bdev_io->u.zone_mgmt.buf); 3020 break; 3021 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3022 rc = bdev_nvme_zone_management(nbdev_io, 3023 bdev_io->u.zone_mgmt.zone_id, 3024 bdev_io->u.zone_mgmt.zone_action); 3025 break; 3026 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3027 nbdev_io->io_path = NULL; 3028 bdev_nvme_admin_passthru(nbdev_ch, 3029 nbdev_io, 3030 &bdev_io->u.nvme_passthru.cmd, 3031 bdev_io->u.nvme_passthru.buf, 3032 bdev_io->u.nvme_passthru.nbytes); 3033 return; 3034 3035 case SPDK_BDEV_IO_TYPE_NVME_IO: 3036 rc = bdev_nvme_io_passthru(nbdev_io, 3037 &bdev_io->u.nvme_passthru.cmd, 3038 bdev_io->u.nvme_passthru.buf, 3039 bdev_io->u.nvme_passthru.nbytes); 3040 break; 3041 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3042 rc = bdev_nvme_io_passthru_md(nbdev_io, 3043 &bdev_io->u.nvme_passthru.cmd, 3044 bdev_io->u.nvme_passthru.buf, 3045 bdev_io->u.nvme_passthru.nbytes, 3046 bdev_io->u.nvme_passthru.md_buf, 3047 bdev_io->u.nvme_passthru.md_len); 3048 break; 3049 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3050 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3051 &bdev_io->u.nvme_passthru.cmd, 3052 bdev_io->u.nvme_passthru.iovs, 3053 bdev_io->u.nvme_passthru.iovcnt, 3054 bdev_io->u.nvme_passthru.nbytes, 3055 bdev_io->u.nvme_passthru.md_buf, 3056 bdev_io->u.nvme_passthru.md_len); 3057 break; 3058 case SPDK_BDEV_IO_TYPE_ABORT: 3059 nbdev_io->io_path = NULL; 3060 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3061 bdev_nvme_abort(nbdev_ch, 3062 nbdev_io, 3063 nbdev_io_to_abort); 3064 return; 3065 3066 case SPDK_BDEV_IO_TYPE_COPY: 3067 rc = bdev_nvme_copy(nbdev_io, 3068 bdev_io->u.bdev.offset_blocks, 3069 bdev_io->u.bdev.copy.src_offset_blocks, 3070 bdev_io->u.bdev.num_blocks); 3071 break; 3072 default: 3073 rc = -EINVAL; 3074 break; 3075 } 3076 3077 if (spdk_unlikely(rc != 0)) { 3078 bdev_nvme_io_complete(nbdev_io, rc); 3079 } 3080 } 3081 3082 static void 3083 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3084 { 3085 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3086 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3087 3088 if 
(spdk_likely(nbdev_io->submit_tsc == 0)) { 3089 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3090 } else { 3091 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3092 * We need to update submit_tsc here. 3093 */ 3094 nbdev_io->submit_tsc = spdk_get_ticks(); 3095 } 3096 3097 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3098 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3099 if (spdk_unlikely(!nbdev_io->io_path)) { 3100 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3101 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3102 return; 3103 } 3104 3105 /* Admin commands do not use the optimal I/O path. 3106 * Simply fall through even if it is not found. 3107 */ 3108 } 3109 3110 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3111 } 3112 3113 static bool 3114 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3115 { 3116 struct nvme_bdev *nbdev = ctx; 3117 struct nvme_ns *nvme_ns; 3118 struct spdk_nvme_ns *ns; 3119 struct spdk_nvme_ctrlr *ctrlr; 3120 const struct spdk_nvme_ctrlr_data *cdata; 3121 3122 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3123 assert(nvme_ns != NULL); 3124 ns = nvme_ns->ns; 3125 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3126 3127 switch (io_type) { 3128 case SPDK_BDEV_IO_TYPE_READ: 3129 case SPDK_BDEV_IO_TYPE_WRITE: 3130 case SPDK_BDEV_IO_TYPE_RESET: 3131 case SPDK_BDEV_IO_TYPE_FLUSH: 3132 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3133 case SPDK_BDEV_IO_TYPE_NVME_IO: 3134 case SPDK_BDEV_IO_TYPE_ABORT: 3135 return true; 3136 3137 case SPDK_BDEV_IO_TYPE_COMPARE: 3138 return spdk_nvme_ns_supports_compare(ns); 3139 3140 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3141 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3142 3143 case SPDK_BDEV_IO_TYPE_UNMAP: 3144 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3145 return cdata->oncs.dsm; 3146 3147 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3148 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3149 return cdata->oncs.write_zeroes; 3150 3151 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3152 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3153 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3154 return true; 3155 } 3156 return false; 3157 3158 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3159 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3160 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3161 3162 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3163 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3164 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3165 3166 case SPDK_BDEV_IO_TYPE_COPY: 3167 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3168 return cdata->oncs.copy; 3169 3170 default: 3171 return false; 3172 } 3173 } 3174 3175 static int 3176 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3177 { 3178 struct nvme_qpair *nvme_qpair; 3179 struct spdk_io_channel *pg_ch; 3180 int rc; 3181 3182 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3183 if (!nvme_qpair) { 3184 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3185 return -1; 3186 } 3187 3188 TAILQ_INIT(&nvme_qpair->io_path_list); 3189 3190 nvme_qpair->ctrlr = nvme_ctrlr; 3191 nvme_qpair->ctrlr_ch = ctrlr_ch; 3192 3193 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3194 if (!pg_ch) { 3195 free(nvme_qpair); 3196 return -1; 3197 } 3198 3199 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3200 3201 #ifdef SPDK_CONFIG_VTUNE 3202 nvme_qpair->group->collect_spin_stat = true; 3203 #else 3204 nvme_qpair->group->collect_spin_stat = false; 3205 #endif 3206 3207 if (!nvme_ctrlr->disabled) { 3208 /* If 
a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3209 * be created when it's enabled. 3210 */ 3211 rc = bdev_nvme_create_qpair(nvme_qpair); 3212 if (rc != 0) { 3213 /* nvme_ctrlr can't create IO qpair if connection is down. 3214 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3215 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3216 * submitted IO will be queued until IO qpair is successfully created. 3217 * 3218 * Hence, if both are satisfied, ignore the failure. 3219 */ 3220 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3221 spdk_put_io_channel(pg_ch); 3222 free(nvme_qpair); 3223 return rc; 3224 } 3225 } 3226 } 3227 3228 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3229 3230 ctrlr_ch->qpair = nvme_qpair; 3231 3232 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3233 nvme_qpair->ctrlr->ref++; 3234 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3235 3236 return 0; 3237 } 3238 3239 static int 3240 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3241 { 3242 struct nvme_ctrlr *nvme_ctrlr = io_device; 3243 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3244 3245 TAILQ_INIT(&ctrlr_ch->pending_resets); 3246 3247 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3248 } 3249 3250 static void 3251 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3252 { 3253 struct nvme_io_path *io_path, *next; 3254 3255 assert(nvme_qpair->group != NULL); 3256 3257 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3258 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3259 nvme_io_path_free(io_path); 3260 } 3261 3262 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3263 3264 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3265 3266 nvme_ctrlr_release(nvme_qpair->ctrlr); 3267 3268 free(nvme_qpair); 3269 } 3270 3271 static void 3272 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3273 { 3274 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3275 struct nvme_qpair *nvme_qpair; 3276 3277 nvme_qpair = ctrlr_ch->qpair; 3278 assert(nvme_qpair != NULL); 3279 3280 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3281 3282 if (nvme_qpair->qpair != NULL) { 3283 if (ctrlr_ch->reset_iter == NULL) { 3284 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3285 } else { 3286 /* Skip current ctrlr_channel in a full reset sequence because 3287 * it is being deleted now. The qpair is already being disconnected. 3288 * We do not have to restart disconnecting it. 3289 */ 3290 spdk_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3291 } 3292 3293 /* We cannot release a reference to the poll group now. 3294 * The qpair may be disconnected asynchronously later. 3295 * We need to poll it until it is actually disconnected. 3296 * Just detach the qpair from the deleting ctrlr_channel. 
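 * (The detached nvme_qpair is expected to be freed later, via nvme_qpair_delete(), once the
 * disconnect actually completes.)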
3297 */ 3298 nvme_qpair->ctrlr_ch = NULL; 3299 } else { 3300 assert(ctrlr_ch->reset_iter == NULL); 3301 3302 nvme_qpair_delete(nvme_qpair); 3303 } 3304 } 3305 3306 static inline struct spdk_io_channel * 3307 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3308 { 3309 if (spdk_unlikely(!group->accel_channel)) { 3310 group->accel_channel = spdk_accel_get_io_channel(); 3311 if (!group->accel_channel) { 3312 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3313 group); 3314 return NULL; 3315 } 3316 } 3317 3318 return group->accel_channel; 3319 } 3320 3321 static void 3322 bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 3323 uint32_t iov_cnt, uint32_t seed, 3324 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3325 { 3326 struct spdk_io_channel *accel_ch; 3327 struct nvme_poll_group *group = ctx; 3328 int rc; 3329 3330 assert(cb_fn != NULL); 3331 3332 accel_ch = bdev_nvme_get_accel_channel(group); 3333 if (spdk_unlikely(accel_ch == NULL)) { 3334 cb_fn(cb_arg, -ENOMEM); 3335 return; 3336 } 3337 3338 rc = spdk_accel_submit_crc32cv(accel_ch, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 3339 if (rc) { 3340 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 3341 if (rc == -ENOMEM || rc == -EINVAL) { 3342 cb_fn(cb_arg, rc); 3343 } 3344 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 3345 } 3346 } 3347 3348 static void 3349 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3350 { 3351 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3352 } 3353 3354 static void 3355 bdev_nvme_abort_sequence(void *seq) 3356 { 3357 spdk_accel_sequence_abort(seq); 3358 } 3359 3360 static void 3361 bdev_nvme_reverse_sequence(void *seq) 3362 { 3363 spdk_accel_sequence_reverse(seq); 3364 } 3365 3366 static int 3367 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3368 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3369 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3370 { 3371 struct spdk_io_channel *ch; 3372 struct nvme_poll_group *group = ctx; 3373 3374 ch = bdev_nvme_get_accel_channel(group); 3375 if (spdk_unlikely(ch == NULL)) { 3376 return -ENOMEM; 3377 } 3378 3379 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3380 domain, domain_ctx, seed, cb_fn, cb_arg); 3381 } 3382 3383 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3384 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3385 .submit_accel_crc32c = bdev_nvme_submit_accel_crc32c, 3386 .append_crc32c = bdev_nvme_append_crc32c, 3387 .finish_sequence = bdev_nvme_finish_sequence, 3388 .reverse_sequence = bdev_nvme_reverse_sequence, 3389 .abort_sequence = bdev_nvme_abort_sequence, 3390 }; 3391 3392 static int 3393 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3394 { 3395 struct nvme_poll_group *group = ctx_buf; 3396 3397 TAILQ_INIT(&group->qpair_list); 3398 3399 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3400 if (group->group == NULL) { 3401 return -1; 3402 } 3403 3404 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3405 3406 if (group->poller == NULL) { 3407 spdk_nvme_poll_group_destroy(group->group); 3408 return -1; 3409 } 3410 3411 return 0; 3412 } 3413 3414 static void 3415 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3416 { 3417 struct 
nvme_poll_group *group = ctx_buf; 3418 3419 assert(TAILQ_EMPTY(&group->qpair_list)); 3420 3421 if (group->accel_channel) { 3422 spdk_put_io_channel(group->accel_channel); 3423 } 3424 3425 spdk_poller_unregister(&group->poller); 3426 if (spdk_nvme_poll_group_destroy(group->group)) { 3427 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3428 assert(false); 3429 } 3430 } 3431 3432 static struct spdk_io_channel * 3433 bdev_nvme_get_io_channel(void *ctx) 3434 { 3435 struct nvme_bdev *nvme_bdev = ctx; 3436 3437 return spdk_get_io_channel(nvme_bdev); 3438 } 3439 3440 static void * 3441 bdev_nvme_get_module_ctx(void *ctx) 3442 { 3443 struct nvme_bdev *nvme_bdev = ctx; 3444 struct nvme_ns *nvme_ns; 3445 3446 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3447 return NULL; 3448 } 3449 3450 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3451 if (!nvme_ns) { 3452 return NULL; 3453 } 3454 3455 return nvme_ns->ns; 3456 } 3457 3458 static const char * 3459 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3460 { 3461 switch (ana_state) { 3462 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3463 return "optimized"; 3464 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3465 return "non_optimized"; 3466 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3467 return "inaccessible"; 3468 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3469 return "persistent_loss"; 3470 case SPDK_NVME_ANA_CHANGE_STATE: 3471 return "change"; 3472 default: 3473 return NULL; 3474 } 3475 } 3476 3477 static int 3478 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3479 { 3480 struct spdk_memory_domain **_domains = NULL; 3481 struct nvme_bdev *nbdev = ctx; 3482 struct nvme_ns *nvme_ns; 3483 int i = 0, _array_size = array_size; 3484 int rc = 0; 3485 3486 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3487 if (domains && array_size >= i) { 3488 _domains = &domains[i]; 3489 } else { 3490 _domains = NULL; 3491 } 3492 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3493 if (rc > 0) { 3494 i += rc; 3495 if (_array_size >= rc) { 3496 _array_size -= rc; 3497 } else { 3498 _array_size = 0; 3499 } 3500 } else if (rc < 0) { 3501 return rc; 3502 } 3503 } 3504 3505 return i; 3506 } 3507 3508 static const char * 3509 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3510 { 3511 if (nvme_ctrlr->destruct) { 3512 return "deleting"; 3513 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3514 return "failed"; 3515 } else if (nvme_ctrlr->resetting) { 3516 return "resetting"; 3517 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3518 return "reconnect_is_delayed"; 3519 } else if (nvme_ctrlr->disabled) { 3520 return "disabled"; 3521 } else { 3522 return "enabled"; 3523 } 3524 } 3525 3526 void 3527 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3528 { 3529 struct spdk_nvme_transport_id *trid; 3530 const struct spdk_nvme_ctrlr_opts *opts; 3531 const struct spdk_nvme_ctrlr_data *cdata; 3532 struct nvme_path_id *path_id; 3533 3534 spdk_json_write_object_begin(w); 3535 3536 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3537 3538 #ifdef SPDK_CONFIG_NVME_CUSE 3539 size_t cuse_name_size = 128; 3540 char cuse_name[cuse_name_size]; 3541 3542 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3543 if (rc == 0) { 3544 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3545 } 3546 #endif 3547 trid = &nvme_ctrlr->active_path_id->trid; 3548 
spdk_json_write_named_object_begin(w, "trid"); 3549 nvme_bdev_dump_trid_json(trid, w); 3550 spdk_json_write_object_end(w); 3551 3552 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3553 if (path_id != NULL) { 3554 spdk_json_write_named_array_begin(w, "alternate_trids"); 3555 do { 3556 trid = &path_id->trid; 3557 spdk_json_write_object_begin(w); 3558 nvme_bdev_dump_trid_json(trid, w); 3559 spdk_json_write_object_end(w); 3560 3561 path_id = TAILQ_NEXT(path_id, link); 3562 } while (path_id != NULL); 3563 spdk_json_write_array_end(w); 3564 } 3565 3566 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3567 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3568 3569 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3570 spdk_json_write_named_object_begin(w, "host"); 3571 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3572 spdk_json_write_named_string(w, "addr", opts->src_addr); 3573 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3574 spdk_json_write_object_end(w); 3575 3576 spdk_json_write_object_end(w); 3577 } 3578 3579 static void 3580 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3581 struct nvme_ns *nvme_ns) 3582 { 3583 struct spdk_nvme_ns *ns; 3584 struct spdk_nvme_ctrlr *ctrlr; 3585 const struct spdk_nvme_ctrlr_data *cdata; 3586 const struct spdk_nvme_transport_id *trid; 3587 union spdk_nvme_vs_register vs; 3588 const struct spdk_nvme_ns_data *nsdata; 3589 char buf[128]; 3590 3591 ns = nvme_ns->ns; 3592 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3593 3594 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3595 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3596 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3597 3598 spdk_json_write_object_begin(w); 3599 3600 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3601 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3602 } 3603 3604 spdk_json_write_named_object_begin(w, "trid"); 3605 3606 nvme_bdev_dump_trid_json(trid, w); 3607 3608 spdk_json_write_object_end(w); 3609 3610 #ifdef SPDK_CONFIG_NVME_CUSE 3611 size_t cuse_name_size = 128; 3612 char cuse_name[cuse_name_size]; 3613 3614 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3615 cuse_name, &cuse_name_size); 3616 if (rc == 0) { 3617 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3618 } 3619 #endif 3620 3621 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3622 3623 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3624 3625 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3626 3627 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3628 spdk_str_trim(buf); 3629 spdk_json_write_named_string(w, "model_number", buf); 3630 3631 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3632 spdk_str_trim(buf); 3633 spdk_json_write_named_string(w, "serial_number", buf); 3634 3635 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3636 spdk_str_trim(buf); 3637 spdk_json_write_named_string(w, "firmware_revision", buf); 3638 3639 if (cdata->subnqn[0] != '\0') { 3640 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3641 } 3642 3643 spdk_json_write_named_object_begin(w, "oacs"); 3644 3645 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3646 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3647 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3648 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 3649 3650 spdk_json_write_object_end(w); 3651 3652 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 
3653 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3654 3655 spdk_json_write_object_end(w); 3656 3657 spdk_json_write_named_object_begin(w, "vs"); 3658 3659 spdk_json_write_name(w, "nvme_version"); 3660 if (vs.bits.ter) { 3661 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3662 } else { 3663 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3664 } 3665 3666 spdk_json_write_object_end(w); 3667 3668 nsdata = spdk_nvme_ns_get_data(ns); 3669 3670 spdk_json_write_named_object_begin(w, "ns_data"); 3671 3672 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3673 3674 if (cdata->cmic.ana_reporting) { 3675 spdk_json_write_named_string(w, "ana_state", 3676 _nvme_ana_state_str(nvme_ns->ana_state)); 3677 } 3678 3679 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3680 3681 spdk_json_write_object_end(w); 3682 3683 if (cdata->oacs.security) { 3684 spdk_json_write_named_object_begin(w, "security"); 3685 3686 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3687 3688 spdk_json_write_object_end(w); 3689 } 3690 3691 spdk_json_write_object_end(w); 3692 } 3693 3694 static const char * 3695 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3696 { 3697 switch (nbdev->mp_policy) { 3698 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3699 return "active_passive"; 3700 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3701 return "active_active"; 3702 default: 3703 assert(false); 3704 return "invalid"; 3705 } 3706 } 3707 3708 static int 3709 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3710 { 3711 struct nvme_bdev *nvme_bdev = ctx; 3712 struct nvme_ns *nvme_ns; 3713 3714 pthread_mutex_lock(&nvme_bdev->mutex); 3715 spdk_json_write_named_array_begin(w, "nvme"); 3716 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3717 nvme_namespace_info_json(w, nvme_ns); 3718 } 3719 spdk_json_write_array_end(w); 3720 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3721 pthread_mutex_unlock(&nvme_bdev->mutex); 3722 3723 return 0; 3724 } 3725 3726 static void 3727 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3728 { 3729 /* No config per bdev needed */ 3730 } 3731 3732 static uint64_t 3733 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3734 { 3735 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3736 struct nvme_io_path *io_path; 3737 struct nvme_poll_group *group; 3738 uint64_t spin_time = 0; 3739 3740 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3741 group = io_path->qpair->group; 3742 3743 if (!group || !group->collect_spin_stat) { 3744 continue; 3745 } 3746 3747 if (group->end_ticks != 0) { 3748 group->spin_ticks += (group->end_ticks - group->start_ticks); 3749 group->end_ticks = 0; 3750 } 3751 3752 spin_time += group->spin_ticks; 3753 group->start_ticks = 0; 3754 group->spin_ticks = 0; 3755 } 3756 3757 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 3758 } 3759 3760 static void 3761 bdev_nvme_reset_device_stat(void *ctx) 3762 { 3763 struct nvme_bdev *nbdev = ctx; 3764 3765 if (nbdev->err_stat != NULL) { 3766 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 3767 } 3768 } 3769 3770 /* JSON string should be lowercases and underscore delimited string. 
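 * bdev_nvme_format_nvme_status() below replaces " - ", "-", and " " with "_" and then lowercases
 * the result; e.g. an input of the form "WRITE FAULT" (illustrative) would become "write_fault".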
*/ 3771 static void 3772 bdev_nvme_format_nvme_status(char *dst, const char *src) 3773 { 3774 char tmp[256]; 3775 3776 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 3777 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 3778 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 3779 spdk_strlwr(dst); 3780 } 3781 3782 static void 3783 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 3784 { 3785 struct nvme_bdev *nbdev = ctx; 3786 struct spdk_nvme_status status = {}; 3787 uint16_t sct, sc; 3788 char status_json[256]; 3789 const char *status_str; 3790 3791 if (nbdev->err_stat == NULL) { 3792 return; 3793 } 3794 3795 spdk_json_write_named_object_begin(w, "nvme_error"); 3796 3797 spdk_json_write_named_object_begin(w, "status_type"); 3798 for (sct = 0; sct < 8; sct++) { 3799 if (nbdev->err_stat->status_type[sct] == 0) { 3800 continue; 3801 } 3802 status.sct = sct; 3803 3804 status_str = spdk_nvme_cpl_get_status_type_string(&status); 3805 assert(status_str != NULL); 3806 bdev_nvme_format_nvme_status(status_json, status_str); 3807 3808 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 3809 } 3810 spdk_json_write_object_end(w); 3811 3812 spdk_json_write_named_object_begin(w, "status_code"); 3813 for (sct = 0; sct < 4; sct++) { 3814 status.sct = sct; 3815 for (sc = 0; sc < 256; sc++) { 3816 if (nbdev->err_stat->status[sct][sc] == 0) { 3817 continue; 3818 } 3819 status.sc = sc; 3820 3821 status_str = spdk_nvme_cpl_get_status_string(&status); 3822 assert(status_str != NULL); 3823 bdev_nvme_format_nvme_status(status_json, status_str); 3824 3825 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 3826 } 3827 } 3828 spdk_json_write_object_end(w); 3829 3830 spdk_json_write_object_end(w); 3831 } 3832 3833 static bool 3834 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 3835 { 3836 struct nvme_bdev *nbdev = ctx; 3837 struct spdk_nvme_ctrlr *ctrlr; 3838 3839 if (!g_opts.allow_accel_sequence) { 3840 return false; 3841 } 3842 3843 switch (type) { 3844 case SPDK_BDEV_IO_TYPE_WRITE: 3845 case SPDK_BDEV_IO_TYPE_READ: 3846 break; 3847 default: 3848 return false; 3849 } 3850 3851 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 3852 assert(ctrlr != NULL); 3853 3854 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 3855 } 3856 3857 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 3858 .destruct = bdev_nvme_destruct, 3859 .submit_request = bdev_nvme_submit_request, 3860 .io_type_supported = bdev_nvme_io_type_supported, 3861 .get_io_channel = bdev_nvme_get_io_channel, 3862 .dump_info_json = bdev_nvme_dump_info_json, 3863 .write_config_json = bdev_nvme_write_config_json, 3864 .get_spin_time = bdev_nvme_get_spin_time, 3865 .get_module_ctx = bdev_nvme_get_module_ctx, 3866 .get_memory_domains = bdev_nvme_get_memory_domains, 3867 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 3868 .reset_device_stat = bdev_nvme_reset_device_stat, 3869 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 3870 }; 3871 3872 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 3873 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 3874 3875 static int 3876 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 3877 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 3878 { 3879 struct spdk_nvme_ana_group_descriptor *copied_desc; 3880 uint8_t *orig_desc; 3881 uint32_t i, desc_size, copy_len; 3882 int rc = 0; 3883 3884 if (nvme_ctrlr->ana_log_page == NULL) { 3885 return 
-EINVAL; 3886 } 3887 3888 copied_desc = nvme_ctrlr->copied_ana_desc; 3889 3890 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 3891 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 3892 3893 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 3894 memcpy(copied_desc, orig_desc, copy_len); 3895 3896 rc = cb_fn(copied_desc, cb_arg); 3897 if (rc != 0) { 3898 break; 3899 } 3900 3901 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 3902 copied_desc->num_of_nsid * sizeof(uint32_t); 3903 orig_desc += desc_size; 3904 copy_len -= desc_size; 3905 } 3906 3907 return rc; 3908 } 3909 3910 static int 3911 nvme_ns_ana_transition_timedout(void *ctx) 3912 { 3913 struct nvme_ns *nvme_ns = ctx; 3914 3915 spdk_poller_unregister(&nvme_ns->anatt_timer); 3916 nvme_ns->ana_transition_timedout = true; 3917 3918 return SPDK_POLLER_BUSY; 3919 } 3920 3921 static void 3922 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 3923 const struct spdk_nvme_ana_group_descriptor *desc) 3924 { 3925 const struct spdk_nvme_ctrlr_data *cdata; 3926 3927 nvme_ns->ana_group_id = desc->ana_group_id; 3928 nvme_ns->ana_state = desc->ana_state; 3929 nvme_ns->ana_state_updating = false; 3930 3931 switch (nvme_ns->ana_state) { 3932 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3933 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3934 nvme_ns->ana_transition_timedout = false; 3935 spdk_poller_unregister(&nvme_ns->anatt_timer); 3936 break; 3937 3938 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3939 case SPDK_NVME_ANA_CHANGE_STATE: 3940 if (nvme_ns->anatt_timer != NULL) { 3941 break; 3942 } 3943 3944 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 3945 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 3946 nvme_ns, 3947 cdata->anatt * SPDK_SEC_TO_USEC); 3948 break; 3949 default: 3950 break; 3951 } 3952 } 3953 3954 static int 3955 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 3956 { 3957 struct nvme_ns *nvme_ns = cb_arg; 3958 uint32_t i; 3959 3960 for (i = 0; i < desc->num_of_nsid; i++) { 3961 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 3962 continue; 3963 } 3964 3965 _nvme_ns_set_ana_state(nvme_ns, desc); 3966 return 1; 3967 } 3968 3969 return 0; 3970 } 3971 3972 static int 3973 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 3974 { 3975 int rc = 0; 3976 struct spdk_uuid new_uuid, namespace_uuid; 3977 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 3978 /* This namespace UUID was generated using uuid_generate() method. 
*/ 3979 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 3980 int size; 3981 3982 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 3983 3984 spdk_uuid_set_null(&new_uuid); 3985 spdk_uuid_set_null(&namespace_uuid); 3986 3987 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 3988 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 3989 return -EINVAL; 3990 } 3991 3992 spdk_uuid_parse(&namespace_uuid, namespace_str); 3993 3994 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 3995 if (rc == 0) { 3996 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 3997 } 3998 3999 return rc; 4000 } 4001 4002 static int 4003 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4004 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4005 uint32_t prchk_flags, void *ctx) 4006 { 4007 const struct spdk_uuid *uuid; 4008 const uint8_t *nguid; 4009 const struct spdk_nvme_ctrlr_data *cdata; 4010 const struct spdk_nvme_ns_data *nsdata; 4011 const struct spdk_nvme_ctrlr_opts *opts; 4012 enum spdk_nvme_csi csi; 4013 uint32_t atomic_bs, phys_bs, bs; 4014 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4015 int rc; 4016 4017 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4018 csi = spdk_nvme_ns_get_csi(ns); 4019 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4020 4021 switch (csi) { 4022 case SPDK_NVME_CSI_NVM: 4023 disk->product_name = "NVMe disk"; 4024 break; 4025 case SPDK_NVME_CSI_ZNS: 4026 disk->product_name = "NVMe ZNS disk"; 4027 disk->zoned = true; 4028 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4029 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4030 spdk_nvme_ns_get_extended_sector_size(ns); 4031 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4032 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4033 break; 4034 default: 4035 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4036 return -ENOTSUP; 4037 } 4038 4039 nguid = spdk_nvme_ns_get_nguid(ns); 4040 if (!nguid) { 4041 uuid = spdk_nvme_ns_get_uuid(ns); 4042 if (uuid) { 4043 disk->uuid = *uuid; 4044 } else if (g_opts.generate_uuids) { 4045 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4046 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4047 if (rc != 0) { 4048 SPDK_ERRLOG("UUID generation failed (%s)\n", strerror(rc)); 4049 return rc; 4050 } 4051 } 4052 } else { 4053 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4054 } 4055 4056 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4057 if (!disk->name) { 4058 return -ENOMEM; 4059 } 4060 4061 disk->write_cache = 0; 4062 if (cdata->vwc.present) { 4063 /* Enable if the Volatile Write Cache exists */ 4064 disk->write_cache = 1; 4065 } 4066 if (cdata->oncs.write_zeroes) { 4067 disk->max_write_zeroes = UINT16_MAX + 1; 4068 } 4069 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4070 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4071 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4072 /* NVMe driver will split one request into multiple requests 4073 * based on MDTS and stripe boundary, the bdev layer will use 4074 * max_segment_size and max_num_segments to split one big IO 4075 * into multiple requests, then small request can't run out 4076 * of NVMe internal requests data structure. 
4077 */ 4078 if (opts && opts->io_queue_requests) { 4079 disk->max_num_segments = opts->io_queue_requests / 2; 4080 } 4081 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4082 /* The nvme driver will try to split I/O that have too many 4083 * SGEs, but it doesn't work if that last SGE doesn't end on 4084 * an aggregate total that is block aligned. The bdev layer has 4085 * a more robust splitting framework, so use that instead for 4086 * this case. (See issue #3269.) 4087 */ 4088 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4089 4090 if (disk->max_num_segments == 0) { 4091 disk->max_num_segments = max_sges; 4092 } else { 4093 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4094 } 4095 } 4096 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4097 4098 nsdata = spdk_nvme_ns_get_data(ns); 4099 bs = spdk_nvme_ns_get_sector_size(ns); 4100 atomic_bs = bs; 4101 phys_bs = bs; 4102 if (nsdata->nabo == 0) { 4103 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4104 atomic_bs = bs * (1 + nsdata->nawupf); 4105 } else { 4106 atomic_bs = bs * (1 + cdata->awupf); 4107 } 4108 } 4109 if (nsdata->nsfeat.optperf) { 4110 phys_bs = bs * (1 + nsdata->npwg); 4111 } 4112 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4113 4114 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4115 if (disk->md_len != 0) { 4116 disk->md_interleave = nsdata->flbas.extended; 4117 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4118 if (disk->dif_type != SPDK_DIF_DISABLE) { 4119 disk->dif_is_head_of_md = nsdata->dps.md_start; 4120 disk->dif_check_flags = prchk_flags; 4121 } 4122 } 4123 4124 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4125 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4126 disk->acwu = 0; 4127 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4128 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4129 } else { 4130 disk->acwu = cdata->acwu + 1; /* 0-based */ 4131 } 4132 4133 if (cdata->oncs.copy) { 4134 /* For now bdev interface allows only single segment copy */ 4135 disk->max_copy = nsdata->mssrl; 4136 } 4137 4138 disk->ctxt = ctx; 4139 disk->fn_table = &nvmelib_fn_table; 4140 disk->module = &nvme_if; 4141 4142 return 0; 4143 } 4144 4145 static struct nvme_bdev * 4146 nvme_bdev_alloc(void) 4147 { 4148 struct nvme_bdev *bdev; 4149 int rc; 4150 4151 bdev = calloc(1, sizeof(*bdev)); 4152 if (!bdev) { 4153 SPDK_ERRLOG("bdev calloc() failed\n"); 4154 return NULL; 4155 } 4156 4157 if (g_opts.nvme_error_stat) { 4158 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4159 if (!bdev->err_stat) { 4160 SPDK_ERRLOG("err_stat calloc() failed\n"); 4161 free(bdev); 4162 return NULL; 4163 } 4164 } 4165 4166 rc = pthread_mutex_init(&bdev->mutex, NULL); 4167 if (rc != 0) { 4168 free(bdev->err_stat); 4169 free(bdev); 4170 return NULL; 4171 } 4172 4173 bdev->ref = 1; 4174 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4175 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4176 bdev->rr_min_io = UINT32_MAX; 4177 TAILQ_INIT(&bdev->nvme_ns_list); 4178 4179 return bdev; 4180 } 4181 4182 static int 4183 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4184 { 4185 struct nvme_bdev *bdev; 4186 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4187 int rc; 4188 4189 bdev = nvme_bdev_alloc(); 4190 if (bdev == NULL) { 4191 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4192 return -ENOMEM; 4193 } 4194 4195 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4196 4197 rc = 
nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4198 nvme_ns->ns, nvme_ctrlr->opts.prchk_flags, bdev); 4199 if (rc != 0) { 4200 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4201 nvme_bdev_free(bdev); 4202 return rc; 4203 } 4204 4205 spdk_io_device_register(bdev, 4206 bdev_nvme_create_bdev_channel_cb, 4207 bdev_nvme_destroy_bdev_channel_cb, 4208 sizeof(struct nvme_bdev_channel), 4209 bdev->disk.name); 4210 4211 nvme_ns->bdev = bdev; 4212 bdev->nsid = nvme_ns->id; 4213 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4214 4215 bdev->nbdev_ctrlr = nbdev_ctrlr; 4216 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4217 4218 rc = spdk_bdev_register(&bdev->disk); 4219 if (rc != 0) { 4220 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4221 spdk_io_device_unregister(bdev, NULL); 4222 nvme_ns->bdev = NULL; 4223 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4224 nvme_bdev_free(bdev); 4225 return rc; 4226 } 4227 4228 return 0; 4229 } 4230 4231 static bool 4232 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4233 { 4234 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4235 const struct spdk_uuid *uuid1, *uuid2; 4236 4237 nsdata1 = spdk_nvme_ns_get_data(ns1); 4238 nsdata2 = spdk_nvme_ns_get_data(ns2); 4239 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4240 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4241 4242 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4243 nsdata1->eui64 == nsdata2->eui64 && 4244 ((uuid1 == NULL && uuid2 == NULL) || 4245 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4246 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4247 } 4248 4249 static bool 4250 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4251 struct spdk_nvme_ctrlr_opts *opts) 4252 { 4253 struct nvme_probe_skip_entry *entry; 4254 4255 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4256 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4257 return false; 4258 } 4259 } 4260 4261 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4262 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4263 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4264 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4265 opts->disable_read_ana_log_page = true; 4266 4267 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4268 4269 return true; 4270 } 4271 4272 static void 4273 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4274 { 4275 struct nvme_ctrlr *nvme_ctrlr = ctx; 4276 4277 if (spdk_nvme_cpl_is_error(cpl)) { 4278 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4279 cpl->status.sct); 4280 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4281 } else if (cpl->cdw0 & 0x1) { 4282 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4283 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4284 } 4285 } 4286 4287 static void 4288 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4289 struct spdk_nvme_qpair *qpair, uint16_t cid) 4290 { 4291 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4292 union spdk_nvme_csts_register csts; 4293 int rc; 4294 4295 assert(nvme_ctrlr->ctrlr == ctrlr); 4296 4297 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4298 4299 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4300 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) 
Otherwise we 4301 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4302 * completion recursively. 4303 */ 4304 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4305 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4306 if (csts.bits.cfs) { 4307 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4308 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4309 return; 4310 } 4311 } 4312 4313 switch (g_opts.action_on_timeout) { 4314 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4315 if (qpair) { 4316 /* Don't send abort to ctrlr when ctrlr is not available. */ 4317 pthread_mutex_lock(&nvme_ctrlr->mutex); 4318 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4319 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4320 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4321 return; 4322 } 4323 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4324 4325 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4326 nvme_abort_cpl, nvme_ctrlr); 4327 if (rc == 0) { 4328 return; 4329 } 4330 4331 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4332 } 4333 4334 /* FALLTHROUGH */ 4335 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4336 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4337 break; 4338 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4339 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4340 break; 4341 default: 4342 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4343 break; 4344 } 4345 } 4346 4347 static struct nvme_ns * 4348 nvme_ns_alloc(void) 4349 { 4350 struct nvme_ns *nvme_ns; 4351 4352 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4353 if (nvme_ns == NULL) { 4354 return NULL; 4355 } 4356 4357 if (g_opts.io_path_stat) { 4358 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4359 if (nvme_ns->stat == NULL) { 4360 free(nvme_ns); 4361 return NULL; 4362 } 4363 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4364 } 4365 4366 return nvme_ns; 4367 } 4368 4369 static void 4370 nvme_ns_free(struct nvme_ns *nvme_ns) 4371 { 4372 free(nvme_ns->stat); 4373 free(nvme_ns); 4374 } 4375 4376 static void 4377 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4378 { 4379 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4380 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4381 4382 if (rc == 0) { 4383 nvme_ns->probe_ctx = NULL; 4384 pthread_mutex_lock(&nvme_ctrlr->mutex); 4385 nvme_ctrlr->ref++; 4386 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4387 } else { 4388 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4389 nvme_ns_free(nvme_ns); 4390 } 4391 4392 if (ctx) { 4393 ctx->populates_in_progress--; 4394 if (ctx->populates_in_progress == 0) { 4395 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4396 } 4397 } 4398 } 4399 4400 static void 4401 bdev_nvme_add_io_path(struct spdk_io_channel_iter *i) 4402 { 4403 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4404 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4405 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4406 int rc; 4407 4408 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4409 if (rc != 0) { 4410 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4411 } 4412 4413 spdk_for_each_channel_continue(i, rc); 4414 } 4415 4416 static void 4417 bdev_nvme_delete_io_path(struct spdk_io_channel_iter *i) 4418 { 4419 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4420 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4421 struct 
nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4422 struct nvme_io_path *io_path; 4423 4424 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4425 if (io_path != NULL) { 4426 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4427 } 4428 4429 spdk_for_each_channel_continue(i, 0); 4430 } 4431 4432 static void 4433 bdev_nvme_add_io_path_failed(struct spdk_io_channel_iter *i, int status) 4434 { 4435 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4436 4437 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4438 } 4439 4440 static void 4441 bdev_nvme_add_io_path_done(struct spdk_io_channel_iter *i, int status) 4442 { 4443 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4444 struct nvme_bdev *bdev = spdk_io_channel_iter_get_io_device(i); 4445 4446 if (status == 0) { 4447 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4448 } else { 4449 /* Delete the added io_paths and fail populating the namespace. */ 4450 spdk_for_each_channel(bdev, 4451 bdev_nvme_delete_io_path, 4452 nvme_ns, 4453 bdev_nvme_add_io_path_failed); 4454 } 4455 } 4456 4457 static int 4458 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4459 { 4460 struct nvme_ns *tmp_ns; 4461 const struct spdk_nvme_ns_data *nsdata; 4462 4463 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4464 if (!nsdata->nmic.can_share) { 4465 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4466 return -EINVAL; 4467 } 4468 4469 pthread_mutex_lock(&bdev->mutex); 4470 4471 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4472 assert(tmp_ns != NULL); 4473 4474 if (!bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4475 pthread_mutex_unlock(&bdev->mutex); 4476 SPDK_ERRLOG("Namespaces are not identical.\n"); 4477 return -EINVAL; 4478 } 4479 4480 bdev->ref++; 4481 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4482 nvme_ns->bdev = bdev; 4483 4484 pthread_mutex_unlock(&bdev->mutex); 4485 4486 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
*/ 4487 spdk_for_each_channel(bdev, 4488 bdev_nvme_add_io_path, 4489 nvme_ns, 4490 bdev_nvme_add_io_path_done); 4491 4492 return 0; 4493 } 4494 4495 static void 4496 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4497 { 4498 struct spdk_nvme_ns *ns; 4499 struct nvme_bdev *bdev; 4500 int rc = 0; 4501 4502 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4503 if (!ns) { 4504 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4505 rc = -EINVAL; 4506 goto done; 4507 } 4508 4509 nvme_ns->ns = ns; 4510 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4511 4512 if (nvme_ctrlr->ana_log_page != NULL) { 4513 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4514 } 4515 4516 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4517 if (bdev == NULL) { 4518 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4519 } else { 4520 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4521 if (rc == 0) { 4522 return; 4523 } 4524 } 4525 done: 4526 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4527 } 4528 4529 static void 4530 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4531 { 4532 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4533 4534 assert(nvme_ctrlr != NULL); 4535 4536 pthread_mutex_lock(&nvme_ctrlr->mutex); 4537 4538 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4539 4540 if (nvme_ns->bdev != NULL) { 4541 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4542 return; 4543 } 4544 4545 nvme_ns_free(nvme_ns); 4546 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4547 4548 nvme_ctrlr_release(nvme_ctrlr); 4549 } 4550 4551 static void 4552 bdev_nvme_delete_io_path_done(struct spdk_io_channel_iter *i, int status) 4553 { 4554 struct nvme_ns *nvme_ns = spdk_io_channel_iter_get_ctx(i); 4555 4556 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4557 } 4558 4559 static void 4560 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4561 { 4562 struct nvme_bdev *bdev; 4563 4564 spdk_poller_unregister(&nvme_ns->anatt_timer); 4565 4566 bdev = nvme_ns->bdev; 4567 if (bdev != NULL) { 4568 pthread_mutex_lock(&bdev->mutex); 4569 4570 assert(bdev->ref > 0); 4571 bdev->ref--; 4572 if (bdev->ref == 0) { 4573 pthread_mutex_unlock(&bdev->mutex); 4574 4575 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4576 } else { 4577 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4578 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4579 * and clear nvme_ns->bdev here. 4580 */ 4581 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4582 nvme_ns->bdev = NULL; 4583 4584 pthread_mutex_unlock(&bdev->mutex); 4585 4586 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4587 * we call depopulate_namespace_done() to avoid use-after-free. 4588 */ 4589 spdk_for_each_channel(bdev, 4590 bdev_nvme_delete_io_path, 4591 nvme_ns, 4592 bdev_nvme_delete_io_path_done); 4593 return; 4594 } 4595 } 4596 4597 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4598 } 4599 4600 static void 4601 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4602 struct nvme_async_probe_ctx *ctx) 4603 { 4604 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4605 struct nvme_ns *nvme_ns, *next; 4606 struct spdk_nvme_ns *ns; 4607 struct nvme_bdev *bdev; 4608 uint32_t nsid; 4609 int rc; 4610 uint64_t num_sectors; 4611 4612 if (ctx) { 4613 /* Initialize this count to 1 to handle the populate functions 4614 * calling nvme_ctrlr_populate_namespace_done() immediately. 
4615 */ 4616 ctx->populates_in_progress = 1; 4617 } 4618 4619 /* First loop over our existing namespaces and see if they have been 4620 * removed. */ 4621 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4622 while (nvme_ns != NULL) { 4623 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4624 4625 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4626 /* NS is still there but attributes may have changed */ 4627 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4628 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4629 bdev = nvme_ns->bdev; 4630 assert(bdev != NULL); 4631 if (bdev->disk.blockcnt != num_sectors) { 4632 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4633 nvme_ns->id, 4634 bdev->disk.name, 4635 bdev->disk.blockcnt, 4636 num_sectors); 4637 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4638 if (rc != 0) { 4639 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4640 bdev->disk.name, rc); 4641 } 4642 } 4643 } else { 4644 /* Namespace was removed */ 4645 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4646 } 4647 4648 nvme_ns = next; 4649 } 4650 4651 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4652 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4653 while (nsid != 0) { 4654 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4655 4656 if (nvme_ns == NULL) { 4657 /* Found a new one */ 4658 nvme_ns = nvme_ns_alloc(); 4659 if (nvme_ns == NULL) { 4660 SPDK_ERRLOG("Failed to allocate namespace\n"); 4661 /* This just fails to attach the namespace. It may work on a future attempt. */ 4662 continue; 4663 } 4664 4665 nvme_ns->id = nsid; 4666 nvme_ns->ctrlr = nvme_ctrlr; 4667 4668 nvme_ns->bdev = NULL; 4669 4670 if (ctx) { 4671 ctx->populates_in_progress++; 4672 } 4673 nvme_ns->probe_ctx = ctx; 4674 4675 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4676 4677 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4678 } 4679 4680 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4681 } 4682 4683 if (ctx) { 4684 /* Decrement this count now that the loop is over to account 4685 * for the one we started with. If the count is then 0, we 4686 * know any populate_namespace functions completed immediately, 4687 * so we'll kick the callback here. 
4688 */ 4689 ctx->populates_in_progress--; 4690 if (ctx->populates_in_progress == 0) { 4691 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4692 } 4693 } 4694 4695 } 4696 4697 static void 4698 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4699 { 4700 struct nvme_ns *nvme_ns, *tmp; 4701 4702 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4703 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4704 } 4705 } 4706 4707 static uint32_t 4708 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4709 { 4710 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4711 const struct spdk_nvme_ctrlr_data *cdata; 4712 uint32_t nsid, ns_count = 0; 4713 4714 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4715 4716 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4717 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4718 ns_count++; 4719 } 4720 4721 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4722 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4723 sizeof(uint32_t); 4724 } 4725 4726 static int 4727 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4728 void *cb_arg) 4729 { 4730 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4731 struct nvme_ns *nvme_ns; 4732 uint32_t i, nsid; 4733 4734 for (i = 0; i < desc->num_of_nsid; i++) { 4735 nsid = desc->nsid[i]; 4736 if (nsid == 0) { 4737 continue; 4738 } 4739 4740 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4741 4742 assert(nvme_ns != NULL); 4743 if (nvme_ns == NULL) { 4744 /* Target told us that an inactive namespace had an ANA change */ 4745 continue; 4746 } 4747 4748 _nvme_ns_set_ana_state(nvme_ns, desc); 4749 } 4750 4751 return 0; 4752 } 4753 4754 static void 4755 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4756 { 4757 struct nvme_ns *nvme_ns; 4758 4759 spdk_free(nvme_ctrlr->ana_log_page); 4760 nvme_ctrlr->ana_log_page = NULL; 4761 4762 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4763 nvme_ns != NULL; 4764 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 4765 nvme_ns->ana_state_updating = false; 4766 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4767 } 4768 } 4769 4770 static void 4771 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 4772 { 4773 struct nvme_ctrlr *nvme_ctrlr = ctx; 4774 4775 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 4776 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 4777 nvme_ctrlr); 4778 } else { 4779 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 4780 } 4781 4782 pthread_mutex_lock(&nvme_ctrlr->mutex); 4783 4784 assert(nvme_ctrlr->ana_log_page_updating == true); 4785 nvme_ctrlr->ana_log_page_updating = false; 4786 4787 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 4788 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4789 4790 nvme_ctrlr_unregister(nvme_ctrlr); 4791 } else { 4792 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4793 4794 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 4795 } 4796 } 4797 4798 static int 4799 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 4800 { 4801 uint32_t ana_log_page_size; 4802 int rc; 4803 4804 if (nvme_ctrlr->ana_log_page == NULL) { 4805 return -EINVAL; 4806 } 4807 4808 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 4809 4810 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 4811 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 4812 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 4813 
return -EINVAL; 4814 } 4815 4816 pthread_mutex_lock(&nvme_ctrlr->mutex); 4817 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 4818 nvme_ctrlr->ana_log_page_updating) { 4819 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4820 return -EBUSY; 4821 } 4822 4823 nvme_ctrlr->ana_log_page_updating = true; 4824 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4825 4826 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 4827 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 4828 SPDK_NVME_GLOBAL_NS_TAG, 4829 nvme_ctrlr->ana_log_page, 4830 ana_log_page_size, 0, 4831 nvme_ctrlr_read_ana_log_page_done, 4832 nvme_ctrlr); 4833 if (rc != 0) { 4834 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 4835 } 4836 4837 return rc; 4838 } 4839 4840 static void 4841 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4842 { 4843 } 4844 4845 struct bdev_nvme_set_preferred_path_ctx { 4846 struct spdk_bdev_desc *desc; 4847 struct nvme_ns *nvme_ns; 4848 bdev_nvme_set_preferred_path_cb cb_fn; 4849 void *cb_arg; 4850 }; 4851 4852 static void 4853 bdev_nvme_set_preferred_path_done(struct spdk_io_channel_iter *i, int status) 4854 { 4855 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4856 4857 assert(ctx != NULL); 4858 assert(ctx->desc != NULL); 4859 assert(ctx->cb_fn != NULL); 4860 4861 spdk_bdev_close(ctx->desc); 4862 4863 ctx->cb_fn(ctx->cb_arg, status); 4864 4865 free(ctx); 4866 } 4867 4868 static void 4869 _bdev_nvme_set_preferred_path(struct spdk_io_channel_iter *i) 4870 { 4871 struct bdev_nvme_set_preferred_path_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4872 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 4873 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 4874 struct nvme_io_path *io_path, *prev; 4875 4876 prev = NULL; 4877 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4878 if (io_path->nvme_ns == ctx->nvme_ns) { 4879 break; 4880 } 4881 prev = io_path; 4882 } 4883 4884 if (io_path != NULL) { 4885 if (prev != NULL) { 4886 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 4887 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 4888 } 4889 4890 /* We can set io_path to nbdev_ch->current_io_path directly here. 4891 * However, it needs to be conditional. To simplify the code, 4892 * just clear nbdev_ch->current_io_path and let find_io_path() 4893 * fill it. 4894 * 4895 * Automatic failback may be disabled. Hence even if the io_path is 4896 * already at the head, clear nbdev_ch->current_io_path. 4897 */ 4898 bdev_nvme_clear_current_io_path(nbdev_ch); 4899 } 4900 4901 spdk_for_each_channel_continue(i, 0); 4902 } 4903 4904 static struct nvme_ns * 4905 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 4906 { 4907 struct nvme_ns *nvme_ns, *prev; 4908 const struct spdk_nvme_ctrlr_data *cdata; 4909 4910 prev = NULL; 4911 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 4912 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4913 4914 if (cdata->cntlid == cntlid) { 4915 break; 4916 } 4917 prev = nvme_ns; 4918 } 4919 4920 if (nvme_ns != NULL && prev != NULL) { 4921 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 4922 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 4923 } 4924 4925 return nvme_ns; 4926 } 4927 4928 /* This function supports only multipath mode. There is only a single I/O path 4929 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 4930 * head of the I/O path list for each NVMe bdev channel. 
4931 * 4932 * NVMe bdev channel may be acquired after completing this function. move the 4933 * matched namespace to the head of the namespace list for the NVMe bdev too. 4934 */ 4935 void 4936 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 4937 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 4938 { 4939 struct bdev_nvme_set_preferred_path_ctx *ctx; 4940 struct spdk_bdev *bdev; 4941 struct nvme_bdev *nbdev; 4942 int rc = 0; 4943 4944 assert(cb_fn != NULL); 4945 4946 ctx = calloc(1, sizeof(*ctx)); 4947 if (ctx == NULL) { 4948 SPDK_ERRLOG("Failed to alloc context.\n"); 4949 rc = -ENOMEM; 4950 goto err_alloc; 4951 } 4952 4953 ctx->cb_fn = cb_fn; 4954 ctx->cb_arg = cb_arg; 4955 4956 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 4957 if (rc != 0) { 4958 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 4959 goto err_open; 4960 } 4961 4962 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 4963 4964 if (bdev->module != &nvme_if) { 4965 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 4966 rc = -ENODEV; 4967 goto err_bdev; 4968 } 4969 4970 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 4971 4972 pthread_mutex_lock(&nbdev->mutex); 4973 4974 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 4975 if (ctx->nvme_ns == NULL) { 4976 pthread_mutex_unlock(&nbdev->mutex); 4977 4978 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 4979 rc = -ENODEV; 4980 goto err_bdev; 4981 } 4982 4983 pthread_mutex_unlock(&nbdev->mutex); 4984 4985 spdk_for_each_channel(nbdev, 4986 _bdev_nvme_set_preferred_path, 4987 ctx, 4988 bdev_nvme_set_preferred_path_done); 4989 return; 4990 4991 err_bdev: 4992 spdk_bdev_close(ctx->desc); 4993 err_open: 4994 free(ctx); 4995 err_alloc: 4996 cb_fn(cb_arg, rc); 4997 } 4998 4999 struct bdev_nvme_set_multipath_policy_ctx { 5000 struct spdk_bdev_desc *desc; 5001 bdev_nvme_set_multipath_policy_cb cb_fn; 5002 void *cb_arg; 5003 }; 5004 5005 static void 5006 bdev_nvme_set_multipath_policy_done(struct spdk_io_channel_iter *i, int status) 5007 { 5008 struct bdev_nvme_set_multipath_policy_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 5009 5010 assert(ctx != NULL); 5011 assert(ctx->desc != NULL); 5012 assert(ctx->cb_fn != NULL); 5013 5014 spdk_bdev_close(ctx->desc); 5015 5016 ctx->cb_fn(ctx->cb_arg, status); 5017 5018 free(ctx); 5019 } 5020 5021 static void 5022 _bdev_nvme_set_multipath_policy(struct spdk_io_channel_iter *i) 5023 { 5024 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5025 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(_ch); 5026 struct nvme_bdev *nbdev = spdk_io_channel_get_io_device(_ch); 5027 5028 nbdev_ch->mp_policy = nbdev->mp_policy; 5029 nbdev_ch->mp_selector = nbdev->mp_selector; 5030 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5031 bdev_nvme_clear_current_io_path(nbdev_ch); 5032 5033 spdk_for_each_channel_continue(i, 0); 5034 } 5035 5036 void 5037 bdev_nvme_set_multipath_policy(const char *name, enum bdev_nvme_multipath_policy policy, 5038 enum bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5039 bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5040 { 5041 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5042 struct spdk_bdev *bdev; 5043 struct nvme_bdev *nbdev; 5044 int rc; 5045 5046 assert(cb_fn != NULL); 5047 5048 if (policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE && selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 5049 if (rr_min_io == UINT32_MAX) { 5050 rr_min_io = 1; 5051 } else if (rr_min_io == 0) { 5052 rc = -EINVAL; 
5053 goto exit; 5054 } 5055 } else if (rr_min_io != UINT32_MAX) { 5056 rc = -EINVAL; 5057 goto exit; 5058 } 5059 5060 ctx = calloc(1, sizeof(*ctx)); 5061 if (ctx == NULL) { 5062 SPDK_ERRLOG("Failed to alloc context.\n"); 5063 rc = -ENOMEM; 5064 goto exit; 5065 } 5066 5067 ctx->cb_fn = cb_fn; 5068 ctx->cb_arg = cb_arg; 5069 5070 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5071 if (rc != 0) { 5072 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5073 rc = -ENODEV; 5074 goto err_open; 5075 } 5076 5077 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5078 if (bdev->module != &nvme_if) { 5079 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5080 rc = -ENODEV; 5081 goto err_module; 5082 } 5083 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5084 5085 pthread_mutex_lock(&nbdev->mutex); 5086 nbdev->mp_policy = policy; 5087 nbdev->mp_selector = selector; 5088 nbdev->rr_min_io = rr_min_io; 5089 pthread_mutex_unlock(&nbdev->mutex); 5090 5091 spdk_for_each_channel(nbdev, 5092 _bdev_nvme_set_multipath_policy, 5093 ctx, 5094 bdev_nvme_set_multipath_policy_done); 5095 return; 5096 5097 err_module: 5098 spdk_bdev_close(ctx->desc); 5099 err_open: 5100 free(ctx); 5101 exit: 5102 cb_fn(cb_arg, rc); 5103 } 5104 5105 static void 5106 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5107 { 5108 struct nvme_ctrlr *nvme_ctrlr = arg; 5109 union spdk_nvme_async_event_completion event; 5110 5111 if (spdk_nvme_cpl_is_error(cpl)) { 5112 SPDK_WARNLOG("AER request execute failed\n"); 5113 return; 5114 } 5115 5116 event.raw = cpl->cdw0; 5117 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5118 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5119 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5120 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5121 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5122 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5123 } 5124 } 5125 5126 static void 5127 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5128 { 5129 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5130 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5131 free(ctx); 5132 } 5133 5134 static void 5135 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5136 { 5137 if (ctx->cb_fn) { 5138 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5139 } 5140 5141 ctx->namespaces_populated = true; 5142 if (ctx->probe_done) { 5143 /* The probe was already completed, so we need to free the context 5144 * here. This can happen for cases like OCSSD, where we need to 5145 * send additional commands to the SSD after attach. 
5146 */ 5147 free_nvme_async_probe_ctx(ctx); 5148 } 5149 } 5150 5151 static void 5152 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5153 struct nvme_async_probe_ctx *ctx) 5154 { 5155 spdk_io_device_register(nvme_ctrlr, 5156 bdev_nvme_create_ctrlr_channel_cb, 5157 bdev_nvme_destroy_ctrlr_channel_cb, 5158 sizeof(struct nvme_ctrlr_channel), 5159 nvme_ctrlr->nbdev_ctrlr->name); 5160 5161 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5162 } 5163 5164 static void 5165 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5166 { 5167 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5168 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5169 5170 nvme_ctrlr->probe_ctx = NULL; 5171 5172 if (spdk_nvme_cpl_is_error(cpl)) { 5173 nvme_ctrlr_delete(nvme_ctrlr); 5174 5175 if (ctx != NULL) { 5176 ctx->reported_bdevs = 0; 5177 populate_namespaces_cb(ctx, -1); 5178 } 5179 return; 5180 } 5181 5182 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5183 } 5184 5185 static int 5186 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5187 struct nvme_async_probe_ctx *ctx) 5188 { 5189 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5190 const struct spdk_nvme_ctrlr_data *cdata; 5191 uint32_t ana_log_page_size; 5192 5193 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5194 5195 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5196 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5197 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5198 sizeof(uint32_t); 5199 5200 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5201 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 5202 if (nvme_ctrlr->ana_log_page == NULL) { 5203 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5204 return -ENXIO; 5205 } 5206 5207 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5208 * Hence copy each descriptor to a temporary area when parsing it. 5209 * 5210 * Allocate a buffer whose size is as large as ANA log page buffer because 5211 * we do not know the size of a descriptor until actually reading it. 5212 */ 5213 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5214 if (nvme_ctrlr->copied_ana_desc == NULL) { 5215 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5216 return -ENOMEM; 5217 } 5218 5219 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5220 5221 nvme_ctrlr->probe_ctx = ctx; 5222 5223 /* Then, set the read size only to include the current active namespaces. */ 5224 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5225 5226 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5227 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5228 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5229 return -EINVAL; 5230 } 5231 5232 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5233 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5234 SPDK_NVME_GLOBAL_NS_TAG, 5235 nvme_ctrlr->ana_log_page, 5236 ana_log_page_size, 0, 5237 nvme_ctrlr_init_ana_log_page_done, 5238 nvme_ctrlr); 5239 } 5240 5241 /* hostnqn and subnqn were already verified before attaching a controller. 5242 * Hence check only the multipath capability and cntlid here. 
5243 */ 5244 static bool 5245 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5246 { 5247 struct nvme_ctrlr *tmp; 5248 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5249 5250 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5251 5252 if (!cdata->cmic.multi_ctrlr) { 5253 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5254 return false; 5255 } 5256 5257 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5258 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5259 5260 if (!tmp_cdata->cmic.multi_ctrlr) { 5261 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5262 return false; 5263 } 5264 if (cdata->cntlid == tmp_cdata->cntlid) { 5265 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5266 return false; 5267 } 5268 } 5269 5270 return true; 5271 } 5272 5273 static int 5274 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5275 { 5276 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5277 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5278 int rc = 0; 5279 5280 pthread_mutex_lock(&g_bdev_nvme_mutex); 5281 5282 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5283 if (nbdev_ctrlr != NULL) { 5284 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5285 rc = -EINVAL; 5286 goto exit; 5287 } 5288 } else { 5289 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5290 if (nbdev_ctrlr == NULL) { 5291 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5292 rc = -ENOMEM; 5293 goto exit; 5294 } 5295 nbdev_ctrlr->name = strdup(name); 5296 if (nbdev_ctrlr->name == NULL) { 5297 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5298 free(nbdev_ctrlr); 5299 goto exit; 5300 } 5301 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5302 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5303 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5304 } 5305 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5306 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5307 exit: 5308 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5309 return rc; 5310 } 5311 5312 static int 5313 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5314 const char *name, 5315 const struct spdk_nvme_transport_id *trid, 5316 struct nvme_async_probe_ctx *ctx) 5317 { 5318 struct nvme_ctrlr *nvme_ctrlr; 5319 struct nvme_path_id *path_id; 5320 const struct spdk_nvme_ctrlr_data *cdata; 5321 int rc; 5322 5323 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5324 if (nvme_ctrlr == NULL) { 5325 SPDK_ERRLOG("Failed to allocate device struct\n"); 5326 return -ENOMEM; 5327 } 5328 5329 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5330 if (rc != 0) { 5331 free(nvme_ctrlr); 5332 return rc; 5333 } 5334 5335 TAILQ_INIT(&nvme_ctrlr->trids); 5336 RB_INIT(&nvme_ctrlr->namespaces); 5337 5338 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5339 if (ctx != NULL) { 5340 if (ctx->drv_opts.tls_psk != NULL) { 5341 nvme_ctrlr->psk = spdk_keyring_get_key( 5342 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5343 if (nvme_ctrlr->psk == NULL) { 5344 /* Could only happen if the key was removed in the meantime */ 5345 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5346 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5347 rc = -ENOKEY; 5348 goto err; 5349 } 5350 } 5351 5352 if (ctx->drv_opts.dhchap_key != NULL) { 5353 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5354 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5355 if (nvme_ctrlr->dhchap_key == NULL) { 5356 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5357 
spdk_key_get_name(ctx->drv_opts.tls_psk)); 5358 rc = -ENOKEY; 5359 goto err; 5360 } 5361 } 5362 } 5363 5364 path_id = calloc(1, sizeof(*path_id)); 5365 if (path_id == NULL) { 5366 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5367 rc = -ENOMEM; 5368 goto err; 5369 } 5370 5371 path_id->trid = *trid; 5372 if (ctx != NULL) { 5373 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5374 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5375 } 5376 nvme_ctrlr->active_path_id = path_id; 5377 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5378 5379 nvme_ctrlr->thread = spdk_get_thread(); 5380 nvme_ctrlr->ctrlr = ctrlr; 5381 nvme_ctrlr->ref = 1; 5382 5383 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5384 SPDK_ERRLOG("OCSSDs are not supported"); 5385 rc = -ENOTSUP; 5386 goto err; 5387 } 5388 5389 if (ctx != NULL) { 5390 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5391 } else { 5392 bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5393 } 5394 5395 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5396 g_opts.nvme_adminq_poll_period_us); 5397 5398 if (g_opts.timeout_us > 0) { 5399 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5400 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5401 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5402 g_opts.timeout_us : g_opts.timeout_admin_us; 5403 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5404 adm_timeout_us, timeout_cb, nvme_ctrlr); 5405 } 5406 5407 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5408 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5409 5410 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5411 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5412 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5413 } 5414 5415 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5416 if (rc != 0) { 5417 goto err; 5418 } 5419 5420 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5421 5422 if (cdata->cmic.ana_reporting) { 5423 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5424 if (rc == 0) { 5425 return 0; 5426 } 5427 } else { 5428 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5429 return 0; 5430 } 5431 5432 err: 5433 nvme_ctrlr_delete(nvme_ctrlr); 5434 return rc; 5435 } 5436 5437 void 5438 bdev_nvme_get_default_ctrlr_opts(struct nvme_ctrlr_opts *opts) 5439 { 5440 opts->prchk_flags = 0; 5441 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5442 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5443 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5444 } 5445 5446 static void 5447 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5448 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5449 { 5450 char *name; 5451 5452 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5453 if (!name) { 5454 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5455 return; 5456 } 5457 5458 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5459 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5460 } else { 5461 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5462 } 5463 5464 free(name); 5465 } 5466 5467 static void 5468 _nvme_ctrlr_destruct(void *ctx) 5469 { 5470 struct nvme_ctrlr *nvme_ctrlr = ctx; 5471 5472 
nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5473 nvme_ctrlr_release(nvme_ctrlr); 5474 } 5475 5476 static int 5477 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5478 { 5479 struct nvme_probe_skip_entry *entry; 5480 5481 /* The controller's destruction was already started */ 5482 if (nvme_ctrlr->destruct) { 5483 return -EALREADY; 5484 } 5485 5486 if (!hotplug && 5487 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5488 entry = calloc(1, sizeof(*entry)); 5489 if (!entry) { 5490 return -ENOMEM; 5491 } 5492 entry->trid = nvme_ctrlr->active_path_id->trid; 5493 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5494 } 5495 5496 nvme_ctrlr->destruct = true; 5497 return 0; 5498 } 5499 5500 static int 5501 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5502 { 5503 int rc; 5504 5505 pthread_mutex_lock(&nvme_ctrlr->mutex); 5506 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5507 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5508 5509 if (rc == 0) { 5510 _nvme_ctrlr_destruct(nvme_ctrlr); 5511 } else if (rc == -EALREADY) { 5512 rc = 0; 5513 } 5514 5515 return rc; 5516 } 5517 5518 static void 5519 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5520 { 5521 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5522 5523 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5524 } 5525 5526 static int 5527 bdev_nvme_hotplug_probe(void *arg) 5528 { 5529 if (g_hotplug_probe_ctx == NULL) { 5530 spdk_poller_unregister(&g_hotplug_probe_poller); 5531 return SPDK_POLLER_IDLE; 5532 } 5533 5534 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5535 g_hotplug_probe_ctx = NULL; 5536 spdk_poller_unregister(&g_hotplug_probe_poller); 5537 } 5538 5539 return SPDK_POLLER_BUSY; 5540 } 5541 5542 static int 5543 bdev_nvme_hotplug(void *arg) 5544 { 5545 struct spdk_nvme_transport_id trid_pcie; 5546 5547 if (g_hotplug_probe_ctx) { 5548 return SPDK_POLLER_BUSY; 5549 } 5550 5551 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5552 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5553 5554 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5555 hotplug_probe_cb, attach_cb, NULL); 5556 5557 if (g_hotplug_probe_ctx) { 5558 assert(g_hotplug_probe_poller == NULL); 5559 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5560 } 5561 5562 return SPDK_POLLER_BUSY; 5563 } 5564 5565 void 5566 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5567 { 5568 *opts = g_opts; 5569 } 5570 5571 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5572 uint32_t reconnect_delay_sec, 5573 uint32_t fast_io_fail_timeout_sec); 5574 5575 static int 5576 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5577 { 5578 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5579 /* Can't set timeout_admin_us without also setting timeout_us */ 5580 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5581 return -EINVAL; 5582 } 5583 5584 if (opts->bdev_retry_count < -1) { 5585 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5586 return -EINVAL; 5587 } 5588 5589 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5590 opts->reconnect_delay_sec, 5591 opts->fast_io_fail_timeout_sec)) { 5592 return -EINVAL; 5593 } 5594 5595 return 0; 5596 } 5597 5598 int 5599 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5600 { 5601 int ret; 5602 5603 ret = 
bdev_nvme_validate_opts(opts); 5604 if (ret) { 5605 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5606 return ret; 5607 } 5608 5609 if (g_bdev_nvme_init_thread != NULL) { 5610 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5611 return -EPERM; 5612 } 5613 } 5614 5615 if (opts->rdma_srq_size != 0 || 5616 opts->rdma_max_cq_size != 0 || 5617 opts->rdma_cm_event_timeout_ms != 0) { 5618 struct spdk_nvme_transport_opts drv_opts; 5619 5620 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5621 if (opts->rdma_srq_size != 0) { 5622 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5623 } 5624 if (opts->rdma_max_cq_size != 0) { 5625 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5626 } 5627 if (opts->rdma_cm_event_timeout_ms != 0) { 5628 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5629 } 5630 5631 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 5632 if (ret) { 5633 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 5634 return ret; 5635 } 5636 } 5637 5638 g_opts = *opts; 5639 5640 return 0; 5641 } 5642 5643 struct set_nvme_hotplug_ctx { 5644 uint64_t period_us; 5645 bool enabled; 5646 spdk_msg_fn fn; 5647 void *fn_ctx; 5648 }; 5649 5650 static void 5651 set_nvme_hotplug_period_cb(void *_ctx) 5652 { 5653 struct set_nvme_hotplug_ctx *ctx = _ctx; 5654 5655 spdk_poller_unregister(&g_hotplug_poller); 5656 if (ctx->enabled) { 5657 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 5658 } 5659 5660 g_nvme_hotplug_poll_period_us = ctx->period_us; 5661 g_nvme_hotplug_enabled = ctx->enabled; 5662 if (ctx->fn) { 5663 ctx->fn(ctx->fn_ctx); 5664 } 5665 5666 free(ctx); 5667 } 5668 5669 int 5670 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 5671 { 5672 struct set_nvme_hotplug_ctx *ctx; 5673 5674 if (enabled == true && !spdk_process_is_primary()) { 5675 return -EPERM; 5676 } 5677 5678 ctx = calloc(1, sizeof(*ctx)); 5679 if (ctx == NULL) { 5680 return -ENOMEM; 5681 } 5682 5683 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 5684 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 5685 ctx->enabled = enabled; 5686 ctx->fn = cb; 5687 ctx->fn_ctx = cb_ctx; 5688 5689 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 5690 return 0; 5691 } 5692 5693 static void 5694 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 5695 struct nvme_async_probe_ctx *ctx) 5696 { 5697 struct nvme_ns *nvme_ns; 5698 struct nvme_bdev *nvme_bdev; 5699 size_t j; 5700 5701 assert(nvme_ctrlr != NULL); 5702 5703 if (ctx->names == NULL) { 5704 ctx->reported_bdevs = 0; 5705 populate_namespaces_cb(ctx, 0); 5706 return; 5707 } 5708 5709 /* 5710 * Report the new bdevs that were created in this call. 5711 * There can be more than one bdev per NVMe controller. 5712 */ 5713 j = 0; 5714 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5715 while (nvme_ns != NULL) { 5716 nvme_bdev = nvme_ns->bdev; 5717 if (j < ctx->max_bdevs) { 5718 ctx->names[j] = nvme_bdev->disk.name; 5719 j++; 5720 } else { 5721 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. 
Unable to return all names of created bdevs\n", 5722 ctx->max_bdevs); 5723 ctx->reported_bdevs = 0; 5724 populate_namespaces_cb(ctx, -ERANGE); 5725 return; 5726 } 5727 5728 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5729 } 5730 5731 ctx->reported_bdevs = j; 5732 populate_namespaces_cb(ctx, 0); 5733 } 5734 5735 static int 5736 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5737 struct spdk_nvme_ctrlr *new_ctrlr, 5738 struct spdk_nvme_transport_id *trid) 5739 { 5740 struct nvme_path_id *tmp_trid; 5741 5742 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 5743 SPDK_ERRLOG("PCIe failover is not supported.\n"); 5744 return -ENOTSUP; 5745 } 5746 5747 /* Currently we only support failover to the same transport type. */ 5748 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 5749 SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n", 5750 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 5751 spdk_nvme_transport_id_trtype_str(trid->trtype)); 5752 return -EINVAL; 5753 } 5754 5755 5756 /* Currently we only support failover to the same NQN. */ 5757 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 5758 SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 5759 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 5760 return -EINVAL; 5761 } 5762 5763 /* Skip all the other checks if we've already registered this path. */ 5764 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 5765 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 5766 SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr, 5767 trid->subnqn); 5768 return -EEXIST; 5769 } 5770 } 5771 5772 return 0; 5773 } 5774 5775 static int 5776 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 5777 struct spdk_nvme_ctrlr *new_ctrlr) 5778 { 5779 struct nvme_ns *nvme_ns; 5780 struct spdk_nvme_ns *new_ns; 5781 5782 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5783 while (nvme_ns != NULL) { 5784 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 5785 assert(new_ns != NULL); 5786 5787 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 5788 return -EINVAL; 5789 } 5790 5791 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5792 } 5793 5794 return 0; 5795 } 5796 5797 static int 5798 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5799 struct spdk_nvme_transport_id *trid) 5800 { 5801 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 5802 5803 new_trid = calloc(1, sizeof(*new_trid)); 5804 if (new_trid == NULL) { 5805 return -ENOMEM; 5806 } 5807 new_trid->trid = *trid; 5808 5809 active_id = nvme_ctrlr->active_path_id; 5810 assert(active_id != NULL); 5811 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 5812 5813 /* Skip the active trid not to replace it until it is failed. */ 5814 tmp_trid = TAILQ_NEXT(active_id, link); 5815 if (tmp_trid == NULL) { 5816 goto add_tail; 5817 } 5818 5819 /* It means the trid is faled if its last failed time is non-zero. 5820 * Insert the new alternate trid before any failed trid. 
5821 */ 5822 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 5823 if (tmp_trid->last_failed_tsc != 0) { 5824 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 5825 return 0; 5826 } 5827 } 5828 5829 add_tail: 5830 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 5831 return 0; 5832 } 5833 5834 /* This is the case that a secondary path is added to an existing 5835 * nvme_ctrlr for failover. After checking if it can access the same 5836 * namespaces as the primary path, it is disconnected until failover occurs. 5837 */ 5838 static int 5839 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 5840 struct spdk_nvme_ctrlr *new_ctrlr, 5841 struct spdk_nvme_transport_id *trid) 5842 { 5843 int rc; 5844 5845 assert(nvme_ctrlr != NULL); 5846 5847 pthread_mutex_lock(&nvme_ctrlr->mutex); 5848 5849 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 5850 if (rc != 0) { 5851 goto exit; 5852 } 5853 5854 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 5855 if (rc != 0) { 5856 goto exit; 5857 } 5858 5859 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 5860 5861 exit: 5862 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5863 5864 spdk_nvme_detach(new_ctrlr); 5865 5866 return rc; 5867 } 5868 5869 static void 5870 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5871 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 5872 { 5873 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5874 struct nvme_async_probe_ctx *ctx; 5875 int rc; 5876 5877 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5878 ctx->ctrlr_attached = true; 5879 5880 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 5881 if (rc != 0) { 5882 ctx->reported_bdevs = 0; 5883 populate_namespaces_cb(ctx, rc); 5884 } 5885 } 5886 5887 static void 5888 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5889 struct spdk_nvme_ctrlr *ctrlr, 5890 const struct spdk_nvme_ctrlr_opts *opts) 5891 { 5892 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 5893 struct nvme_ctrlr *nvme_ctrlr; 5894 struct nvme_async_probe_ctx *ctx; 5895 int rc; 5896 5897 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 5898 ctx->ctrlr_attached = true; 5899 5900 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 5901 if (nvme_ctrlr) { 5902 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 5903 } else { 5904 rc = -ENODEV; 5905 } 5906 5907 ctx->reported_bdevs = 0; 5908 populate_namespaces_cb(ctx, rc); 5909 } 5910 5911 static int 5912 bdev_nvme_async_poll(void *arg) 5913 { 5914 struct nvme_async_probe_ctx *ctx = arg; 5915 int rc; 5916 5917 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 5918 if (spdk_unlikely(rc != -EAGAIN)) { 5919 ctx->probe_done = true; 5920 spdk_poller_unregister(&ctx->poller); 5921 if (!ctx->ctrlr_attached) { 5922 /* The probe is done, but no controller was attached. 5923 * That means we had a failure, so report -EIO back to 5924 * the caller (usually the RPC). populate_namespaces_cb() 5925 * will take care of freeing the nvme_async_probe_ctx. 5926 */ 5927 ctx->reported_bdevs = 0; 5928 populate_namespaces_cb(ctx, -EIO); 5929 } else if (ctx->namespaces_populated) { 5930 /* The namespaces for the attached controller were all 5931 * populated and the response was already sent to the 5932 * caller (usually the RPC). So free the context here. 
5933 */ 5934 free_nvme_async_probe_ctx(ctx); 5935 } 5936 } 5937 5938 return SPDK_POLLER_BUSY; 5939 } 5940 5941 static bool 5942 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5943 uint32_t reconnect_delay_sec, 5944 uint32_t fast_io_fail_timeout_sec) 5945 { 5946 if (ctrlr_loss_timeout_sec < -1) { 5947 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 5948 return false; 5949 } else if (ctrlr_loss_timeout_sec == -1) { 5950 if (reconnect_delay_sec == 0) { 5951 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5952 return false; 5953 } else if (fast_io_fail_timeout_sec != 0 && 5954 fast_io_fail_timeout_sec < reconnect_delay_sec) { 5955 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5956 return false; 5957 } 5958 } else if (ctrlr_loss_timeout_sec != 0) { 5959 if (reconnect_delay_sec == 0) { 5960 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 5961 return false; 5962 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5963 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5964 return false; 5965 } else if (fast_io_fail_timeout_sec != 0) { 5966 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 5967 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 5968 return false; 5969 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 5970 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 5971 return false; 5972 } 5973 } 5974 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 5975 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 5976 return false; 5977 } 5978 5979 return true; 5980 } 5981 5982 static int 5983 bdev_nvme_load_psk(const char *fname, char *buf, size_t bufsz) 5984 { 5985 FILE *psk_file; 5986 struct stat statbuf; 5987 int rc; 5988 #define TCP_PSK_INVALID_PERMISSIONS 0177 5989 5990 if (stat(fname, &statbuf) != 0) { 5991 SPDK_ERRLOG("Could not read permissions for PSK file\n"); 5992 return -EACCES; 5993 } 5994 5995 if ((statbuf.st_mode & TCP_PSK_INVALID_PERMISSIONS) != 0) { 5996 SPDK_ERRLOG("Incorrect permissions for PSK file\n"); 5997 return -EPERM; 5998 } 5999 if ((size_t)statbuf.st_size >= bufsz) { 6000 SPDK_ERRLOG("Invalid PSK: too long\n"); 6001 return -EINVAL; 6002 } 6003 psk_file = fopen(fname, "r"); 6004 if (psk_file == NULL) { 6005 SPDK_ERRLOG("Could not open PSK file\n"); 6006 return -EINVAL; 6007 } 6008 6009 memset(buf, 0, bufsz); 6010 rc = fread(buf, 1, statbuf.st_size, psk_file); 6011 if (rc != statbuf.st_size) { 6012 SPDK_ERRLOG("Failed to read PSK\n"); 6013 fclose(psk_file); 6014 return -EINVAL; 6015 } 6016 6017 fclose(psk_file); 6018 return 0; 6019 } 6020 6021 int 6022 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6023 const char *base_name, 6024 const char **names, 6025 uint32_t count, 6026 spdk_bdev_create_nvme_fn cb_fn, 6027 void *cb_ctx, 6028 struct spdk_nvme_ctrlr_opts *drv_opts, 6029 struct nvme_ctrlr_opts *bdev_opts, 6030 bool multipath) 6031 { 6032 struct nvme_probe_skip_entry *entry, *tmp; 6033 struct nvme_async_probe_ctx *ctx; 6034 spdk_nvme_attach_cb attach_cb; 6035 int rc, len; 6036 6037 /* TODO expand this check to include both the host and target TRIDs. 6038 * Only if both are the same should we fail. 
6039 */ 6040 if (nvme_ctrlr_get(trid) != NULL) { 6041 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 6042 return -EEXIST; 6043 } 6044 6045 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6046 6047 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6048 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6049 return -EINVAL; 6050 } 6051 6052 if (bdev_opts != NULL && 6053 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6054 bdev_opts->reconnect_delay_sec, 6055 bdev_opts->fast_io_fail_timeout_sec)) { 6056 return -EINVAL; 6057 } 6058 6059 ctx = calloc(1, sizeof(*ctx)); 6060 if (!ctx) { 6061 return -ENOMEM; 6062 } 6063 ctx->base_name = base_name; 6064 ctx->names = names; 6065 ctx->max_bdevs = count; 6066 ctx->cb_fn = cb_fn; 6067 ctx->cb_ctx = cb_ctx; 6068 ctx->trid = *trid; 6069 6070 if (bdev_opts) { 6071 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6072 } else { 6073 bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6074 } 6075 6076 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6077 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6078 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6079 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6080 free(entry); 6081 break; 6082 } 6083 } 6084 } 6085 6086 if (drv_opts) { 6087 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6088 } else { 6089 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->drv_opts, sizeof(ctx->drv_opts)); 6090 } 6091 6092 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6093 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6094 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6095 ctx->drv_opts.disable_read_ana_log_page = true; 6096 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6097 6098 if (ctx->bdev_opts.psk[0] != '\0') { 6099 /* Try to use the keyring first */ 6100 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6101 if (ctx->drv_opts.tls_psk == NULL) { 6102 rc = bdev_nvme_load_psk(ctx->bdev_opts.psk, 6103 ctx->drv_opts.psk, sizeof(ctx->drv_opts.psk)); 6104 if (rc != 0) { 6105 SPDK_ERRLOG("Could not load PSK from %s\n", ctx->bdev_opts.psk); 6106 free_nvme_async_probe_ctx(ctx); 6107 return rc; 6108 } 6109 } 6110 } 6111 6112 if (ctx->bdev_opts.dhchap_key != NULL) { 6113 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6114 if (ctx->drv_opts.dhchap_key == NULL) { 6115 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6116 ctx->bdev_opts.dhchap_key); 6117 free_nvme_async_probe_ctx(ctx); 6118 return -ENOKEY; 6119 } 6120 6121 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6122 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6123 } 6124 6125 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || multipath) { 6126 attach_cb = connect_attach_cb; 6127 } else { 6128 attach_cb = connect_set_failover_cb; 6129 } 6130 6131 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6132 if (ctx->probe_ctx == NULL) { 6133 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6134 free_nvme_async_probe_ctx(ctx); 6135 return -ENODEV; 6136 } 6137 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6138 6139 return 0; 6140 } 6141 6142 struct bdev_nvme_delete_ctx { 6143 char *name; 6144 struct nvme_path_id path_id; 6145 bdev_nvme_delete_done_fn delete_done; 6146 void *delete_done_ctx; 
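/* Used by bdev_nvme_delete_complete_poll() to wait until the deleted path actually
 * disappears, giving up once timeout_ticks has passed.
 */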
6147 uint64_t timeout_ticks; 6148 struct spdk_poller *poller; 6149 }; 6150 6151 static void 6152 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6153 { 6154 if (ctx != NULL) { 6155 free(ctx->name); 6156 free(ctx); 6157 } 6158 } 6159 6160 static bool 6161 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6162 { 6163 if (path_id->trid.trtype != 0) { 6164 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6165 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6166 return false; 6167 } 6168 } else { 6169 if (path_id->trid.trtype != p->trid.trtype) { 6170 return false; 6171 } 6172 } 6173 } 6174 6175 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6176 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6177 return false; 6178 } 6179 } 6180 6181 if (path_id->trid.adrfam != 0) { 6182 if (path_id->trid.adrfam != p->trid.adrfam) { 6183 return false; 6184 } 6185 } 6186 6187 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6188 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6189 return false; 6190 } 6191 } 6192 6193 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6194 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6195 return false; 6196 } 6197 } 6198 6199 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6200 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6201 return false; 6202 } 6203 } 6204 6205 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6206 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6207 return false; 6208 } 6209 } 6210 6211 return true; 6212 } 6213 6214 static bool 6215 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6216 { 6217 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6218 struct nvme_ctrlr *ctrlr; 6219 struct nvme_path_id *p; 6220 6221 pthread_mutex_lock(&g_bdev_nvme_mutex); 6222 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6223 if (!nbdev_ctrlr) { 6224 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6225 return false; 6226 } 6227 6228 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6229 pthread_mutex_lock(&ctrlr->mutex); 6230 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6231 if (nvme_path_id_compare(p, path_id)) { 6232 pthread_mutex_unlock(&ctrlr->mutex); 6233 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6234 return true; 6235 } 6236 } 6237 pthread_mutex_unlock(&ctrlr->mutex); 6238 } 6239 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6240 6241 return false; 6242 } 6243 6244 static int 6245 bdev_nvme_delete_complete_poll(void *arg) 6246 { 6247 struct bdev_nvme_delete_ctx *ctx = arg; 6248 int rc = 0; 6249 6250 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6251 if (ctx->timeout_ticks > spdk_get_ticks()) { 6252 return SPDK_POLLER_BUSY; 6253 } 6254 6255 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6256 rc = -ETIMEDOUT; 6257 } 6258 6259 spdk_poller_unregister(&ctx->poller); 6260 6261 ctx->delete_done(ctx->delete_done_ctx, rc); 6262 free_bdev_nvme_delete_ctx(ctx); 6263 6264 return SPDK_POLLER_BUSY; 6265 } 6266 6267 static int 6268 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6269 { 6270 struct nvme_path_id *p, *t; 6271 spdk_msg_fn msg_fn; 6272 int rc = -ENXIO; 6273 6274 pthread_mutex_lock(&nvme_ctrlr->mutex); 6275 6276 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6277 if (p == 
TAILQ_FIRST(&nvme_ctrlr->trids)) { 6278 break; 6279 } 6280 6281 if (!nvme_path_id_compare(p, path_id)) { 6282 continue; 6283 } 6284 6285 /* We are not using the specified path. */ 6286 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6287 free(p); 6288 rc = 0; 6289 } 6290 6291 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6292 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6293 return rc; 6294 } 6295 6296 /* If we made it here, then this path is a match! Now we need to remove it. */ 6297 6298 /* This is the active path in use right now. The active path is always the first in the list. */ 6299 assert(p == nvme_ctrlr->active_path_id); 6300 6301 if (!TAILQ_NEXT(p, link)) { 6302 /* The current path is the only path. */ 6303 msg_fn = _nvme_ctrlr_destruct; 6304 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6305 } else { 6306 /* There is an alternative path. */ 6307 msg_fn = _bdev_nvme_reset_ctrlr; 6308 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6309 } 6310 6311 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6312 6313 if (rc == 0) { 6314 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6315 } else if (rc == -EALREADY) { 6316 rc = 0; 6317 } 6318 6319 return rc; 6320 } 6321 6322 int 6323 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6324 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6325 { 6326 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6327 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6328 struct bdev_nvme_delete_ctx *ctx = NULL; 6329 int rc = -ENXIO, _rc; 6330 6331 if (name == NULL || path_id == NULL) { 6332 rc = -EINVAL; 6333 goto exit; 6334 } 6335 6336 pthread_mutex_lock(&g_bdev_nvme_mutex); 6337 6338 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6339 if (nbdev_ctrlr == NULL) { 6340 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6341 6342 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6343 rc = -ENODEV; 6344 goto exit; 6345 } 6346 6347 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6348 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6349 if (_rc < 0 && _rc != -ENXIO) { 6350 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6351 rc = _rc; 6352 goto exit; 6353 } else if (_rc == 0) { 6354 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6355 * was deleted successfully. To remember the successful deletion, 6356 * overwrite rc only if _rc is zero. 6357 */ 6358 rc = 0; 6359 } 6360 } 6361 6362 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6363 6364 if (rc != 0 || delete_done == NULL) { 6365 goto exit; 6366 } 6367 6368 ctx = calloc(1, sizeof(*ctx)); 6369 if (ctx == NULL) { 6370 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6371 rc = -ENOMEM; 6372 goto exit; 6373 } 6374 6375 ctx->name = strdup(name); 6376 if (ctx->name == NULL) { 6377 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6378 rc = -ENOMEM; 6379 goto exit; 6380 } 6381 6382 ctx->delete_done = delete_done; 6383 ctx->delete_done_ctx = delete_done_ctx; 6384 ctx->path_id = *path_id; 6385 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6386 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6387 if (ctx->poller == NULL) { 6388 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6389 rc = -ENOMEM; 6390 goto exit; 6391 } 6392 6393 exit: 6394 if (rc != 0) { 6395 free_bdev_nvme_delete_ctx(ctx); 6396 } 6397 6398 return rc; 6399 } 6400 6401 #define DISCOVERY_INFOLOG(ctx, format, ...) 
\ 6402 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6403 6404 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6405 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6406 6407 struct discovery_entry_ctx { 6408 char name[128]; 6409 struct spdk_nvme_transport_id trid; 6410 struct spdk_nvme_ctrlr_opts drv_opts; 6411 struct spdk_nvmf_discovery_log_page_entry entry; 6412 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6413 struct discovery_ctx *ctx; 6414 }; 6415 6416 struct discovery_ctx { 6417 char *name; 6418 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6419 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6420 void *cb_ctx; 6421 struct spdk_nvme_probe_ctx *probe_ctx; 6422 struct spdk_nvme_detach_ctx *detach_ctx; 6423 struct spdk_nvme_ctrlr *ctrlr; 6424 struct spdk_nvme_transport_id trid; 6425 struct discovery_entry_ctx *entry_ctx_in_use; 6426 struct spdk_poller *poller; 6427 struct spdk_nvme_ctrlr_opts drv_opts; 6428 struct nvme_ctrlr_opts bdev_opts; 6429 struct spdk_nvmf_discovery_log_page *log_page; 6430 TAILQ_ENTRY(discovery_ctx) tailq; 6431 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6432 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6433 int rc; 6434 bool wait_for_attach; 6435 uint64_t timeout_ticks; 6436 /* Denotes that the discovery service is being started. We're waiting 6437 * for the initial connection to the discovery controller to be 6438 * established and attach discovered NVM ctrlrs. 6439 */ 6440 bool initializing; 6441 /* Denotes if a discovery is currently in progress for this context. 6442 * That includes connecting to newly discovered subsystems. Used to 6443 * ensure we do not start a new discovery until an existing one is 6444 * complete. 6445 */ 6446 bool in_progress; 6447 6448 /* Denotes if another discovery is needed after the one in progress 6449 * completes. Set when we receive an AER completion while a discovery 6450 * is already in progress. 6451 */ 6452 bool pending; 6453 6454 /* Signal to the discovery context poller that it should stop the 6455 * discovery service, including detaching from the current discovery 6456 * controller. 6457 */ 6458 bool stop; 6459 6460 struct spdk_thread *calling_thread; 6461 uint32_t index; 6462 uint32_t attach_in_progress; 6463 char *hostnqn; 6464 6465 /* Denotes if the discovery service was started by the mdns discovery. 
6466 */ 6467 bool from_mdns_discovery_service; 6468 }; 6469 6470 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6471 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6472 6473 static void get_discovery_log_page(struct discovery_ctx *ctx); 6474 6475 static void 6476 free_discovery_ctx(struct discovery_ctx *ctx) 6477 { 6478 free(ctx->log_page); 6479 free(ctx->hostnqn); 6480 free(ctx->name); 6481 free(ctx); 6482 } 6483 6484 static void 6485 discovery_complete(struct discovery_ctx *ctx) 6486 { 6487 ctx->initializing = false; 6488 ctx->in_progress = false; 6489 if (ctx->pending) { 6490 ctx->pending = false; 6491 get_discovery_log_page(ctx); 6492 } 6493 } 6494 6495 static void 6496 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6497 struct spdk_nvmf_discovery_log_page_entry *entry) 6498 { 6499 char *space; 6500 6501 trid->trtype = entry->trtype; 6502 trid->adrfam = entry->adrfam; 6503 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6504 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6505 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6506 * before call to this function trid->subnqn is zeroed out, we need 6507 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6508 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6509 */ 6510 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6511 6512 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6513 * But the log page entries typically pad them with spaces, not zeroes. 6514 * So add a NULL terminator to each of these fields at the appropriate 6515 * location. 6516 */ 6517 space = strchr(trid->traddr, ' '); 6518 if (space) { 6519 *space = 0; 6520 } 6521 space = strchr(trid->trsvcid, ' '); 6522 if (space) { 6523 *space = 0; 6524 } 6525 space = strchr(trid->subnqn, ' '); 6526 if (space) { 6527 *space = 0; 6528 } 6529 } 6530 6531 static void 6532 _stop_discovery(void *_ctx) 6533 { 6534 struct discovery_ctx *ctx = _ctx; 6535 6536 if (ctx->attach_in_progress > 0) { 6537 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6538 return; 6539 } 6540 6541 ctx->stop = true; 6542 6543 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6544 struct discovery_entry_ctx *entry_ctx; 6545 struct nvme_path_id path = {}; 6546 6547 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6548 path.trid = entry_ctx->trid; 6549 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6550 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6551 free(entry_ctx); 6552 } 6553 6554 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6555 struct discovery_entry_ctx *entry_ctx; 6556 6557 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6558 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6559 free(entry_ctx); 6560 } 6561 6562 free(ctx->entry_ctx_in_use); 6563 ctx->entry_ctx_in_use = NULL; 6564 } 6565 6566 static void 6567 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6568 { 6569 ctx->stop_cb_fn = cb_fn; 6570 ctx->cb_ctx = cb_ctx; 6571 6572 if (ctx->attach_in_progress > 0) { 6573 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6574 ctx->attach_in_progress); 6575 } 6576 6577 _stop_discovery(ctx); 6578 } 6579 6580 static void 6581 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6582 { 6583 struct discovery_ctx *d_ctx; 6584 struct nvme_path_id *path_id; 6585 struct spdk_nvme_transport_id 
trid = {}; 6586 struct discovery_entry_ctx *entry_ctx, *tmp; 6587 6588 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6589 6590 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6591 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6592 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6593 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6594 continue; 6595 } 6596 6597 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6598 free(entry_ctx); 6599 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6600 trid.subnqn, trid.traddr, trid.trsvcid); 6601 6602 /* Fail discovery ctrlr to force reattach attempt */ 6603 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6604 } 6605 } 6606 } 6607 6608 static void 6609 discovery_remove_controllers(struct discovery_ctx *ctx) 6610 { 6611 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6612 struct discovery_entry_ctx *entry_ctx, *tmp; 6613 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6614 struct spdk_nvme_transport_id old_trid = {}; 6615 uint64_t numrec, i; 6616 bool found; 6617 6618 numrec = from_le64(&log_page->numrec); 6619 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6620 found = false; 6621 old_entry = &entry_ctx->entry; 6622 build_trid_from_log_page_entry(&old_trid, old_entry); 6623 for (i = 0; i < numrec; i++) { 6624 new_entry = &log_page->entries[i]; 6625 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6626 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6627 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6628 found = true; 6629 break; 6630 } 6631 } 6632 if (!found) { 6633 struct nvme_path_id path = {}; 6634 6635 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6636 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6637 6638 path.trid = entry_ctx->trid; 6639 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6640 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6641 free(entry_ctx); 6642 } 6643 } 6644 free(log_page); 6645 ctx->log_page = NULL; 6646 discovery_complete(ctx); 6647 } 6648 6649 static void 6650 complete_discovery_start(struct discovery_ctx *ctx, int status) 6651 { 6652 ctx->timeout_ticks = 0; 6653 ctx->rc = status; 6654 if (ctx->start_cb_fn) { 6655 ctx->start_cb_fn(ctx->cb_ctx, status); 6656 ctx->start_cb_fn = NULL; 6657 ctx->cb_ctx = NULL; 6658 } 6659 } 6660 6661 static void 6662 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6663 { 6664 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6665 struct discovery_ctx *ctx = entry_ctx->ctx; 6666 6667 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6668 ctx->attach_in_progress--; 6669 if (ctx->attach_in_progress == 0) { 6670 complete_discovery_start(ctx, ctx->rc); 6671 if (ctx->initializing && ctx->rc != 0) { 6672 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6673 stop_discovery(ctx, NULL, ctx->cb_ctx); 6674 } else { 6675 discovery_remove_controllers(ctx); 6676 } 6677 } 6678 } 6679 6680 static struct discovery_entry_ctx * 6681 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6682 { 6683 struct discovery_entry_ctx *new_ctx; 6684 6685 new_ctx = calloc(1, sizeof(*new_ctx)); 6686 if (new_ctx == NULL) { 6687 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6688 return NULL; 6689 } 6690 6691 new_ctx->ctx = ctx; 6692 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6693 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
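/* Each discovery entry inherits the discovery service's hostnqn so that discovered
 * subsystems are connected with the same host identity.
 */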
6694 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6695 return new_ctx; 6696 } 6697 6698 static void 6699 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 6700 struct spdk_nvmf_discovery_log_page *log_page) 6701 { 6702 struct discovery_ctx *ctx = cb_arg; 6703 struct discovery_entry_ctx *entry_ctx, *tmp; 6704 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6705 uint64_t numrec, i; 6706 bool found; 6707 6708 if (rc || spdk_nvme_cpl_is_error(cpl)) { 6709 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6710 return; 6711 } 6712 6713 ctx->log_page = log_page; 6714 assert(ctx->attach_in_progress == 0); 6715 numrec = from_le64(&log_page->numrec); 6716 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 6717 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6718 free(entry_ctx); 6719 } 6720 for (i = 0; i < numrec; i++) { 6721 found = false; 6722 new_entry = &log_page->entries[i]; 6723 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 6724 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 6725 struct discovery_entry_ctx *new_ctx; 6726 struct spdk_nvme_transport_id trid = {}; 6727 6728 build_trid_from_log_page_entry(&trid, new_entry); 6729 new_ctx = create_discovery_entry_ctx(ctx, &trid); 6730 if (new_ctx == NULL) { 6731 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6732 break; 6733 } 6734 6735 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 6736 continue; 6737 } 6738 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 6739 old_entry = &entry_ctx->entry; 6740 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 6741 found = true; 6742 break; 6743 } 6744 } 6745 if (!found) { 6746 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 6747 struct discovery_ctx *d_ctx; 6748 6749 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6750 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 6751 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 6752 sizeof(new_entry->subnqn))) { 6753 break; 6754 } 6755 } 6756 if (subnqn_ctx) { 6757 break; 6758 } 6759 } 6760 6761 new_ctx = calloc(1, sizeof(*new_ctx)); 6762 if (new_ctx == NULL) { 6763 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6764 break; 6765 } 6766 6767 new_ctx->ctx = ctx; 6768 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 6769 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 6770 if (subnqn_ctx) { 6771 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 6772 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 6773 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6774 new_ctx->name); 6775 } else { 6776 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 6777 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 6778 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 6779 new_ctx->name); 6780 } 6781 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 6782 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 6783 rc = bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 6784 discovery_attach_controller_done, new_ctx, 6785 &new_ctx->drv_opts, &ctx->bdev_opts, true); 6786 if (rc == 0) { 6787 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 6788 ctx->attach_in_progress++; 6789 } else { 6790 DISCOVERY_ERRLOG(ctx, "bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 6791 } 6792 } 6793 } 6794 6795 if (ctx->attach_in_progress == 0) { 6796 discovery_remove_controllers(ctx); 6797 } 6798 } 6799 6800 static void 6801 get_discovery_log_page(struct discovery_ctx *ctx) 6802 { 6803 int rc; 6804 6805 assert(ctx->in_progress == false); 6806 ctx->in_progress = true; 6807 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 6808 if (rc != 0) { 6809 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 6810 } 6811 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 6812 } 6813 6814 static void 6815 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 6816 { 6817 struct discovery_ctx *ctx = arg; 6818 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 6819 6820 if (spdk_nvme_cpl_is_error(cpl)) { 6821 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 6822 return; 6823 } 6824 6825 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 6826 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 6827 return; 6828 } 6829 6830 DISCOVERY_INFOLOG(ctx, "got aer\n"); 6831 if (ctx->in_progress) { 6832 ctx->pending = true; 6833 return; 6834 } 6835 6836 get_discovery_log_page(ctx); 6837 } 6838 6839 static void 6840 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6841 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6842 { 6843 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6844 struct discovery_ctx *ctx; 6845 6846 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 6847 6848 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 6849 ctx->probe_ctx = NULL; 6850 ctx->ctrlr = ctrlr; 6851 6852 if (ctx->rc != 0) { 6853 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 6854 ctx->rc); 6855 return; 6856 } 6857 6858 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 6859 } 6860 6861 static int 6862 discovery_poller(void *arg) 6863 { 6864 struct discovery_ctx *ctx = arg; 6865 struct spdk_nvme_transport_id *trid; 6866 int rc; 6867 6868 if (ctx->detach_ctx) { 6869 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 6870 if (rc != -EAGAIN) { 6871 ctx->detach_ctx = NULL; 6872 ctx->ctrlr = NULL; 6873 } 6874 } else if (ctx->stop) { 6875 if (ctx->ctrlr != NULL) { 6876 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6877 if (rc == 0) { 6878 return SPDK_POLLER_BUSY; 6879 } 6880 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6881 } 6882 spdk_poller_unregister(&ctx->poller); 6883 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6884 assert(ctx->start_cb_fn == NULL); 6885 if (ctx->stop_cb_fn != NULL) { 6886 ctx->stop_cb_fn(ctx->cb_ctx); 6887 } 6888 free_discovery_ctx(ctx); 6889 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 6890 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6891 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6892 assert(ctx->initializing); 6893 spdk_poller_unregister(&ctx->poller); 6894 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 6895 complete_discovery_start(ctx, -ETIMEDOUT); 6896 stop_discovery(ctx, NULL, NULL); 6897 free_discovery_ctx(ctx); 6898 return SPDK_POLLER_BUSY; 6899 } 6900 6901 assert(ctx->entry_ctx_in_use == NULL); 6902 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6903 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6904 trid = &ctx->entry_ctx_in_use->trid; 6905 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 6906 if 
(ctx->probe_ctx) { 6907 spdk_poller_unregister(&ctx->poller); 6908 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 6909 } else { 6910 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 6911 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6912 ctx->entry_ctx_in_use = NULL; 6913 } 6914 } else if (ctx->probe_ctx) { 6915 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6916 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 6917 complete_discovery_start(ctx, -ETIMEDOUT); 6918 return SPDK_POLLER_BUSY; 6919 } 6920 6921 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6922 if (rc != -EAGAIN) { 6923 if (ctx->rc != 0) { 6924 assert(ctx->initializing); 6925 stop_discovery(ctx, NULL, ctx->cb_ctx); 6926 } else { 6927 assert(rc == 0); 6928 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 6929 ctx->rc = rc; 6930 get_discovery_log_page(ctx); 6931 } 6932 } 6933 } else { 6934 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 6935 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 6936 complete_discovery_start(ctx, -ETIMEDOUT); 6937 /* We need to wait until all NVM ctrlrs are attached before we stop the 6938 * discovery service to make sure we don't detach a ctrlr that is still 6939 * being attached. 6940 */ 6941 if (ctx->attach_in_progress == 0) { 6942 stop_discovery(ctx, NULL, ctx->cb_ctx); 6943 return SPDK_POLLER_BUSY; 6944 } 6945 } 6946 6947 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 6948 if (rc < 0) { 6949 spdk_poller_unregister(&ctx->poller); 6950 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6951 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 6952 ctx->entry_ctx_in_use = NULL; 6953 6954 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 6955 if (rc != 0) { 6956 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 6957 ctx->ctrlr = NULL; 6958 } 6959 } 6960 } 6961 6962 return SPDK_POLLER_BUSY; 6963 } 6964 6965 static void 6966 start_discovery_poller(void *arg) 6967 { 6968 struct discovery_ctx *ctx = arg; 6969 6970 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 6971 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 6972 } 6973 6974 int 6975 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 6976 const char *base_name, 6977 struct spdk_nvme_ctrlr_opts *drv_opts, 6978 struct nvme_ctrlr_opts *bdev_opts, 6979 uint64_t attach_timeout, 6980 bool from_mdns, 6981 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 6982 { 6983 struct discovery_ctx *ctx; 6984 struct discovery_entry_ctx *discovery_entry_ctx; 6985 6986 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 6987 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 6988 if (strcmp(ctx->name, base_name) == 0) { 6989 return -EEXIST; 6990 } 6991 6992 if (ctx->entry_ctx_in_use != NULL) { 6993 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 6994 return -EEXIST; 6995 } 6996 } 6997 6998 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 6999 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7000 return -EEXIST; 7001 } 7002 } 7003 } 7004 7005 ctx = calloc(1, sizeof(*ctx)); 7006 if (ctx == NULL) { 7007 return -ENOMEM; 7008 } 7009 7010 ctx->name = strdup(base_name); 7011 if (ctx->name == NULL) { 7012 free_discovery_ctx(ctx); 7013 return -ENOMEM; 7014 } 7015 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 
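/* Keep private copies of the caller's options; ctx->bdev_opts is reused for every
 * subsystem attached by this discovery service.
 */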
7016 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7017 ctx->from_mdns_discovery_service = from_mdns; 7018 ctx->bdev_opts.from_discovery_service = true; 7019 ctx->calling_thread = spdk_get_thread(); 7020 ctx->start_cb_fn = cb_fn; 7021 ctx->cb_ctx = cb_ctx; 7022 ctx->initializing = true; 7023 if (ctx->start_cb_fn) { 7024 /* We can use this when dumping json to denote if this RPC parameter 7025 * was specified or not. 7026 */ 7027 ctx->wait_for_attach = true; 7028 } 7029 if (attach_timeout != 0) { 7030 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7031 spdk_get_ticks_hz() / 1000ull; 7032 } 7033 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7034 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7035 memcpy(&ctx->trid, trid, sizeof(*trid)); 7036 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7037 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7038 if (ctx->hostnqn == NULL) { 7039 free_discovery_ctx(ctx); 7040 return -ENOMEM; 7041 } 7042 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7043 if (discovery_entry_ctx == NULL) { 7044 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7045 free_discovery_ctx(ctx); 7046 return -ENOMEM; 7047 } 7048 7049 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7050 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7051 return 0; 7052 } 7053 7054 int 7055 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7056 { 7057 struct discovery_ctx *ctx; 7058 7059 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7060 if (strcmp(name, ctx->name) == 0) { 7061 if (ctx->stop) { 7062 return -EALREADY; 7063 } 7064 /* If we're still starting the discovery service and ->rc is non-zero, we're 7065 * going to stop it as soon as we can 7066 */ 7067 if (ctx->initializing && ctx->rc != 0) { 7068 return -EALREADY; 7069 } 7070 stop_discovery(ctx, cb_fn, cb_ctx); 7071 return 0; 7072 } 7073 } 7074 7075 return -ENOENT; 7076 } 7077 7078 static int 7079 bdev_nvme_library_init(void) 7080 { 7081 g_bdev_nvme_init_thread = spdk_get_thread(); 7082 7083 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7084 bdev_nvme_destroy_poll_group_cb, 7085 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7086 7087 return 0; 7088 } 7089 7090 static void 7091 bdev_nvme_fini_destruct_ctrlrs(void) 7092 { 7093 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7094 struct nvme_ctrlr *nvme_ctrlr; 7095 7096 pthread_mutex_lock(&g_bdev_nvme_mutex); 7097 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7098 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7099 pthread_mutex_lock(&nvme_ctrlr->mutex); 7100 if (nvme_ctrlr->destruct) { 7101 /* This controller's destruction was already started 7102 * before the application started shutting down 7103 */ 7104 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7105 continue; 7106 } 7107 nvme_ctrlr->destruct = true; 7108 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7109 7110 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7111 nvme_ctrlr); 7112 } 7113 } 7114 7115 g_bdev_nvme_module_finish = true; 7116 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7117 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7118 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7119 spdk_bdev_module_fini_done(); 7120 return; 7121 } 7122 7123 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7124 } 7125 7126 static void 7127 check_discovery_fini(void *arg) 7128 { 7129 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7130 bdev_nvme_fini_destruct_ctrlrs(); 
7131 } 7132 } 7133 7134 static void 7135 bdev_nvme_library_fini(void) 7136 { 7137 struct nvme_probe_skip_entry *entry, *entry_tmp; 7138 struct discovery_ctx *ctx; 7139 7140 spdk_poller_unregister(&g_hotplug_poller); 7141 free(g_hotplug_probe_ctx); 7142 g_hotplug_probe_ctx = NULL; 7143 7144 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7145 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7146 free(entry); 7147 } 7148 7149 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7150 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7151 bdev_nvme_fini_destruct_ctrlrs(); 7152 } else { 7153 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7154 stop_discovery(ctx, check_discovery_fini, NULL); 7155 } 7156 } 7157 } 7158 7159 static void 7160 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7161 { 7162 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7163 struct spdk_bdev *bdev = bdev_io->bdev; 7164 struct spdk_dif_ctx dif_ctx; 7165 struct spdk_dif_error err_blk = {}; 7166 int rc; 7167 struct spdk_dif_ctx_init_ext_opts dif_opts; 7168 7169 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7170 dif_opts.dif_pi_format = SPDK_DIF_PI_FORMAT_16; 7171 rc = spdk_dif_ctx_init(&dif_ctx, 7172 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7173 bdev->dif_is_head_of_md, bdev->dif_type, 7174 bdev_io->u.bdev.dif_check_flags, 7175 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7176 if (rc != 0) { 7177 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7178 return; 7179 } 7180 7181 if (bdev->md_interleave) { 7182 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7183 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7184 } else { 7185 struct iovec md_iov = { 7186 .iov_base = bdev_io->u.bdev.md_buf, 7187 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7188 }; 7189 7190 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7191 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7192 } 7193 7194 if (rc != 0) { 7195 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7196 err_blk.err_type, err_blk.err_offset); 7197 } else { 7198 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7199 } 7200 } 7201 7202 static void 7203 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7204 { 7205 struct nvme_bdev_io *bio = ref; 7206 7207 if (spdk_nvme_cpl_is_success(cpl)) { 7208 /* Run PI verification for read data buffer. */ 7209 bdev_nvme_verify_pi_error(bio); 7210 } 7211 7212 /* Return original completion status */ 7213 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7214 } 7215 7216 static void 7217 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7218 { 7219 struct nvme_bdev_io *bio = ref; 7220 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7221 int ret; 7222 7223 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7224 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7225 cpl->status.sct, cpl->status.sc); 7226 7227 /* Save completion status to use after verifying PI error. */ 7228 bio->cpl = *cpl; 7229 7230 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7231 /* Read without PI checking to verify PI error. 
*/ 7232 ret = bdev_nvme_no_pi_readv(bio, 7233 bdev_io->u.bdev.iovs, 7234 bdev_io->u.bdev.iovcnt, 7235 bdev_io->u.bdev.md_buf, 7236 bdev_io->u.bdev.num_blocks, 7237 bdev_io->u.bdev.offset_blocks); 7238 if (ret == 0) { 7239 return; 7240 } 7241 } 7242 } 7243 7244 bdev_nvme_io_complete_nvme_status(bio, cpl); 7245 } 7246 7247 static void 7248 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7249 { 7250 struct nvme_bdev_io *bio = ref; 7251 7252 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7253 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7254 cpl->status.sct, cpl->status.sc); 7255 /* Run PI verification for write data buffer if PI error is detected. */ 7256 bdev_nvme_verify_pi_error(bio); 7257 } 7258 7259 bdev_nvme_io_complete_nvme_status(bio, cpl); 7260 } 7261 7262 static void 7263 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7264 { 7265 struct nvme_bdev_io *bio = ref; 7266 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7267 7268 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7269 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7270 */ 7271 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7272 7273 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7274 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7275 cpl->status.sct, cpl->status.sc); 7276 /* Run PI verification for zone append data buffer if PI error is detected. */ 7277 bdev_nvme_verify_pi_error(bio); 7278 } 7279 7280 bdev_nvme_io_complete_nvme_status(bio, cpl); 7281 } 7282 7283 static void 7284 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7285 { 7286 struct nvme_bdev_io *bio = ref; 7287 7288 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7289 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7290 cpl->status.sct, cpl->status.sc); 7291 /* Run PI verification for compare data buffer if PI error is detected. */ 7292 bdev_nvme_verify_pi_error(bio); 7293 } 7294 7295 bdev_nvme_io_complete_nvme_status(bio, cpl); 7296 } 7297 7298 static void 7299 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7300 { 7301 struct nvme_bdev_io *bio = ref; 7302 7303 /* Compare operation completion */ 7304 if (!bio->first_fused_completed) { 7305 /* Save compare result for write callback */ 7306 bio->cpl = *cpl; 7307 bio->first_fused_completed = true; 7308 return; 7309 } 7310 7311 /* Write operation completion */ 7312 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7313 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7314 * complete the IO with the compare operation's status. 
7315 */ 7316 if (!spdk_nvme_cpl_is_error(cpl)) { 7317 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7318 } 7319 7320 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7321 } else { 7322 bdev_nvme_io_complete_nvme_status(bio, cpl); 7323 } 7324 } 7325 7326 static void 7327 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7328 { 7329 struct nvme_bdev_io *bio = ref; 7330 7331 bdev_nvme_io_complete_nvme_status(bio, cpl); 7332 } 7333 7334 static int 7335 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7336 { 7337 switch (desc->zt) { 7338 case SPDK_NVME_ZONE_TYPE_SEQWR: 7339 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7340 break; 7341 default: 7342 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7343 return -EIO; 7344 } 7345 7346 switch (desc->zs) { 7347 case SPDK_NVME_ZONE_STATE_EMPTY: 7348 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7349 break; 7350 case SPDK_NVME_ZONE_STATE_IOPEN: 7351 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7352 break; 7353 case SPDK_NVME_ZONE_STATE_EOPEN: 7354 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7355 break; 7356 case SPDK_NVME_ZONE_STATE_CLOSED: 7357 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7358 break; 7359 case SPDK_NVME_ZONE_STATE_RONLY: 7360 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7361 break; 7362 case SPDK_NVME_ZONE_STATE_FULL: 7363 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7364 break; 7365 case SPDK_NVME_ZONE_STATE_OFFLINE: 7366 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7367 break; 7368 default: 7369 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7370 return -EIO; 7371 } 7372 7373 info->zone_id = desc->zslba; 7374 info->write_pointer = desc->wp; 7375 info->capacity = desc->zcap; 7376 7377 return 0; 7378 } 7379 7380 static void 7381 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7382 { 7383 struct nvme_bdev_io *bio = ref; 7384 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7385 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7386 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7387 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7388 uint64_t max_zones_per_buf, i; 7389 uint32_t zone_report_bufsize; 7390 struct spdk_nvme_ns *ns; 7391 struct spdk_nvme_qpair *qpair; 7392 int ret; 7393 7394 if (spdk_nvme_cpl_is_error(cpl)) { 7395 goto out_complete_io_nvme_cpl; 7396 } 7397 7398 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7399 ret = -ENXIO; 7400 goto out_complete_io_ret; 7401 } 7402 7403 ns = bio->io_path->nvme_ns->ns; 7404 qpair = bio->io_path->qpair->qpair; 7405 7406 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7407 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7408 sizeof(bio->zone_report_buf->descs[0]); 7409 7410 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7411 ret = -EINVAL; 7412 goto out_complete_io_ret; 7413 } 7414 7415 if (!bio->zone_report_buf->nr_zones) { 7416 ret = -EINVAL; 7417 goto out_complete_io_ret; 7418 } 7419 7420 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7421 ret = fill_zone_from_report(&info[bio->handled_zones], 7422 &bio->zone_report_buf->descs[i]); 7423 if (ret) { 7424 goto out_complete_io_ret; 7425 } 7426 bio->handled_zones++; 7427 } 7428 7429 if (bio->handled_zones < zones_to_copy) { 7430 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7431 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7432 
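/* More zones were requested than fit in a single report; clear the buffer and
 * fetch the next batch starting at slba.
 */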
7433 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7434 ret = spdk_nvme_zns_report_zones(ns, qpair, 7435 bio->zone_report_buf, zone_report_bufsize, 7436 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7437 bdev_nvme_get_zone_info_done, bio); 7438 if (!ret) { 7439 return; 7440 } else { 7441 goto out_complete_io_ret; 7442 } 7443 } 7444 7445 out_complete_io_nvme_cpl: 7446 free(bio->zone_report_buf); 7447 bio->zone_report_buf = NULL; 7448 bdev_nvme_io_complete_nvme_status(bio, cpl); 7449 return; 7450 7451 out_complete_io_ret: 7452 free(bio->zone_report_buf); 7453 bio->zone_report_buf = NULL; 7454 bdev_nvme_io_complete(bio, ret); 7455 } 7456 7457 static void 7458 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7459 { 7460 struct nvme_bdev_io *bio = ref; 7461 7462 bdev_nvme_io_complete_nvme_status(bio, cpl); 7463 } 7464 7465 static void 7466 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7467 { 7468 struct nvme_bdev_io *bio = ctx; 7469 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7470 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7471 7472 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7473 7474 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7475 } 7476 7477 static void 7478 bdev_nvme_abort_complete(void *ctx) 7479 { 7480 struct nvme_bdev_io *bio = ctx; 7481 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7482 7483 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7484 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7485 } else { 7486 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7487 } 7488 } 7489 7490 static void 7491 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7492 { 7493 struct nvme_bdev_io *bio = ref; 7494 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7495 7496 bio->cpl = *cpl; 7497 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7498 } 7499 7500 static void 7501 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7502 { 7503 struct nvme_bdev_io *bio = ref; 7504 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7505 7506 bio->cpl = *cpl; 7507 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7508 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7509 } 7510 7511 static void 7512 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7513 { 7514 struct nvme_bdev_io *bio = ref; 7515 struct iovec *iov; 7516 7517 bio->iov_offset = sgl_offset; 7518 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7519 iov = &bio->iovs[bio->iovpos]; 7520 if (bio->iov_offset < iov->iov_len) { 7521 break; 7522 } 7523 7524 bio->iov_offset -= iov->iov_len; 7525 } 7526 } 7527 7528 static int 7529 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7530 { 7531 struct nvme_bdev_io *bio = ref; 7532 struct iovec *iov; 7533 7534 assert(bio->iovpos < bio->iovcnt); 7535 7536 iov = &bio->iovs[bio->iovpos]; 7537 7538 *address = iov->iov_base; 7539 *length = iov->iov_len; 7540 7541 if (bio->iov_offset) { 7542 assert(bio->iov_offset <= iov->iov_len); 7543 *address += bio->iov_offset; 7544 *length -= bio->iov_offset; 7545 } 7546 7547 bio->iov_offset += *length; 7548 if (bio->iov_offset == iov->iov_len) { 7549 bio->iovpos++; 7550 bio->iov_offset = 0; 7551 } 7552 7553 return 0; 7554 } 7555 7556 static void 7557 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7558 { 7559 struct nvme_bdev_io *bio = ref; 7560 struct iovec *iov; 7561 7562 bio->fused_iov_offset = sgl_offset; 
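/* Walk the fused iovec array until sgl_offset falls inside the current element. */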
7563 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7564 iov = &bio->fused_iovs[bio->fused_iovpos]; 7565 if (bio->fused_iov_offset < iov->iov_len) { 7566 break; 7567 } 7568 7569 bio->fused_iov_offset -= iov->iov_len; 7570 } 7571 } 7572 7573 static int 7574 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7575 { 7576 struct nvme_bdev_io *bio = ref; 7577 struct iovec *iov; 7578 7579 assert(bio->fused_iovpos < bio->fused_iovcnt); 7580 7581 iov = &bio->fused_iovs[bio->fused_iovpos]; 7582 7583 *address = iov->iov_base; 7584 *length = iov->iov_len; 7585 7586 if (bio->fused_iov_offset) { 7587 assert(bio->fused_iov_offset <= iov->iov_len); 7588 *address += bio->fused_iov_offset; 7589 *length -= bio->fused_iov_offset; 7590 } 7591 7592 bio->fused_iov_offset += *length; 7593 if (bio->fused_iov_offset == iov->iov_len) { 7594 bio->fused_iovpos++; 7595 bio->fused_iov_offset = 0; 7596 } 7597 7598 return 0; 7599 } 7600 7601 static int 7602 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7603 void *md, uint64_t lba_count, uint64_t lba) 7604 { 7605 int rc; 7606 7607 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7608 lba_count, lba); 7609 7610 bio->iovs = iov; 7611 bio->iovcnt = iovcnt; 7612 bio->iovpos = 0; 7613 bio->iov_offset = 0; 7614 7615 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7616 bio->io_path->qpair->qpair, 7617 lba, lba_count, 7618 bdev_nvme_no_pi_readv_done, bio, 0, 7619 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7620 md, 0, 0); 7621 7622 if (rc != 0 && rc != -ENOMEM) { 7623 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7624 } 7625 return rc; 7626 } 7627 7628 static int 7629 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7630 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7631 struct spdk_memory_domain *domain, void *domain_ctx, 7632 struct spdk_accel_sequence *seq) 7633 { 7634 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7635 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7636 int rc; 7637 7638 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7639 lba_count, lba); 7640 7641 bio->iovs = iov; 7642 bio->iovcnt = iovcnt; 7643 bio->iovpos = 0; 7644 bio->iov_offset = 0; 7645 7646 if (domain != NULL || seq != NULL) { 7647 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7648 bio->ext_opts.memory_domain = domain; 7649 bio->ext_opts.memory_domain_ctx = domain_ctx; 7650 bio->ext_opts.io_flags = flags; 7651 bio->ext_opts.metadata = md; 7652 bio->ext_opts.accel_sequence = seq; 7653 7654 if (iovcnt == 1) { 7655 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7656 bio, &bio->ext_opts); 7657 } else { 7658 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7659 bdev_nvme_readv_done, bio, 7660 bdev_nvme_queued_reset_sgl, 7661 bdev_nvme_queued_next_sge, 7662 &bio->ext_opts); 7663 } 7664 } else if (iovcnt == 1) { 7665 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7666 md, lba, lba_count, bdev_nvme_readv_done, 7667 bio, flags, 0, 0); 7668 } else { 7669 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7670 bdev_nvme_readv_done, bio, flags, 7671 bdev_nvme_queued_reset_sgl, 7672 bdev_nvme_queued_next_sge, md, 0, 0); 7673 } 7674 7675 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7676 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7677 } 7678 
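/* -ENOMEM is not logged above: it indicates the qpair temporarily ran out of free
 * requests, and the caller retries the I/O instead of failing it.
 */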
return rc; 7679 } 7680 7681 static int 7682 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7683 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7684 struct spdk_memory_domain *domain, void *domain_ctx, 7685 struct spdk_accel_sequence *seq) 7686 { 7687 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7688 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7689 int rc; 7690 7691 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7692 lba_count, lba); 7693 7694 bio->iovs = iov; 7695 bio->iovcnt = iovcnt; 7696 bio->iovpos = 0; 7697 bio->iov_offset = 0; 7698 7699 if (domain != NULL || seq != NULL) { 7700 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7701 bio->ext_opts.memory_domain = domain; 7702 bio->ext_opts.memory_domain_ctx = domain_ctx; 7703 bio->ext_opts.io_flags = flags; 7704 bio->ext_opts.metadata = md; 7705 bio->ext_opts.accel_sequence = seq; 7706 7707 if (iovcnt == 1) { 7708 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 7709 bio, &bio->ext_opts); 7710 } else { 7711 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 7712 bdev_nvme_writev_done, bio, 7713 bdev_nvme_queued_reset_sgl, 7714 bdev_nvme_queued_next_sge, 7715 &bio->ext_opts); 7716 } 7717 } else if (iovcnt == 1) { 7718 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 7719 md, lba, lba_count, bdev_nvme_writev_done, 7720 bio, flags, 0, 0); 7721 } else { 7722 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7723 bdev_nvme_writev_done, bio, flags, 7724 bdev_nvme_queued_reset_sgl, 7725 bdev_nvme_queued_next_sge, md, 0, 0); 7726 } 7727 7728 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7729 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 7730 } 7731 return rc; 7732 } 7733 7734 static int 7735 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7736 void *md, uint64_t lba_count, uint64_t zslba, 7737 uint32_t flags) 7738 { 7739 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7740 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7741 int rc; 7742 7743 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 7744 lba_count, zslba); 7745 7746 bio->iovs = iov; 7747 bio->iovcnt = iovcnt; 7748 bio->iovpos = 0; 7749 bio->iov_offset = 0; 7750 7751 if (iovcnt == 1) { 7752 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 7753 lba_count, 7754 bdev_nvme_zone_appendv_done, bio, 7755 flags, 7756 0, 0); 7757 } else { 7758 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 7759 bdev_nvme_zone_appendv_done, bio, flags, 7760 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7761 md, 0, 0); 7762 } 7763 7764 if (rc != 0 && rc != -ENOMEM) { 7765 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 7766 } 7767 return rc; 7768 } 7769 7770 static int 7771 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7772 void *md, uint64_t lba_count, uint64_t lba, 7773 uint32_t flags) 7774 { 7775 int rc; 7776 7777 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7778 lba_count, lba); 7779 7780 bio->iovs = iov; 7781 bio->iovcnt = iovcnt; 7782 bio->iovpos = 0; 7783 bio->iov_offset = 0; 7784 7785 rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 7786 bio->io_path->qpair->qpair, 7787 lba, lba_count, 7788 bdev_nvme_comparev_done, bio, flags, 7789 bdev_nvme_queued_reset_sgl, 
bdev_nvme_queued_next_sge, 7790 md, 0, 0); 7791 7792 if (rc != 0 && rc != -ENOMEM) { 7793 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 7794 } 7795 return rc; 7796 } 7797 7798 static int 7799 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 7800 struct iovec *write_iov, int write_iovcnt, 7801 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 7802 { 7803 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7804 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7805 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7806 int rc; 7807 7808 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7809 lba_count, lba); 7810 7811 bio->iovs = cmp_iov; 7812 bio->iovcnt = cmp_iovcnt; 7813 bio->iovpos = 0; 7814 bio->iov_offset = 0; 7815 bio->fused_iovs = write_iov; 7816 bio->fused_iovcnt = write_iovcnt; 7817 bio->fused_iovpos = 0; 7818 bio->fused_iov_offset = 0; 7819 7820 if (bdev_io->num_retries == 0) { 7821 bio->first_fused_submitted = false; 7822 bio->first_fused_completed = false; 7823 } 7824 7825 if (!bio->first_fused_submitted) { 7826 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7827 memset(&bio->cpl, 0, sizeof(bio->cpl)); 7828 7829 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 7830 bdev_nvme_comparev_and_writev_done, bio, flags, 7831 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 7832 if (rc == 0) { 7833 bio->first_fused_submitted = true; 7834 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 7835 } else { 7836 if (rc != -ENOMEM) { 7837 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 7838 } 7839 return rc; 7840 } 7841 } 7842 7843 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 7844 7845 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 7846 bdev_nvme_comparev_and_writev_done, bio, flags, 7847 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 7848 if (rc != 0 && rc != -ENOMEM) { 7849 SPDK_ERRLOG("write failed: rc = %d\n", rc); 7850 rc = 0; 7851 } 7852 7853 return rc; 7854 } 7855 7856 static int 7857 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7858 { 7859 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 7860 struct spdk_nvme_dsm_range *range; 7861 uint64_t offset, remaining; 7862 uint64_t num_ranges_u64; 7863 uint16_t num_ranges; 7864 int rc; 7865 7866 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 7867 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7868 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 7869 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 7870 return -EINVAL; 7871 } 7872 num_ranges = (uint16_t)num_ranges_u64; 7873 7874 offset = offset_blocks; 7875 remaining = num_blocks; 7876 range = &dsm_ranges[0]; 7877 7878 /* Fill max-size ranges until the remaining blocks fit into one range */ 7879 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 7880 range->attributes.raw = 0; 7881 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7882 range->starting_lba = offset; 7883 7884 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7885 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 7886 range++; 7887 } 7888 7889 /* Final range describes the remaining blocks */ 7890 range->attributes.raw = 0; 7891 range->length = remaining; 7892 range->starting_lba = offset; 7893 7894 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 7895 
bio->io_path->qpair->qpair, 7896 SPDK_NVME_DSM_ATTR_DEALLOCATE, 7897 dsm_ranges, num_ranges, 7898 bdev_nvme_queued_done, bio); 7899 7900 return rc; 7901 } 7902 7903 static int 7904 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 7905 { 7906 if (num_blocks > UINT16_MAX + 1) { 7907 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 7908 return -EINVAL; 7909 } 7910 7911 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 7912 bio->io_path->qpair->qpair, 7913 offset_blocks, num_blocks, 7914 bdev_nvme_queued_done, bio, 7915 0); 7916 } 7917 7918 static int 7919 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 7920 struct spdk_bdev_zone_info *info) 7921 { 7922 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7923 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7924 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7925 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7926 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 7927 7928 if (zone_id % zone_size != 0) { 7929 return -EINVAL; 7930 } 7931 7932 if (num_zones > total_zones || !num_zones) { 7933 return -EINVAL; 7934 } 7935 7936 assert(!bio->zone_report_buf); 7937 bio->zone_report_buf = calloc(1, zone_report_bufsize); 7938 if (!bio->zone_report_buf) { 7939 return -ENOMEM; 7940 } 7941 7942 bio->handled_zones = 0; 7943 7944 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 7945 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 7946 bdev_nvme_get_zone_info_done, bio); 7947 } 7948 7949 static int 7950 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 7951 enum spdk_bdev_zone_action action) 7952 { 7953 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7954 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7955 7956 switch (action) { 7957 case SPDK_BDEV_ZONE_CLOSE: 7958 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 7959 bdev_nvme_zone_management_done, bio); 7960 case SPDK_BDEV_ZONE_FINISH: 7961 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 7962 bdev_nvme_zone_management_done, bio); 7963 case SPDK_BDEV_ZONE_OPEN: 7964 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 7965 bdev_nvme_zone_management_done, bio); 7966 case SPDK_BDEV_ZONE_RESET: 7967 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 7968 bdev_nvme_zone_management_done, bio); 7969 case SPDK_BDEV_ZONE_OFFLINE: 7970 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 7971 bdev_nvme_zone_management_done, bio); 7972 default: 7973 return -EINVAL; 7974 } 7975 } 7976 7977 static void 7978 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 7979 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 7980 { 7981 struct nvme_io_path *io_path; 7982 struct nvme_ctrlr *nvme_ctrlr; 7983 uint32_t max_xfer_size; 7984 int rc = -ENXIO; 7985 7986 /* Choose the first ctrlr which is not failed. */ 7987 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 7988 nvme_ctrlr = io_path->qpair->ctrlr; 7989 7990 /* We should skip any unavailable nvme_ctrlr rather than checking 7991 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
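 * If every io_path is skipped because its controller is unavailable, rc keeps
 * its initial -ENXIO value and the request is failed at the err label below.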
7992 */ 7993 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 7994 continue; 7995 } 7996 7997 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 7998 7999 if (nbytes > max_xfer_size) { 8000 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8001 rc = -EINVAL; 8002 goto err; 8003 } 8004 8005 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8006 bdev_nvme_admin_passthru_done, bio); 8007 if (rc == 0) { 8008 return; 8009 } 8010 } 8011 8012 err: 8013 bdev_nvme_admin_complete(bio, rc); 8014 } 8015 8016 static int 8017 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8018 void *buf, size_t nbytes) 8019 { 8020 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8021 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8022 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8023 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8024 8025 if (nbytes > max_xfer_size) { 8026 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8027 return -EINVAL; 8028 } 8029 8030 /* 8031 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8032 * so fill it out automatically. 8033 */ 8034 cmd->nsid = spdk_nvme_ns_get_id(ns); 8035 8036 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8037 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8038 } 8039 8040 static int 8041 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8042 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8043 { 8044 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8045 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8046 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8047 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8048 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8049 8050 if (nbytes > max_xfer_size) { 8051 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8052 return -EINVAL; 8053 } 8054 8055 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8056 SPDK_ERRLOG("invalid meta data buffer size\n"); 8057 return -EINVAL; 8058 } 8059 8060 /* 8061 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8062 * so fill it out automatically. 
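 * Any nsid supplied by the caller is overwritten with this bdev's namespace ID.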
8063 */ 8064 cmd->nsid = spdk_nvme_ns_get_id(ns); 8065 8066 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8067 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8068 } 8069 8070 static int 8071 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8072 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8073 size_t nbytes, void *md_buf, size_t md_len) 8074 { 8075 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8076 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8077 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8078 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8079 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8080 8081 bio->iovs = iov; 8082 bio->iovcnt = iovcnt; 8083 bio->iovpos = 0; 8084 bio->iov_offset = 0; 8085 8086 if (nbytes > max_xfer_size) { 8087 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8088 return -EINVAL; 8089 } 8090 8091 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8092 SPDK_ERRLOG("invalid meta data buffer size\n"); 8093 return -EINVAL; 8094 } 8095 8096 /* 8097 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8098 * require a nsid, so fill it out automatically. 8099 */ 8100 cmd->nsid = spdk_nvme_ns_get_id(ns); 8101 8102 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8103 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8104 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8105 } 8106 8107 static void 8108 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8109 struct nvme_bdev_io *bio_to_abort) 8110 { 8111 struct nvme_io_path *io_path; 8112 int rc = 0; 8113 8114 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8115 if (rc == 0) { 8116 bdev_nvme_admin_complete(bio, 0); 8117 return; 8118 } 8119 8120 io_path = bio_to_abort->io_path; 8121 if (io_path != NULL) { 8122 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8123 io_path->qpair->qpair, 8124 bio_to_abort, 8125 bdev_nvme_abort_done, bio); 8126 } else { 8127 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8128 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8129 NULL, 8130 bio_to_abort, 8131 bdev_nvme_abort_done, bio); 8132 8133 if (rc != -ENOENT) { 8134 break; 8135 } 8136 } 8137 } 8138 8139 if (rc != 0) { 8140 /* If no command was found or there was any error, complete the abort 8141 * request with failure. 
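 * A return of -ENOENT means the command to abort was not found on the path(s)
 * tried above; any other nonzero value is a submission error.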
8142 */ 8143 bdev_nvme_admin_complete(bio, rc); 8144 } 8145 } 8146 8147 static int 8148 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8149 uint64_t num_blocks) 8150 { 8151 struct spdk_nvme_scc_source_range range = { 8152 .slba = src_offset_blocks, 8153 .nlb = num_blocks - 1 8154 }; 8155 8156 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8157 bio->io_path->qpair->qpair, 8158 &range, 1, dst_offset_blocks, 8159 bdev_nvme_queued_done, bio); 8160 } 8161 8162 static void 8163 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8164 { 8165 const char *action; 8166 uint32_t i; 8167 8168 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8169 action = "reset"; 8170 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8171 action = "abort"; 8172 } else { 8173 action = "none"; 8174 } 8175 8176 spdk_json_write_object_begin(w); 8177 8178 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8179 8180 spdk_json_write_named_object_begin(w, "params"); 8181 spdk_json_write_named_string(w, "action_on_timeout", action); 8182 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8183 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8184 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8185 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8186 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8187 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8188 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8189 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8190 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8191 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8192 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8193 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8194 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8195 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8196 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8197 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8198 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8199 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8200 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8201 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8202 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8203 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8204 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8205 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8206 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8207 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8208 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8209 for (i = 0; i < 32; ++i) { 8210 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8211 
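/* Each set bit in the dhchap_digests mask selects one enabled DH-HMAC-CHAP
 * digest; map the bit index back to its name for the JSON output. */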
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8212 } 8213 } 8214 spdk_json_write_array_end(w); 8215 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8216 for (i = 0; i < 32; ++i) { 8217 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8218 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8219 } 8220 } 8221 8222 spdk_json_write_array_end(w); 8223 spdk_json_write_object_end(w); 8224 8225 spdk_json_write_object_end(w); 8226 } 8227 8228 static void 8229 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8230 { 8231 struct spdk_nvme_transport_id trid; 8232 8233 spdk_json_write_object_begin(w); 8234 8235 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8236 8237 spdk_json_write_named_object_begin(w, "params"); 8238 spdk_json_write_named_string(w, "name", ctx->name); 8239 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8240 8241 trid = ctx->trid; 8242 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8243 nvme_bdev_dump_trid_json(&trid, w); 8244 8245 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8246 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8247 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8248 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8249 ctx->bdev_opts.fast_io_fail_timeout_sec); 8250 spdk_json_write_object_end(w); 8251 8252 spdk_json_write_object_end(w); 8253 } 8254 8255 #ifdef SPDK_CONFIG_NVME_CUSE 8256 static void 8257 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8258 struct nvme_ctrlr *nvme_ctrlr) 8259 { 8260 size_t cuse_name_size = 128; 8261 char cuse_name[cuse_name_size]; 8262 8263 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8264 cuse_name, &cuse_name_size) != 0) { 8265 return; 8266 } 8267 8268 spdk_json_write_object_begin(w); 8269 8270 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8271 8272 spdk_json_write_named_object_begin(w, "params"); 8273 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8274 spdk_json_write_object_end(w); 8275 8276 spdk_json_write_object_end(w); 8277 } 8278 #endif 8279 8280 static void 8281 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8282 struct nvme_ctrlr *nvme_ctrlr) 8283 { 8284 struct spdk_nvme_transport_id *trid; 8285 const struct spdk_nvme_ctrlr_opts *opts; 8286 8287 if (nvme_ctrlr->opts.from_discovery_service) { 8288 /* Do not emit an RPC for this - it will be implicitly 8289 * covered by a separate bdev_nvme_start_discovery or 8290 * bdev_nvme_start_mdns_discovery RPC. 
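 * Emitting an explicit attach RPC here as well could duplicate the controller
 * when the saved configuration is replayed.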
8291 */ 8292 return; 8293 } 8294 8295 trid = &nvme_ctrlr->active_path_id->trid; 8296 8297 spdk_json_write_object_begin(w); 8298 8299 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8300 8301 spdk_json_write_named_object_begin(w, "params"); 8302 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8303 nvme_bdev_dump_trid_json(trid, w); 8304 spdk_json_write_named_bool(w, "prchk_reftag", 8305 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8306 spdk_json_write_named_bool(w, "prchk_guard", 8307 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8308 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8309 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8310 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8311 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8312 if (nvme_ctrlr->psk != NULL) { 8313 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8314 } else if (nvme_ctrlr->opts.psk[0] != '\0') { 8315 spdk_json_write_named_string(w, "psk", nvme_ctrlr->opts.psk); 8316 } 8317 8318 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8319 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8320 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8321 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8322 if (opts->src_addr[0] != '\0') { 8323 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8324 } 8325 if (opts->src_svcid[0] != '\0') { 8326 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8327 } 8328 8329 spdk_json_write_object_end(w); 8330 8331 spdk_json_write_object_end(w); 8332 } 8333 8334 static void 8335 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8336 { 8337 spdk_json_write_object_begin(w); 8338 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8339 8340 spdk_json_write_named_object_begin(w, "params"); 8341 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8342 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8343 spdk_json_write_object_end(w); 8344 8345 spdk_json_write_object_end(w); 8346 } 8347 8348 static int 8349 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8350 { 8351 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8352 struct nvme_ctrlr *nvme_ctrlr; 8353 struct discovery_ctx *ctx; 8354 8355 bdev_nvme_opts_config_json(w); 8356 8357 pthread_mutex_lock(&g_bdev_nvme_mutex); 8358 8359 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8360 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8361 nvme_ctrlr_config_json(w, nvme_ctrlr); 8362 8363 #ifdef SPDK_CONFIG_NVME_CUSE 8364 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8365 #endif 8366 } 8367 } 8368 8369 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8370 if (!ctx->from_mdns_discovery_service) { 8371 bdev_nvme_discovery_config_json(w, ctx); 8372 } 8373 } 8374 8375 bdev_nvme_mdns_discovery_config_json(w); 8376 8377 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8378 * before enabling hotplug poller. 
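 * The saved RPCs are replayed in the order written, so enabling hotplug last
 * keeps the poller from racing with the explicit attach RPCs above.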
8379 */ 8380 bdev_nvme_hotplug_config_json(w); 8381 8382 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8383 return 0; 8384 } 8385 8386 struct spdk_nvme_ctrlr * 8387 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8388 { 8389 struct nvme_bdev *nbdev; 8390 struct nvme_ns *nvme_ns; 8391 8392 if (!bdev || bdev->module != &nvme_if) { 8393 return NULL; 8394 } 8395 8396 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8397 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8398 assert(nvme_ns != NULL); 8399 8400 return nvme_ns->ctrlr->ctrlr; 8401 } 8402 8403 void 8404 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 8405 { 8406 struct nvme_ns *nvme_ns = io_path->nvme_ns; 8407 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 8408 const struct spdk_nvme_ctrlr_data *cdata; 8409 const struct spdk_nvme_transport_id *trid; 8410 const char *adrfam_str; 8411 8412 spdk_json_write_object_begin(w); 8413 8414 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 8415 8416 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 8417 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 8418 8419 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 8420 spdk_json_write_named_bool(w, "current", io_path->nbdev_ch != NULL && 8421 io_path == io_path->nbdev_ch->current_io_path); 8422 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 8423 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 8424 8425 spdk_json_write_named_object_begin(w, "transport"); 8426 spdk_json_write_named_string(w, "trtype", trid->trstring); 8427 spdk_json_write_named_string(w, "traddr", trid->traddr); 8428 if (trid->trsvcid[0] != '\0') { 8429 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 8430 } 8431 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 8432 if (adrfam_str) { 8433 spdk_json_write_named_string(w, "adrfam", adrfam_str); 8434 } 8435 spdk_json_write_object_end(w); 8436 8437 spdk_json_write_object_end(w); 8438 } 8439 8440 void 8441 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 8442 { 8443 struct discovery_ctx *ctx; 8444 struct discovery_entry_ctx *entry_ctx; 8445 8446 spdk_json_write_array_begin(w); 8447 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8448 spdk_json_write_object_begin(w); 8449 spdk_json_write_named_string(w, "name", ctx->name); 8450 8451 spdk_json_write_named_object_begin(w, "trid"); 8452 nvme_bdev_dump_trid_json(&ctx->trid, w); 8453 spdk_json_write_object_end(w); 8454 8455 spdk_json_write_named_array_begin(w, "referrals"); 8456 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 8457 spdk_json_write_object_begin(w); 8458 spdk_json_write_named_object_begin(w, "trid"); 8459 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 8460 spdk_json_write_object_end(w); 8461 spdk_json_write_object_end(w); 8462 } 8463 spdk_json_write_array_end(w); 8464 8465 spdk_json_write_object_end(w); 8466 } 8467 spdk_json_write_array_end(w); 8468 } 8469 8470 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 8471 8472 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 8473 { 8474 struct spdk_trace_tpoint_opts opts[] = { 8475 { 8476 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 8477 OWNER_NONE, OBJECT_BDEV_NVME_IO, 1, 8478 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8479 }, 8480 { 8481 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 8482 OWNER_NONE, OBJECT_BDEV_NVME_IO, 0, 8483 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8484 } 8485 }; 8486 8487 8488 
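/* Register the bdev_nvme I/O trace object and its start/done trace points,
 * then relate the NVMe transport submit/complete trace points to the same
 * object so a single I/O can be followed from the bdev layer into the driver.
 */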
spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 8489 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8490 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8491 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 8492 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8493 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 8494 } 8495