1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 36 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 37 38 #define NSID_STR_LEN 10 39 40 #define SPDK_CONTROLLER_NAME_MAX 512 41 42 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 43 44 struct nvme_bdev_io { 45 /** array of iovecs to transfer. */ 46 struct iovec *iovs; 47 48 /** Number of iovecs in iovs array. */ 49 int iovcnt; 50 51 /** Current iovec position. */ 52 int iovpos; 53 54 /** Offset in current iovec. */ 55 uint32_t iov_offset; 56 57 /** Offset in current iovec. */ 58 uint32_t fused_iov_offset; 59 60 /** array of iovecs to transfer. */ 61 struct iovec *fused_iovs; 62 63 /** Number of iovecs in iovs array. */ 64 int fused_iovcnt; 65 66 /** Current iovec position. */ 67 int fused_iovpos; 68 69 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 70 * being reset in a reset I/O. 71 */ 72 struct nvme_io_path *io_path; 73 74 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 75 struct spdk_nvme_cpl cpl; 76 77 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 78 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 79 80 /** Keeps track if first of fused commands was submitted */ 81 bool first_fused_submitted; 82 83 /** Keeps track if first of fused commands was completed */ 84 bool first_fused_completed; 85 86 /* How many times the current I/O was retried. */ 87 int32_t retry_count; 88 89 /** Expiration value in ticks to retry the current I/O. */ 90 uint64_t retry_ticks; 91 92 /** Temporary pointer to zone report buffer */ 93 struct spdk_nvme_zns_zone_report *zone_report_buf; 94 95 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 96 uint64_t handled_zones; 97 98 /* Current tsc at submit time. 
*/ 99 uint64_t submit_tsc; 100 101 /* Used to put nvme_bdev_io into the list */ 102 TAILQ_ENTRY(nvme_bdev_io) retry_link; 103 }; 104 105 struct nvme_probe_skip_entry { 106 struct spdk_nvme_transport_id trid; 107 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 108 }; 109 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 110 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 111 g_skipped_nvme_ctrlrs); 112 113 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 114 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 115 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 116 117 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 118 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 119 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 120 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 121 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 122 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 123 124 static struct spdk_bdev_nvme_opts g_opts = { 125 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 126 .timeout_us = 0, 127 .timeout_admin_us = 0, 128 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 129 .transport_retry_count = 4, 130 .arbitration_burst = 0, 131 .low_priority_weight = 0, 132 .medium_priority_weight = 0, 133 .high_priority_weight = 0, 134 .nvme_adminq_poll_period_us = 10000ULL, 135 .nvme_ioq_poll_period_us = 0, 136 .io_queue_requests = 0, 137 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 138 .bdev_retry_count = 3, 139 .transport_ack_timeout = 0, 140 .ctrlr_loss_timeout_sec = 0, 141 .reconnect_delay_sec = 0, 142 .fast_io_fail_timeout_sec = 0, 143 .disable_auto_failback = false, 144 .generate_uuids = false, 145 .transport_tos = 0, 146 .nvme_error_stat = false, 147 .io_path_stat = false, 148 .allow_accel_sequence = false, 149 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 150 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 151 }; 152 153 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 154 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 155 156 static int g_hot_insert_nvme_controller_index = 0; 157 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 158 static bool g_nvme_hotplug_enabled = false; 159 struct spdk_thread *g_bdev_nvme_init_thread; 160 static struct spdk_poller *g_hotplug_poller; 161 static struct spdk_poller *g_hotplug_probe_poller; 162 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 163 164 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 165 struct nvme_async_probe_ctx *ctx); 166 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 167 struct nvme_async_probe_ctx *ctx); 168 static int bdev_nvme_library_init(void); 169 static void bdev_nvme_library_fini(void); 170 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 171 struct spdk_bdev_io *bdev_io); 172 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 173 struct spdk_bdev_io *bdev_io); 174 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 175 void *md, uint64_t lba_count, uint64_t lba, 176 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 177 struct spdk_accel_sequence *seq); 178 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 179 void *md, uint64_t lba_count, uint64_t lba); 180 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 181 void *md, uint64_t 
lba_count, uint64_t lba, 182 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 183 struct spdk_accel_sequence *seq, 184 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13); 185 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 186 void *md, uint64_t lba_count, 187 uint64_t zslba, uint32_t flags); 188 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 189 void *md, uint64_t lba_count, uint64_t lba, 190 uint32_t flags); 191 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 192 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 193 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 194 uint32_t flags); 195 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 196 uint32_t num_zones, struct spdk_bdev_zone_info *info); 197 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 198 enum spdk_bdev_zone_action action); 199 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 200 struct nvme_bdev_io *bio, 201 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 202 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 203 void *buf, size_t nbytes); 204 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 205 void *buf, size_t nbytes, void *md_buf, size_t md_len); 206 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 207 struct iovec *iov, int iovcnt, size_t nbytes, 208 void *md_buf, size_t md_len); 209 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 210 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 211 static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio); 212 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 213 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 214 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 215 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 216 217 static struct nvme_ns *nvme_ns_alloc(void); 218 static void nvme_ns_free(struct nvme_ns *ns); 219 220 static int 221 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 222 { 223 return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; 224 } 225 226 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 227 228 struct spdk_nvme_qpair * 229 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 230 { 231 struct nvme_ctrlr_channel *ctrlr_ch; 232 233 assert(ctrlr_io_ch != NULL); 234 235 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 236 237 return ctrlr_ch->qpair->qpair; 238 } 239 240 static int 241 bdev_nvme_get_ctx_size(void) 242 { 243 return sizeof(struct nvme_bdev_io); 244 } 245 246 static struct spdk_bdev_module nvme_if = { 247 .name = "nvme", 248 .async_fini = true, 249 .module_init = bdev_nvme_library_init, 250 .module_fini = bdev_nvme_library_fini, 251 .config_json = bdev_nvme_config_json, 252 .get_ctx_size = bdev_nvme_get_ctx_size, 253 254 }; 255 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 256 257 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 258 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 259 bool g_bdev_nvme_module_finish; 260 261 struct nvme_bdev_ctrlr * 262 nvme_bdev_ctrlr_get_by_name(const char *name) 263 { 264 struct nvme_bdev_ctrlr *nbdev_ctrlr; 265 266 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 267 if (strcmp(name, nbdev_ctrlr->name) == 0) { 268 break; 269 } 270 } 271 272 return nbdev_ctrlr; 273 } 274 275 static struct nvme_ctrlr * 276 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 277 const struct spdk_nvme_transport_id *trid, const char *hostnqn) 278 { 279 const struct spdk_nvme_ctrlr_opts *opts; 280 struct nvme_ctrlr *nvme_ctrlr; 281 282 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 283 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 284 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 && 285 strcmp(hostnqn, opts->hostnqn) == 0) { 286 break; 287 } 288 } 289 290 return nvme_ctrlr; 291 } 292 293 struct nvme_ctrlr * 294 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 295 uint16_t cntlid) 296 { 297 struct nvme_ctrlr *nvme_ctrlr; 298 const struct spdk_nvme_ctrlr_data *cdata; 299 300 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 301 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 302 if (cdata->cntlid == cntlid) { 303 break; 304 } 305 } 306 307 return nvme_ctrlr; 308 } 309 310 static struct nvme_bdev * 311 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 312 { 313 struct nvme_bdev *bdev; 314 315 pthread_mutex_lock(&g_bdev_nvme_mutex); 316 TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) { 317 if (bdev->nsid == nsid) { 318 break; 319 } 320 } 321 pthread_mutex_unlock(&g_bdev_nvme_mutex); 322 323 return bdev; 324 } 325 326 struct nvme_ns * 327 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 328 { 329 struct nvme_ns ns; 330 331 assert(nsid > 0); 332 333 ns.id = nsid; 334 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 335 } 336 337 struct nvme_ns * 338 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 339 { 340 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 341 } 342 343 struct nvme_ns * 344 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 345 { 346 if (ns == NULL) { 347 return NULL; 348 } 349 350 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 351 } 352 353 static struct nvme_ctrlr * 354 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn) 355 { 356 struct nvme_bdev_ctrlr *nbdev_ctrlr; 357 struct nvme_ctrlr *nvme_ctrlr = NULL; 358 359 
pthread_mutex_lock(&g_bdev_nvme_mutex); 360 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 361 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn); 362 if (nvme_ctrlr != NULL) { 363 break; 364 } 365 } 366 pthread_mutex_unlock(&g_bdev_nvme_mutex); 367 368 return nvme_ctrlr; 369 } 370 371 struct nvme_ctrlr * 372 nvme_ctrlr_get_by_name(const char *name) 373 { 374 struct nvme_bdev_ctrlr *nbdev_ctrlr; 375 struct nvme_ctrlr *nvme_ctrlr = NULL; 376 377 if (name == NULL) { 378 return NULL; 379 } 380 381 pthread_mutex_lock(&g_bdev_nvme_mutex); 382 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 383 if (nbdev_ctrlr != NULL) { 384 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 385 } 386 pthread_mutex_unlock(&g_bdev_nvme_mutex); 387 388 return nvme_ctrlr; 389 } 390 391 void 392 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 393 { 394 struct nvme_bdev_ctrlr *nbdev_ctrlr; 395 396 pthread_mutex_lock(&g_bdev_nvme_mutex); 397 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 398 fn(nbdev_ctrlr, ctx); 399 } 400 pthread_mutex_unlock(&g_bdev_nvme_mutex); 401 } 402 403 struct nvme_ctrlr_channel_iter { 404 nvme_ctrlr_for_each_channel_msg fn; 405 nvme_ctrlr_for_each_channel_done cpl; 406 struct spdk_io_channel_iter *i; 407 void *ctx; 408 }; 409 410 void 411 nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status) 412 { 413 spdk_for_each_channel_continue(iter->i, status); 414 } 415 416 static void 417 nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i) 418 { 419 struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 420 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 421 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 422 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 423 424 iter->i = i; 425 iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx); 426 } 427 428 static void 429 nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 430 { 431 struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 432 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 433 434 iter->i = i; 435 iter->cpl(nvme_ctrlr, iter->ctx, status); 436 437 free(iter); 438 } 439 440 void 441 nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr, 442 nvme_ctrlr_for_each_channel_msg fn, void *ctx, 443 nvme_ctrlr_for_each_channel_done cpl) 444 { 445 struct nvme_ctrlr_channel_iter *iter; 446 447 assert(nvme_ctrlr != NULL && fn != NULL); 448 449 iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter)); 450 if (iter == NULL) { 451 SPDK_ERRLOG("Unable to allocate iterator\n"); 452 assert(false); 453 return; 454 } 455 456 iter->fn = fn; 457 iter->cpl = cpl; 458 iter->ctx = ctx; 459 460 spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg, 461 iter, nvme_ctrlr_each_channel_cpl); 462 } 463 464 struct nvme_bdev_channel_iter { 465 nvme_bdev_for_each_channel_msg fn; 466 nvme_bdev_for_each_channel_done cpl; 467 struct spdk_io_channel_iter *i; 468 void *ctx; 469 }; 470 471 void 472 nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status) 473 { 474 spdk_for_each_channel_continue(iter->i, status); 475 } 476 477 static void 478 nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i) 479 { 480 struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 481 struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); 482 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 483 struct 
nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 484 485 iter->i = i; 486 iter->fn(iter, nbdev, nbdev_ch, iter->ctx); 487 } 488 489 static void 490 nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 491 { 492 struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 493 struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); 494 495 iter->i = i; 496 iter->cpl(nbdev, iter->ctx, status); 497 498 free(iter); 499 } 500 501 void 502 nvme_bdev_for_each_channel(struct nvme_bdev *nbdev, 503 nvme_bdev_for_each_channel_msg fn, void *ctx, 504 nvme_bdev_for_each_channel_done cpl) 505 { 506 struct nvme_bdev_channel_iter *iter; 507 508 assert(nbdev != NULL && fn != NULL); 509 510 iter = calloc(1, sizeof(struct nvme_bdev_channel_iter)); 511 if (iter == NULL) { 512 SPDK_ERRLOG("Unable to allocate iterator\n"); 513 assert(false); 514 return; 515 } 516 517 iter->fn = fn; 518 iter->cpl = cpl; 519 iter->ctx = ctx; 520 521 spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter, 522 nvme_bdev_each_channel_cpl); 523 } 524 525 void 526 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 527 { 528 const char *trtype_str; 529 const char *adrfam_str; 530 531 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 532 if (trtype_str) { 533 spdk_json_write_named_string(w, "trtype", trtype_str); 534 } 535 536 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 537 if (adrfam_str) { 538 spdk_json_write_named_string(w, "adrfam", adrfam_str); 539 } 540 541 if (trid->traddr[0] != '\0') { 542 spdk_json_write_named_string(w, "traddr", trid->traddr); 543 } 544 545 if (trid->trsvcid[0] != '\0') { 546 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 547 } 548 549 if (trid->subnqn[0] != '\0') { 550 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 551 } 552 } 553 554 static void 555 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 556 struct nvme_ctrlr *nvme_ctrlr) 557 { 558 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 559 pthread_mutex_lock(&g_bdev_nvme_mutex); 560 561 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 562 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 563 pthread_mutex_unlock(&g_bdev_nvme_mutex); 564 565 return; 566 } 567 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 568 569 pthread_mutex_unlock(&g_bdev_nvme_mutex); 570 571 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 572 573 free(nbdev_ctrlr->name); 574 free(nbdev_ctrlr); 575 } 576 577 static void 578 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 579 { 580 struct nvme_path_id *path_id, *tmp_path; 581 struct nvme_ns *ns, *tmp_ns; 582 583 free(nvme_ctrlr->copied_ana_desc); 584 spdk_free(nvme_ctrlr->ana_log_page); 585 586 if (nvme_ctrlr->opal_dev) { 587 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 588 nvme_ctrlr->opal_dev = NULL; 589 } 590 591 if (nvme_ctrlr->nbdev_ctrlr) { 592 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 593 } 594 595 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 596 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 597 nvme_ns_free(ns); 598 } 599 600 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 601 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 602 free(path_id); 603 } 604 605 pthread_mutex_destroy(&nvme_ctrlr->mutex); 606 spdk_keyring_put_key(nvme_ctrlr->psk); 607 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 608 spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key); 609 free(nvme_ctrlr); 610 
611 pthread_mutex_lock(&g_bdev_nvme_mutex); 612 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 613 pthread_mutex_unlock(&g_bdev_nvme_mutex); 614 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 615 spdk_bdev_module_fini_done(); 616 return; 617 } 618 pthread_mutex_unlock(&g_bdev_nvme_mutex); 619 } 620 621 static int 622 nvme_detach_poller(void *arg) 623 { 624 struct nvme_ctrlr *nvme_ctrlr = arg; 625 int rc; 626 627 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 628 if (rc != -EAGAIN) { 629 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 630 _nvme_ctrlr_delete(nvme_ctrlr); 631 } 632 633 return SPDK_POLLER_BUSY; 634 } 635 636 static void 637 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 638 { 639 int rc; 640 641 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 642 643 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 644 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 645 646 /* If we got here, the reset/detach poller cannot be active */ 647 assert(nvme_ctrlr->reset_detach_poller == NULL); 648 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 649 nvme_ctrlr, 1000); 650 if (nvme_ctrlr->reset_detach_poller == NULL) { 651 SPDK_ERRLOG("Failed to register detach poller\n"); 652 goto error; 653 } 654 655 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 656 if (rc != 0) { 657 SPDK_ERRLOG("Failed to detach the NVMe controller\n"); 658 goto error; 659 } 660 661 return; 662 error: 663 /* We don't have a good way to handle errors here, so just do what we can and delete the 664 * controller without detaching the underlying NVMe device. 665 */ 666 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 667 _nvme_ctrlr_delete(nvme_ctrlr); 668 } 669 670 static void 671 nvme_ctrlr_unregister_cb(void *io_device) 672 { 673 struct nvme_ctrlr *nvme_ctrlr = io_device; 674 675 nvme_ctrlr_delete(nvme_ctrlr); 676 } 677 678 static void 679 nvme_ctrlr_unregister(void *ctx) 680 { 681 struct nvme_ctrlr *nvme_ctrlr = ctx; 682 683 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 684 } 685 686 static bool 687 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 688 { 689 if (!nvme_ctrlr->destruct) { 690 return false; 691 } 692 693 if (nvme_ctrlr->ref > 0) { 694 return false; 695 } 696 697 if (nvme_ctrlr->resetting) { 698 return false; 699 } 700 701 if (nvme_ctrlr->ana_log_page_updating) { 702 return false; 703 } 704 705 if (nvme_ctrlr->io_path_cache_clearing) { 706 return false; 707 } 708 709 return true; 710 } 711 712 static void 713 nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr) 714 { 715 pthread_mutex_lock(&nvme_ctrlr->mutex); 716 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 717 718 assert(nvme_ctrlr->ref > 0); 719 nvme_ctrlr->ref--; 720 721 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 722 pthread_mutex_unlock(&nvme_ctrlr->mutex); 723 return; 724 } 725 726 pthread_mutex_unlock(&nvme_ctrlr->mutex); 727 728 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 729 } 730 731 static void 732 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 733 { 734 nbdev_ch->current_io_path = NULL; 735 nbdev_ch->rr_counter = 0; 736 } 737 738 static struct nvme_io_path * 739 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 740 { 741 struct nvme_io_path *io_path; 742 743 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 
744 if (io_path->nvme_ns == nvme_ns) { 745 break; 746 } 747 } 748 749 return io_path; 750 } 751 752 static struct nvme_io_path * 753 nvme_io_path_alloc(void) 754 { 755 struct nvme_io_path *io_path; 756 757 io_path = calloc(1, sizeof(*io_path)); 758 if (io_path == NULL) { 759 SPDK_ERRLOG("Failed to alloc io_path.\n"); 760 return NULL; 761 } 762 763 if (g_opts.io_path_stat) { 764 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 765 if (io_path->stat == NULL) { 766 free(io_path); 767 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 768 return NULL; 769 } 770 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 771 } 772 773 return io_path; 774 } 775 776 static void 777 nvme_io_path_free(struct nvme_io_path *io_path) 778 { 779 free(io_path->stat); 780 free(io_path); 781 } 782 783 static int 784 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 785 { 786 struct nvme_io_path *io_path; 787 struct spdk_io_channel *ch; 788 struct nvme_ctrlr_channel *ctrlr_ch; 789 struct nvme_qpair *nvme_qpair; 790 791 io_path = nvme_io_path_alloc(); 792 if (io_path == NULL) { 793 return -ENOMEM; 794 } 795 796 io_path->nvme_ns = nvme_ns; 797 798 ch = spdk_get_io_channel(nvme_ns->ctrlr); 799 if (ch == NULL) { 800 nvme_io_path_free(io_path); 801 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 802 return -ENOMEM; 803 } 804 805 ctrlr_ch = spdk_io_channel_get_ctx(ch); 806 807 nvme_qpair = ctrlr_ch->qpair; 808 assert(nvme_qpair != NULL); 809 810 io_path->qpair = nvme_qpair; 811 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 812 813 io_path->nbdev_ch = nbdev_ch; 814 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 815 816 bdev_nvme_clear_current_io_path(nbdev_ch); 817 818 return 0; 819 } 820 821 static void 822 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 823 struct nvme_io_path *io_path) 824 { 825 struct nvme_bdev_io *bio; 826 827 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 828 if (bio->io_path == io_path) { 829 bio->io_path = NULL; 830 } 831 } 832 } 833 834 static void 835 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 836 { 837 struct spdk_io_channel *ch; 838 struct nvme_qpair *nvme_qpair; 839 struct nvme_ctrlr_channel *ctrlr_ch; 840 struct nvme_bdev *nbdev; 841 842 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 843 844 /* Add the statistics to nvme_ns before this path is destroyed. */ 845 pthread_mutex_lock(&nbdev->mutex); 846 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 847 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 848 } 849 pthread_mutex_unlock(&nbdev->mutex); 850 851 bdev_nvme_clear_current_io_path(nbdev_ch); 852 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 853 854 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 855 io_path->nbdev_ch = NULL; 856 857 nvme_qpair = io_path->qpair; 858 assert(nvme_qpair != NULL); 859 860 ctrlr_ch = nvme_qpair->ctrlr_ch; 861 assert(ctrlr_ch != NULL); 862 863 ch = spdk_io_channel_from_ctx(ctrlr_ch); 864 spdk_put_io_channel(ch); 865 866 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 867 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 868 * io_path here but free the io_path when the associated qpair is freed. It is ensured 869 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 
870 */ 871 } 872 873 static void 874 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 875 { 876 struct nvme_io_path *io_path, *tmp_io_path; 877 878 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 879 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 880 } 881 } 882 883 static int 884 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 885 { 886 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 887 struct nvme_bdev *nbdev = io_device; 888 struct nvme_ns *nvme_ns; 889 int rc; 890 891 STAILQ_INIT(&nbdev_ch->io_path_list); 892 TAILQ_INIT(&nbdev_ch->retry_io_list); 893 894 pthread_mutex_lock(&nbdev->mutex); 895 896 nbdev_ch->mp_policy = nbdev->mp_policy; 897 nbdev_ch->mp_selector = nbdev->mp_selector; 898 nbdev_ch->rr_min_io = nbdev->rr_min_io; 899 900 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 901 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 902 if (rc != 0) { 903 pthread_mutex_unlock(&nbdev->mutex); 904 905 _bdev_nvme_delete_io_paths(nbdev_ch); 906 return rc; 907 } 908 } 909 pthread_mutex_unlock(&nbdev->mutex); 910 911 return 0; 912 } 913 914 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 915 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 916 */ 917 static inline void 918 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 919 const struct spdk_nvme_cpl *cpl) 920 { 921 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 922 (uintptr_t)bdev_io); 923 if (cpl) { 924 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 925 } else { 926 spdk_bdev_io_complete(bdev_io, status); 927 } 928 } 929 930 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 931 932 static void 933 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 934 { 935 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 936 937 bdev_nvme_abort_retry_ios(nbdev_ch); 938 _bdev_nvme_delete_io_paths(nbdev_ch); 939 } 940 941 static inline bool 942 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 943 { 944 switch (io_type) { 945 case SPDK_BDEV_IO_TYPE_RESET: 946 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 947 case SPDK_BDEV_IO_TYPE_ABORT: 948 return true; 949 default: 950 break; 951 } 952 953 return false; 954 } 955 956 static inline bool 957 nvme_ns_is_active(struct nvme_ns *nvme_ns) 958 { 959 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 960 return false; 961 } 962 963 if (spdk_unlikely(nvme_ns->ns == NULL)) { 964 return false; 965 } 966 967 return true; 968 } 969 970 static inline bool 971 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 972 { 973 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 974 return false; 975 } 976 977 switch (nvme_ns->ana_state) { 978 case SPDK_NVME_ANA_OPTIMIZED_STATE: 979 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 980 return true; 981 default: 982 break; 983 } 984 985 return false; 986 } 987 988 static inline bool 989 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 990 { 991 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 992 return false; 993 } 994 995 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 996 SPDK_NVME_QPAIR_FAILURE_NONE)) { 997 return false; 998 } 999 1000 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 1001 return false; 1002 } 1003 1004 return true; 1005 } 1006 1007 static inline bool 1008 nvme_io_path_is_available(struct nvme_io_path *io_path) 1009 { 1010 if 
(spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1011 return false; 1012 } 1013 1014 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 1015 return false; 1016 } 1017 1018 return true; 1019 } 1020 1021 static inline bool 1022 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 1023 { 1024 if (nvme_ctrlr->destruct) { 1025 return true; 1026 } 1027 1028 if (nvme_ctrlr->fast_io_fail_timedout) { 1029 return true; 1030 } 1031 1032 if (nvme_ctrlr->resetting) { 1033 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 1034 return false; 1035 } else { 1036 return true; 1037 } 1038 } 1039 1040 if (nvme_ctrlr->reconnect_is_delayed) { 1041 return false; 1042 } 1043 1044 if (nvme_ctrlr->disabled) { 1045 return true; 1046 } 1047 1048 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 1049 return true; 1050 } else { 1051 return false; 1052 } 1053 } 1054 1055 static bool 1056 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 1057 { 1058 if (nvme_ctrlr->destruct) { 1059 return false; 1060 } 1061 1062 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 1063 return false; 1064 } 1065 1066 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 1067 return false; 1068 } 1069 1070 if (nvme_ctrlr->disabled) { 1071 return false; 1072 } 1073 1074 return true; 1075 } 1076 1077 /* Simulate circular linked list. */ 1078 static inline struct nvme_io_path * 1079 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 1080 { 1081 struct nvme_io_path *next_path; 1082 1083 if (prev_path != NULL) { 1084 next_path = STAILQ_NEXT(prev_path, stailq); 1085 if (next_path != NULL) { 1086 return next_path; 1087 } 1088 } 1089 1090 return STAILQ_FIRST(&nbdev_ch->io_path_list); 1091 } 1092 1093 static struct nvme_io_path * 1094 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1095 { 1096 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 1097 1098 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 1099 1100 io_path = start; 1101 do { 1102 if (spdk_likely(nvme_io_path_is_available(io_path))) { 1103 switch (io_path->nvme_ns->ana_state) { 1104 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1105 nbdev_ch->current_io_path = io_path; 1106 return io_path; 1107 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1108 if (non_optimized == NULL) { 1109 non_optimized = io_path; 1110 } 1111 break; 1112 default: 1113 assert(false); 1114 break; 1115 } 1116 } 1117 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 1118 } while (io_path != start); 1119 1120 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 1121 /* We come here only if there is no optimized path. Cache even non_optimized 1122 * path for load balance across multiple non_optimized paths. 1123 */ 1124 nbdev_ch->current_io_path = non_optimized; 1125 } 1126 1127 return non_optimized; 1128 } 1129 1130 static struct nvme_io_path * 1131 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1132 { 1133 struct nvme_io_path *io_path; 1134 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1135 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1136 uint32_t num_outstanding_reqs; 1137 1138 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1139 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1140 /* The device is currently resetting. 
*/ 1141 continue; 1142 } 1143 1144 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1145 continue; 1146 } 1147 1148 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1149 switch (io_path->nvme_ns->ana_state) { 1150 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1151 if (num_outstanding_reqs < opt_min_qd) { 1152 opt_min_qd = num_outstanding_reqs; 1153 optimized = io_path; 1154 } 1155 break; 1156 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1157 if (num_outstanding_reqs < non_opt_min_qd) { 1158 non_opt_min_qd = num_outstanding_reqs; 1159 non_optimized = io_path; 1160 } 1161 break; 1162 default: 1163 break; 1164 } 1165 } 1166 1167 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1168 if (optimized != NULL) { 1169 return optimized; 1170 } 1171 1172 return non_optimized; 1173 } 1174 1175 static inline struct nvme_io_path * 1176 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1177 { 1178 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1179 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1180 return nbdev_ch->current_io_path; 1181 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1182 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1183 return nbdev_ch->current_io_path; 1184 } 1185 nbdev_ch->rr_counter = 0; 1186 } 1187 } 1188 1189 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1190 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1191 return _bdev_nvme_find_io_path(nbdev_ch); 1192 } else { 1193 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1194 } 1195 } 1196 1197 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1198 * or false otherwise. 1199 * 1200 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1201 * is likely to be non-accessible now but may become accessible. 1202 * 1203 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1204 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1205 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1206 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1207 */ 1208 static bool 1209 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1210 { 1211 struct nvme_io_path *io_path; 1212 1213 if (nbdev_ch->resetting) { 1214 return false; 1215 } 1216 1217 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1218 if (io_path->nvme_ns->ana_transition_timedout) { 1219 continue; 1220 } 1221 1222 if (nvme_qpair_is_connected(io_path->qpair) || 1223 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1224 return true; 1225 } 1226 } 1227 1228 return false; 1229 } 1230 1231 static void 1232 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1233 { 1234 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1235 struct spdk_io_channel *ch; 1236 1237 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1238 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1239 } else { 1240 ch = spdk_io_channel_from_ctx(nbdev_ch); 1241 bdev_nvme_submit_request(ch, bdev_io); 1242 } 1243 } 1244 1245 static int 1246 bdev_nvme_retry_ios(void *arg) 1247 { 1248 struct nvme_bdev_channel *nbdev_ch = arg; 1249 struct nvme_bdev_io *bio, *tmp_bio; 1250 uint64_t now, delay_us; 1251 1252 now = spdk_get_ticks(); 1253 1254 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1255 if (bio->retry_ticks > now) { 1256 break; 1257 } 1258 1259 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1260 1261 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1262 } 1263 1264 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1265 1266 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1267 if (bio != NULL) { 1268 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1269 1270 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1271 delay_us); 1272 } 1273 1274 return SPDK_POLLER_BUSY; 1275 } 1276 1277 static void 1278 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1279 struct nvme_bdev_io *bio, uint64_t delay_ms) 1280 { 1281 struct nvme_bdev_io *tmp_bio; 1282 1283 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1284 1285 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1286 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1287 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1288 retry_link); 1289 return; 1290 } 1291 } 1292 1293 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1294 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1295 1296 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1297 1298 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1299 delay_ms * 1000ULL); 1300 } 1301 1302 static void 1303 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1304 { 1305 struct nvme_bdev_io *bio, *tmp_bio; 1306 1307 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1308 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1309 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1310 } 1311 1312 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1313 } 1314 1315 static int 1316 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1317 struct nvme_bdev_io *bio_to_abort) 1318 { 1319 struct nvme_bdev_io *bio; 1320 1321 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1322 if (bio == bio_to_abort) { 1323 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1324 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1325 return 0; 1326 } 1327 } 1328 1329 return -ENOENT; 1330 } 1331 1332 static void 1333 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1334 { 1335 struct nvme_bdev *nbdev; 1336 uint16_t sct, sc; 1337 1338 assert(spdk_nvme_cpl_is_error(cpl)); 1339 1340 nbdev = bdev_io->bdev->ctxt; 1341 1342 if (nbdev->err_stat == NULL) { 1343 return; 1344 } 1345 1346 sct = cpl->status.sct; 1347 sc = cpl->status.sc; 1348 1349 pthread_mutex_lock(&nbdev->mutex); 1350 1351 nbdev->err_stat->status_type[sct]++; 1352 switch (sct) { 1353 case SPDK_NVME_SCT_GENERIC: 1354 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1355 case SPDK_NVME_SCT_MEDIA_ERROR: 1356 case SPDK_NVME_SCT_PATH: 1357 nbdev->err_stat->status[sct][sc]++; 1358 break; 1359 default: 1360 break; 1361 } 1362 1363 pthread_mutex_unlock(&nbdev->mutex); 1364 } 1365 1366 static inline void 1367 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1368 { 1369 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1370 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1371 uint32_t blocklen = bdev_io->bdev->blocklen; 1372 struct spdk_bdev_io_stat *stat; 1373 uint64_t tsc_diff; 1374 1375 if (bio->io_path->stat == NULL) { 1376 return; 1377 } 1378 1379 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1380 stat = bio->io_path->stat; 1381 1382 switch (bdev_io->type) { 1383 case SPDK_BDEV_IO_TYPE_READ: 1384 stat->bytes_read += num_blocks * blocklen; 1385 stat->num_read_ops++; 1386 stat->read_latency_ticks += tsc_diff; 1387 if (stat->max_read_latency_ticks < tsc_diff) { 1388 stat->max_read_latency_ticks = tsc_diff; 1389 } 1390 if (stat->min_read_latency_ticks > tsc_diff) { 1391 stat->min_read_latency_ticks = tsc_diff; 1392 } 1393 break; 1394 case SPDK_BDEV_IO_TYPE_WRITE: 1395 stat->bytes_written += num_blocks * blocklen; 1396 stat->num_write_ops++; 1397 stat->write_latency_ticks += tsc_diff; 1398 if (stat->max_write_latency_ticks < tsc_diff) { 1399 stat->max_write_latency_ticks = tsc_diff; 1400 } 1401 if (stat->min_write_latency_ticks > tsc_diff) { 1402 stat->min_write_latency_ticks = tsc_diff; 1403 } 1404 break; 1405 case SPDK_BDEV_IO_TYPE_UNMAP: 1406 stat->bytes_unmapped += num_blocks * blocklen; 1407 stat->num_unmap_ops++; 1408 stat->unmap_latency_ticks += tsc_diff; 1409 if (stat->max_unmap_latency_ticks < tsc_diff) { 1410 stat->max_unmap_latency_ticks = tsc_diff; 1411 } 1412 if (stat->min_unmap_latency_ticks > 
tsc_diff) { 1413 stat->min_unmap_latency_ticks = tsc_diff; 1414 } 1415 break; 1416 case SPDK_BDEV_IO_TYPE_ZCOPY: 1417 /* Track the data in the start phase only */ 1418 if (!bdev_io->u.bdev.zcopy.start) { 1419 break; 1420 } 1421 if (bdev_io->u.bdev.zcopy.populate) { 1422 stat->bytes_read += num_blocks * blocklen; 1423 stat->num_read_ops++; 1424 stat->read_latency_ticks += tsc_diff; 1425 if (stat->max_read_latency_ticks < tsc_diff) { 1426 stat->max_read_latency_ticks = tsc_diff; 1427 } 1428 if (stat->min_read_latency_ticks > tsc_diff) { 1429 stat->min_read_latency_ticks = tsc_diff; 1430 } 1431 } else { 1432 stat->bytes_written += num_blocks * blocklen; 1433 stat->num_write_ops++; 1434 stat->write_latency_ticks += tsc_diff; 1435 if (stat->max_write_latency_ticks < tsc_diff) { 1436 stat->max_write_latency_ticks = tsc_diff; 1437 } 1438 if (stat->min_write_latency_ticks > tsc_diff) { 1439 stat->min_write_latency_ticks = tsc_diff; 1440 } 1441 } 1442 break; 1443 case SPDK_BDEV_IO_TYPE_COPY: 1444 stat->bytes_copied += num_blocks * blocklen; 1445 stat->num_copy_ops++; 1446 stat->copy_latency_ticks += tsc_diff; 1447 if (stat->max_copy_latency_ticks < tsc_diff) { 1448 stat->max_copy_latency_ticks = tsc_diff; 1449 } 1450 if (stat->min_copy_latency_ticks > tsc_diff) { 1451 stat->min_copy_latency_ticks = tsc_diff; 1452 } 1453 break; 1454 default: 1455 break; 1456 } 1457 } 1458 1459 static bool 1460 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1461 const struct spdk_nvme_cpl *cpl, 1462 struct nvme_bdev_channel *nbdev_ch, 1463 uint64_t *_delay_ms) 1464 { 1465 struct nvme_io_path *io_path = bio->io_path; 1466 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1467 const struct spdk_nvme_ctrlr_data *cdata; 1468 1469 if (spdk_nvme_cpl_is_path_error(cpl) || 1470 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1471 !nvme_io_path_is_available(io_path) || 1472 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1473 bdev_nvme_clear_current_io_path(nbdev_ch); 1474 bio->io_path = NULL; 1475 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1476 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1477 io_path->nvme_ns->ana_state_updating = true; 1478 } 1479 } 1480 if (!any_io_path_may_become_available(nbdev_ch)) { 1481 return false; 1482 } 1483 *_delay_ms = 0; 1484 } else { 1485 bio->retry_count++; 1486 1487 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1488 1489 if (cpl->status.crd != 0) { 1490 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1491 } else { 1492 *_delay_ms = 0; 1493 } 1494 } 1495 1496 return true; 1497 } 1498 1499 static inline void 1500 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1501 const struct spdk_nvme_cpl *cpl) 1502 { 1503 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1504 struct nvme_bdev_channel *nbdev_ch; 1505 uint64_t delay_ms; 1506 1507 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1508 1509 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1510 bdev_nvme_update_io_path_stat(bio); 1511 goto complete; 1512 } 1513 1514 /* Update error counts before deciding if retry is needed. 1515 * Hence, error counts may be more than the number of I/O errors. 
1516 */ 1517 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1518 1519 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1520 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1521 goto complete; 1522 } 1523 1524 /* At this point we don't know whether the sequence was successfully executed or not, so we 1525 * cannot retry the IO */ 1526 if (bdev_io->u.bdev.accel_sequence != NULL) { 1527 goto complete; 1528 } 1529 1530 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1531 1532 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1533 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1534 return; 1535 } 1536 1537 complete: 1538 bio->retry_count = 0; 1539 bio->submit_tsc = 0; 1540 bdev_io->u.bdev.accel_sequence = NULL; 1541 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1542 } 1543 1544 static inline void 1545 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1546 { 1547 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1548 struct nvme_bdev_channel *nbdev_ch; 1549 enum spdk_bdev_io_status io_status; 1550 1551 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1552 1553 switch (rc) { 1554 case 0: 1555 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1556 break; 1557 case -ENOMEM: 1558 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1559 break; 1560 case -ENXIO: 1561 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1562 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1563 1564 bdev_nvme_clear_current_io_path(nbdev_ch); 1565 bio->io_path = NULL; 1566 1567 if (any_io_path_may_become_available(nbdev_ch)) { 1568 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1569 return; 1570 } 1571 } 1572 1573 /* fallthrough */ 1574 default: 1575 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1576 bdev_io->u.bdev.accel_sequence = NULL; 1577 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1578 break; 1579 } 1580 1581 bio->retry_count = 0; 1582 bio->submit_tsc = 0; 1583 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1584 } 1585 1586 static inline void 1587 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1588 { 1589 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1590 enum spdk_bdev_io_status io_status; 1591 1592 switch (rc) { 1593 case 0: 1594 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1595 break; 1596 case -ENOMEM: 1597 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1598 break; 1599 case -ENXIO: 1600 /* fallthrough */ 1601 default: 1602 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1603 break; 1604 } 1605 1606 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1607 } 1608 1609 static void 1610 bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr, 1611 void *ctx, int status) 1612 { 1613 pthread_mutex_lock(&nvme_ctrlr->mutex); 1614 1615 assert(nvme_ctrlr->io_path_cache_clearing == true); 1616 nvme_ctrlr->io_path_cache_clearing = false; 1617 1618 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1619 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1620 return; 1621 } 1622 1623 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1624 1625 nvme_ctrlr_unregister(nvme_ctrlr); 1626 } 1627 1628 static void 1629 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1630 { 1631 struct nvme_io_path *io_path; 1632 1633 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1634 if (io_path->nbdev_ch == NULL) { 1635 continue; 1636 } 1637 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1638 } 1639 } 1640 1641 static void 1642 
bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i, 1643 struct nvme_ctrlr *nvme_ctrlr, 1644 struct nvme_ctrlr_channel *ctrlr_ch, 1645 void *ctx) 1646 { 1647 assert(ctrlr_ch->qpair != NULL); 1648 1649 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1650 1651 nvme_ctrlr_for_each_channel_continue(i, 0); 1652 } 1653 1654 static void 1655 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1656 { 1657 pthread_mutex_lock(&nvme_ctrlr->mutex); 1658 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1659 nvme_ctrlr->io_path_cache_clearing) { 1660 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1661 return; 1662 } 1663 1664 nvme_ctrlr->io_path_cache_clearing = true; 1665 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1666 1667 nvme_ctrlr_for_each_channel(nvme_ctrlr, 1668 bdev_nvme_clear_io_path_cache, 1669 NULL, 1670 bdev_nvme_clear_io_path_caches_done); 1671 } 1672 1673 static struct nvme_qpair * 1674 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1675 { 1676 struct nvme_qpair *nvme_qpair; 1677 1678 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1679 if (nvme_qpair->qpair == qpair) { 1680 break; 1681 } 1682 } 1683 1684 return nvme_qpair; 1685 } 1686 1687 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1688 1689 static void 1690 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1691 { 1692 struct nvme_poll_group *group = poll_group_ctx; 1693 struct nvme_qpair *nvme_qpair; 1694 struct nvme_ctrlr_channel *ctrlr_ch; 1695 int status; 1696 1697 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1698 if (nvme_qpair == NULL) { 1699 return; 1700 } 1701 1702 if (nvme_qpair->qpair != NULL) { 1703 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1704 nvme_qpair->qpair = NULL; 1705 } 1706 1707 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1708 1709 ctrlr_ch = nvme_qpair->ctrlr_ch; 1710 1711 if (ctrlr_ch != NULL) { 1712 if (ctrlr_ch->reset_iter != NULL) { 1713 /* We are in a full reset sequence. */ 1714 if (ctrlr_ch->connect_poller != NULL) { 1715 /* qpair was failed to connect. Abort the reset sequence. */ 1716 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1717 qpair); 1718 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1719 status = -1; 1720 } else { 1721 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1722 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1723 qpair); 1724 status = 0; 1725 } 1726 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1727 ctrlr_ch->reset_iter = NULL; 1728 } else { 1729 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1730 SPDK_NOTICELOG("qpair %p was disconnected and freed. reset controller.\n", qpair); 1731 bdev_nvme_failover_ctrlr(nvme_qpair->ctrlr); 1732 } 1733 } else { 1734 /* In this case, ctrlr_channel is already deleted. */ 1735 SPDK_DEBUGLOG(bdev_nvme, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", qpair); 1736 nvme_qpair_delete(nvme_qpair); 1737 } 1738 } 1739 1740 static void 1741 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1742 { 1743 struct nvme_qpair *nvme_qpair; 1744 1745 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1746 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1747 continue; 1748 } 1749 1750 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1751 SPDK_NVME_QPAIR_FAILURE_NONE) { 1752 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1753 } 1754 } 1755 } 1756 1757 static int 1758 bdev_nvme_poll(void *arg) 1759 { 1760 struct nvme_poll_group *group = arg; 1761 int64_t num_completions; 1762 1763 if (group->collect_spin_stat && group->start_ticks == 0) { 1764 group->start_ticks = spdk_get_ticks(); 1765 } 1766 1767 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1768 bdev_nvme_disconnected_qpair_cb); 1769 if (group->collect_spin_stat) { 1770 if (num_completions > 0) { 1771 if (group->end_ticks != 0) { 1772 group->spin_ticks += (group->end_ticks - group->start_ticks); 1773 group->end_ticks = 0; 1774 } 1775 group->start_ticks = 0; 1776 } else { 1777 group->end_ticks = spdk_get_ticks(); 1778 } 1779 } 1780 1781 if (spdk_unlikely(num_completions < 0)) { 1782 bdev_nvme_check_io_qpairs(group); 1783 } 1784 1785 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1786 } 1787 1788 static int bdev_nvme_poll_adminq(void *arg); 1789 1790 static void 1791 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1792 { 1793 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1794 1795 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1796 nvme_ctrlr, new_period_us); 1797 } 1798 1799 static int 1800 bdev_nvme_poll_adminq(void *arg) 1801 { 1802 int32_t rc; 1803 struct nvme_ctrlr *nvme_ctrlr = arg; 1804 nvme_ctrlr_disconnected_cb disconnected_cb; 1805 1806 assert(nvme_ctrlr != NULL); 1807 1808 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1809 if (rc < 0) { 1810 disconnected_cb = nvme_ctrlr->disconnected_cb; 1811 nvme_ctrlr->disconnected_cb = NULL; 1812 1813 if (disconnected_cb != NULL) { 1814 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1815 g_opts.nvme_adminq_poll_period_us); 1816 disconnected_cb(nvme_ctrlr); 1817 } else { 1818 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1819 } 1820 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1821 SPDK_NVME_QPAIR_FAILURE_NONE) { 1822 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1823 } 1824 1825 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1826 } 1827 1828 static void 1829 nvme_bdev_free(void *io_device) 1830 { 1831 struct nvme_bdev *nvme_disk = io_device; 1832 1833 pthread_mutex_destroy(&nvme_disk->mutex); 1834 free(nvme_disk->disk.name); 1835 free(nvme_disk->err_stat); 1836 free(nvme_disk); 1837 } 1838 1839 static int 1840 bdev_nvme_destruct(void *ctx) 1841 { 1842 struct nvme_bdev *nvme_disk = ctx; 1843 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1844 1845 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid); 1846 1847 TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) { 1848 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1849 1850 nvme_ns->bdev = NULL; 1851 1852 assert(nvme_ns->id > 0); 1853 1854 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1855 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1856 1857 nvme_ctrlr_release(nvme_ns->ctrlr); 1858 nvme_ns_free(nvme_ns); 1859 } else { 1860 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1861 } 1862 } 1863 1864 pthread_mutex_lock(&g_bdev_nvme_mutex); 1865 TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq); 1866 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1867 1868 spdk_io_device_unregister(nvme_disk, nvme_bdev_free); 1869 1870 return 0; 1871 } 1872 1873 static int 1874 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1875 { 1876 struct nvme_ctrlr *nvme_ctrlr; 1877 struct spdk_nvme_io_qpair_opts opts; 1878 struct spdk_nvme_qpair *qpair; 1879 int rc; 1880 1881 nvme_ctrlr = nvme_qpair->ctrlr; 1882 1883 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1884 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1885 opts.create_only = true; 1886 opts.async_mode = true; 1887 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1888 g_opts.io_queue_requests = opts.io_queue_requests; 1889 1890 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1891 if (qpair == NULL) { 1892 return -1; 1893 } 1894 1895 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1896 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1897 1898 assert(nvme_qpair->group != NULL); 1899 1900 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1901 if (rc != 0) { 1902 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 1903 goto err; 1904 } 1905 1906 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1907 if (rc != 0) { 1908 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 1909 goto err; 1910 } 1911 1912 nvme_qpair->qpair = qpair; 1913 1914 if (!g_opts.disable_auto_failback) { 1915 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1916 } 1917 1918 return 0; 1919 1920 err: 1921 spdk_nvme_ctrlr_free_io_qpair(qpair); 1922 1923 return rc; 1924 } 1925 1926 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1927 1928 static void 1929 bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel_iter *i, 1930 struct nvme_ctrlr *nvme_ctrlr, 1931 struct nvme_ctrlr_channel *ctrlr_ch, 1932 void *ctx) 1933 { 1934 int rc = 0; 1935 struct nvme_bdev_io *bio; 1936 1937 if (ctx != NULL) { 1938 rc = -1; 1939 } 1940 1941 while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) { 1942 bio = TAILQ_FIRST(&ctrlr_ch->pending_resets); 1943 TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link); 1944 1945 bdev_nvme_reset_io_continue(bio, rc); 1946 } 1947 1948 nvme_ctrlr_for_each_channel_continue(i, 0); 1949 } 1950 1951 /* This function marks the current trid as failed by 
storing the current ticks 1952 * and then sets the next trid to the active trid within a controller if exists. 1953 * 1954 * The purpose of the boolean return value is to request the caller to disconnect 1955 * the current trid now to try connecting the next trid. 1956 */ 1957 static bool 1958 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 1959 { 1960 struct nvme_path_id *path_id, *next_path; 1961 int rc __attribute__((unused)); 1962 1963 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 1964 assert(path_id); 1965 assert(path_id == nvme_ctrlr->active_path_id); 1966 next_path = TAILQ_NEXT(path_id, link); 1967 1968 /* Update the last failed time. It means the trid is failed if its last 1969 * failed time is non-zero. 1970 */ 1971 path_id->last_failed_tsc = spdk_get_ticks(); 1972 1973 if (next_path == NULL) { 1974 /* There is no alternate trid within a controller. */ 1975 return false; 1976 } 1977 1978 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 1979 /* Connect is not retried in a controller reset sequence. Connecting 1980 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 1981 */ 1982 return false; 1983 } 1984 1985 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 1986 1987 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", path_id->trid.traddr, 1988 path_id->trid.trsvcid, next_path->trid.traddr, next_path->trid.trsvcid); 1989 1990 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 1991 nvme_ctrlr->active_path_id = next_path; 1992 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 1993 assert(rc == 0); 1994 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 1995 if (!remove) { 1996 /** Shuffle the old trid to the end of the list and use the new one. 1997 * Allows for round robin through multiple connections. 1998 */ 1999 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 2000 } else { 2001 free(path_id); 2002 } 2003 2004 if (start || next_path->last_failed_tsc == 0) { 2005 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 2006 * or used yet. Try the next trid now. 2007 */ 2008 return true; 2009 } 2010 2011 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 2012 nvme_ctrlr->opts.reconnect_delay_sec) { 2013 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 2014 return true; 2015 } 2016 2017 /* The next trid will be tried after reconnect_delay_sec seconds. 
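 *
 * Worked example with illustrative numbers (not taken from this file): if
 * reconnect_delay_sec is 10 and spdk_get_ticks_hz() reports 1,000,000 ticks per
 * second, the backoff check above only returns true once spdk_get_ticks() exceeds
 * next_path->last_failed_tsc + 10,000,000 ticks, i.e. 10 seconds after the next
 * trid last failed.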
*/ 2018 return false; 2019 } 2020 2021 static bool 2022 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 2023 { 2024 int32_t elapsed; 2025 2026 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 2027 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 2028 return false; 2029 } 2030 2031 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2032 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 2033 return true; 2034 } else { 2035 return false; 2036 } 2037 } 2038 2039 static bool 2040 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 2041 { 2042 uint32_t elapsed; 2043 2044 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 2045 return false; 2046 } 2047 2048 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2049 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 2050 return true; 2051 } else { 2052 return false; 2053 } 2054 } 2055 2056 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 2057 2058 static void 2059 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 2060 { 2061 int rc; 2062 2063 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 2064 if (rc != 0) { 2065 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 2066 * fail the reset sequence immediately. 2067 */ 2068 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2069 return; 2070 } 2071 2072 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 2073 * Set callback here to execute the specified operation after ctrlr is really disconnected. 2074 */ 2075 assert(nvme_ctrlr->disconnected_cb == NULL); 2076 nvme_ctrlr->disconnected_cb = cb_fn; 2077 2078 /* During disconnection, reduce the period to poll adminq more often. */ 2079 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 2080 } 2081 2082 enum bdev_nvme_op_after_reset { 2083 OP_NONE, 2084 OP_COMPLETE_PENDING_DESTRUCT, 2085 OP_DESTRUCT, 2086 OP_DELAYED_RECONNECT, 2087 OP_FAILOVER, 2088 }; 2089 2090 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 2091 2092 static _bdev_nvme_op_after_reset 2093 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 2094 { 2095 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 2096 /* Complete pending destruct after reset completes. 
*/ 2097 return OP_COMPLETE_PENDING_DESTRUCT; 2098 } else if (nvme_ctrlr->pending_failover) { 2099 nvme_ctrlr->pending_failover = false; 2100 nvme_ctrlr->reset_start_tsc = 0; 2101 return OP_FAILOVER; 2102 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2103 nvme_ctrlr->reset_start_tsc = 0; 2104 return OP_NONE; 2105 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2106 return OP_DESTRUCT; 2107 } else { 2108 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 2109 nvme_ctrlr->fast_io_fail_timedout = true; 2110 } 2111 return OP_DELAYED_RECONNECT; 2112 } 2113 } 2114 2115 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 2116 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 2117 2118 static int 2119 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2120 { 2121 struct nvme_ctrlr *nvme_ctrlr = ctx; 2122 2123 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2124 pthread_mutex_lock(&nvme_ctrlr->mutex); 2125 2126 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2127 2128 if (!nvme_ctrlr->reconnect_is_delayed) { 2129 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2130 return SPDK_POLLER_BUSY; 2131 } 2132 2133 nvme_ctrlr->reconnect_is_delayed = false; 2134 2135 if (nvme_ctrlr->destruct) { 2136 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2137 return SPDK_POLLER_BUSY; 2138 } 2139 2140 assert(nvme_ctrlr->resetting == false); 2141 nvme_ctrlr->resetting = true; 2142 2143 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2144 2145 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2146 2147 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2148 return SPDK_POLLER_BUSY; 2149 } 2150 2151 static void 2152 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2153 { 2154 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2155 2156 assert(nvme_ctrlr->reconnect_is_delayed == false); 2157 nvme_ctrlr->reconnect_is_delayed = true; 2158 2159 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2160 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2161 nvme_ctrlr, 2162 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2163 } 2164 2165 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2166 2167 static void 2168 _bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2169 { 2170 bool success = (ctx == NULL); 2171 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2172 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2173 enum bdev_nvme_op_after_reset op_after_reset; 2174 2175 assert(nvme_ctrlr->thread == spdk_get_thread()); 2176 2177 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2178 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2179 2180 if (!success) { 2181 SPDK_ERRLOG("Resetting controller failed.\n"); 2182 } else { 2183 SPDK_NOTICELOG("Resetting controller successful.\n"); 2184 } 2185 2186 pthread_mutex_lock(&nvme_ctrlr->mutex); 2187 nvme_ctrlr->resetting = false; 2188 nvme_ctrlr->dont_retry = false; 2189 nvme_ctrlr->in_failover = false; 2190 2191 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2192 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2193 2194 /* Delay callbacks when the next operation is a failover. */ 2195 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2196 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2197 } 2198 2199 switch (op_after_reset) { 2200 case OP_COMPLETE_PENDING_DESTRUCT: 2201 nvme_ctrlr_unregister(nvme_ctrlr); 2202 break; 2203 case OP_DESTRUCT: 2204 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2205 remove_discovery_entry(nvme_ctrlr); 2206 break; 2207 case OP_DELAYED_RECONNECT: 2208 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2209 break; 2210 case OP_FAILOVER: 2211 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2212 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2213 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2214 break; 2215 default: 2216 break; 2217 } 2218 } 2219 2220 static void 2221 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2222 { 2223 pthread_mutex_lock(&nvme_ctrlr->mutex); 2224 if (!success) { 2225 /* Connecting the active trid failed. Set the next alternate trid to the 2226 * active trid if it exists. 2227 */ 2228 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2229 /* The next alternate trid exists and is ready to try. Try it now. */ 2230 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2231 2232 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2233 return; 2234 } 2235 2236 /* We came here if there is no alternate trid or if the next trid exists but 2237 * is not ready to try. We will try the active trid after reconnect_delay_sec 2238 * seconds if it is non-zero or at the next reset call otherwise. 2239 */ 2240 } else { 2241 /* Connecting the active trid succeeded. Clear the last failed time because it 2242 * means the trid is failed if its last failed time is non-zero. 2243 */ 2244 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2245 } 2246 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2247 2248 /* Make sure we clear any pending resets before returning. */ 2249 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2250 bdev_nvme_complete_pending_resets, 2251 success ? NULL : (void *)0x1, 2252 _bdev_nvme_reset_ctrlr_complete); 2253 } 2254 2255 static void 2256 bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2257 { 2258 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2259 } 2260 2261 static void 2262 bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i, 2263 struct nvme_ctrlr *nvme_ctrlr, 2264 struct nvme_ctrlr_channel *ctrlr_ch, void *ctx) 2265 { 2266 struct nvme_qpair *nvme_qpair; 2267 2268 nvme_qpair = ctrlr_ch->qpair; 2269 assert(nvme_qpair != NULL); 2270 2271 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2272 2273 if (nvme_qpair->qpair != NULL) { 2274 if (nvme_qpair->ctrlr->dont_retry) { 2275 spdk_nvme_qpair_set_abort_dnr(nvme_qpair->qpair, true); 2276 } 2277 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 2278 2279 /* The current full reset sequence will move to the next 2280 * ctrlr_channel after the qpair is actually disconnected. 2281 */ 2282 assert(ctrlr_ch->reset_iter == NULL); 2283 ctrlr_ch->reset_iter = i; 2284 } else { 2285 nvme_ctrlr_for_each_channel_continue(i, 0); 2286 } 2287 } 2288 2289 static void 2290 bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2291 { 2292 if (status == 0) { 2293 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2294 } else { 2295 /* Delete the added qpairs and quiesce ctrlr to make the states clean. 
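 *
 * (Illustrative note: bdev_nvme_reset_destroy_qpair below disconnects any qpair that
 * did get created, and bdev_nvme_reset_create_qpairs_failed then finishes the reset
 * sequence with success == false.)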
*/ 2296 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2297 bdev_nvme_reset_destroy_qpair, 2298 NULL, 2299 bdev_nvme_reset_create_qpairs_failed); 2300 } 2301 } 2302 2303 static int 2304 bdev_nvme_reset_check_qpair_connected(void *ctx) 2305 { 2306 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2307 2308 if (ctrlr_ch->reset_iter == NULL) { 2309 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2310 assert(ctrlr_ch->connect_poller == NULL); 2311 assert(ctrlr_ch->qpair->qpair == NULL); 2312 return SPDK_POLLER_BUSY; 2313 } 2314 2315 assert(ctrlr_ch->qpair->qpair != NULL); 2316 2317 if (!spdk_nvme_qpair_is_connected(ctrlr_ch->qpair->qpair)) { 2318 return SPDK_POLLER_BUSY; 2319 } 2320 2321 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2322 2323 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2324 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2325 ctrlr_ch->reset_iter = NULL; 2326 2327 if (!g_opts.disable_auto_failback) { 2328 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 2329 } 2330 2331 return SPDK_POLLER_BUSY; 2332 } 2333 2334 static void 2335 bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i, 2336 struct nvme_ctrlr *nvme_ctrlr, 2337 struct nvme_ctrlr_channel *ctrlr_ch, 2338 void *ctx) 2339 { 2340 int rc; 2341 2342 rc = bdev_nvme_create_qpair(ctrlr_ch->qpair); 2343 if (rc == 0) { 2344 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2345 ctrlr_ch, 0); 2346 2347 /* The current full reset sequence will move to the next 2348 * ctrlr_channel after the qpair is actually connected. 2349 */ 2350 assert(ctrlr_ch->reset_iter == NULL); 2351 ctrlr_ch->reset_iter = i; 2352 } else { 2353 nvme_ctrlr_for_each_channel_continue(i, rc); 2354 } 2355 } 2356 2357 static void 2358 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2359 { 2360 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2361 struct nvme_ns *nvme_ns; 2362 2363 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2364 nvme_ns != NULL; 2365 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2366 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2367 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2368 /* NS can be added again. Just nullify nvme_ns->ns. */ 2369 nvme_ns->ns = NULL; 2370 } 2371 } 2372 } 2373 2374 2375 static int 2376 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2377 { 2378 struct nvme_ctrlr *nvme_ctrlr = arg; 2379 int rc = -ETIMEDOUT; 2380 2381 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2382 /* Mark the ctrlr as failed. The next call to 2383 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2384 * do the necessary cleanup and return failure. 
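 *
 * (For reference: bdev_nvme_check_ctrlr_loss_timeout() returning true means at least
 * ctrlr_loss_timeout_sec seconds have elapsed since reset_start_tsc, so reconnect
 * attempts are abandoned rather than retried indefinitely.)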
2385 */ 2386 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2387 } 2388 2389 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2390 if (rc == -EAGAIN) { 2391 return SPDK_POLLER_BUSY; 2392 } 2393 2394 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2395 if (rc == 0) { 2396 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2397 2398 /* Recreate all of the I/O queue pairs */ 2399 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2400 bdev_nvme_reset_create_qpair, 2401 NULL, 2402 bdev_nvme_reset_create_qpairs_done); 2403 } else { 2404 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2405 } 2406 return SPDK_POLLER_BUSY; 2407 } 2408 2409 static void 2410 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2411 { 2412 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2413 2414 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2415 assert(nvme_ctrlr->reset_detach_poller == NULL); 2416 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2417 nvme_ctrlr, 0); 2418 } 2419 2420 static void 2421 bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2422 { 2423 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2424 assert(status == 0); 2425 2426 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2427 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2428 } else { 2429 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2430 } 2431 } 2432 2433 static void 2434 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2435 { 2436 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2437 bdev_nvme_reset_destroy_qpair, 2438 NULL, 2439 bdev_nvme_reset_destroy_qpair_done); 2440 } 2441 2442 static void 2443 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2444 { 2445 struct nvme_ctrlr *nvme_ctrlr = ctx; 2446 2447 assert(nvme_ctrlr->resetting == true); 2448 assert(nvme_ctrlr->thread == spdk_get_thread()); 2449 2450 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2451 2452 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2453 2454 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2455 } 2456 2457 static void 2458 _bdev_nvme_reset_ctrlr(void *ctx) 2459 { 2460 struct nvme_ctrlr *nvme_ctrlr = ctx; 2461 2462 assert(nvme_ctrlr->resetting == true); 2463 assert(nvme_ctrlr->thread == spdk_get_thread()); 2464 2465 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2466 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2467 } else { 2468 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2469 } 2470 } 2471 2472 static int 2473 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2474 { 2475 spdk_msg_fn msg_fn; 2476 2477 pthread_mutex_lock(&nvme_ctrlr->mutex); 2478 if (nvme_ctrlr->destruct) { 2479 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2480 return -ENXIO; 2481 } 2482 2483 if (nvme_ctrlr->resetting) { 2484 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2485 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 2486 return -EBUSY; 2487 } 2488 2489 if (nvme_ctrlr->disabled) { 2490 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2491 SPDK_NOTICELOG("Unable to perform reset. 
Controller is disabled.\n"); 2492 return -EALREADY; 2493 } 2494 2495 nvme_ctrlr->resetting = true; 2496 nvme_ctrlr->dont_retry = true; 2497 2498 if (nvme_ctrlr->reconnect_is_delayed) { 2499 SPDK_DEBUGLOG(bdev_nvme, "Reconnect is already scheduled.\n"); 2500 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2501 nvme_ctrlr->reconnect_is_delayed = false; 2502 } else { 2503 msg_fn = _bdev_nvme_reset_ctrlr; 2504 assert(nvme_ctrlr->reset_start_tsc == 0); 2505 } 2506 2507 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2508 2509 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2510 2511 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2512 return 0; 2513 } 2514 2515 static int 2516 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2517 { 2518 pthread_mutex_lock(&nvme_ctrlr->mutex); 2519 if (nvme_ctrlr->destruct) { 2520 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2521 return -ENXIO; 2522 } 2523 2524 if (nvme_ctrlr->resetting) { 2525 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2526 return -EBUSY; 2527 } 2528 2529 if (!nvme_ctrlr->disabled) { 2530 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2531 return -EALREADY; 2532 } 2533 2534 nvme_ctrlr->disabled = false; 2535 nvme_ctrlr->resetting = true; 2536 2537 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2538 2539 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2540 2541 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2542 return 0; 2543 } 2544 2545 static void 2546 _bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2547 { 2548 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2549 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2550 enum bdev_nvme_op_after_reset op_after_disable; 2551 2552 assert(nvme_ctrlr->thread == spdk_get_thread()); 2553 2554 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2555 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2556 2557 pthread_mutex_lock(&nvme_ctrlr->mutex); 2558 2559 nvme_ctrlr->resetting = false; 2560 nvme_ctrlr->dont_retry = false; 2561 2562 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2563 2564 nvme_ctrlr->disabled = true; 2565 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2566 2567 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2568 2569 if (ctrlr_op_cb_fn) { 2570 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2571 } 2572 2573 switch (op_after_disable) { 2574 case OP_COMPLETE_PENDING_DESTRUCT: 2575 nvme_ctrlr_unregister(nvme_ctrlr); 2576 break; 2577 default: 2578 break; 2579 } 2580 2581 } 2582 2583 static void 2584 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2585 { 2586 /* Make sure we clear any pending resets before returning. 
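 *
 * (Note: ctx is NULL here, so bdev_nvme_complete_pending_resets() completes each queued
 * reset bio with rc == 0; the reset-failure path instead passes a non-NULL ctx to fail
 * them with rc == -1.)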
*/ 2587 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2588 bdev_nvme_complete_pending_resets, 2589 NULL, 2590 _bdev_nvme_disable_ctrlr_complete); 2591 } 2592 2593 static void 2594 bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2595 { 2596 assert(status == 0); 2597 2598 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2599 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2600 } else { 2601 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2602 } 2603 } 2604 2605 static void 2606 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2607 { 2608 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2609 bdev_nvme_reset_destroy_qpair, 2610 NULL, 2611 bdev_nvme_disable_destroy_qpairs_done); 2612 } 2613 2614 static void 2615 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2616 { 2617 struct nvme_ctrlr *nvme_ctrlr = ctx; 2618 2619 assert(nvme_ctrlr->resetting == true); 2620 assert(nvme_ctrlr->thread == spdk_get_thread()); 2621 2622 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2623 2624 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2625 } 2626 2627 static void 2628 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2629 { 2630 struct nvme_ctrlr *nvme_ctrlr = ctx; 2631 2632 assert(nvme_ctrlr->resetting == true); 2633 assert(nvme_ctrlr->thread == spdk_get_thread()); 2634 2635 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2636 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2637 } else { 2638 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2639 } 2640 } 2641 2642 static int 2643 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2644 { 2645 spdk_msg_fn msg_fn; 2646 2647 pthread_mutex_lock(&nvme_ctrlr->mutex); 2648 if (nvme_ctrlr->destruct) { 2649 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2650 return -ENXIO; 2651 } 2652 2653 if (nvme_ctrlr->resetting) { 2654 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2655 return -EBUSY; 2656 } 2657 2658 if (nvme_ctrlr->disabled) { 2659 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2660 return -EALREADY; 2661 } 2662 2663 nvme_ctrlr->resetting = true; 2664 nvme_ctrlr->dont_retry = true; 2665 2666 if (nvme_ctrlr->reconnect_is_delayed) { 2667 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2668 nvme_ctrlr->reconnect_is_delayed = false; 2669 } else { 2670 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2671 } 2672 2673 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2674 2675 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2676 2677 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2678 return 0; 2679 } 2680 2681 static int 2682 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2683 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2684 { 2685 int rc; 2686 2687 switch (op) { 2688 case NVME_CTRLR_OP_RESET: 2689 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2690 break; 2691 case NVME_CTRLR_OP_ENABLE: 2692 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2693 break; 2694 case NVME_CTRLR_OP_DISABLE: 2695 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2696 break; 2697 default: 2698 rc = -EINVAL; 2699 break; 2700 } 2701 2702 if (rc == 0) { 2703 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2704 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2705 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2706 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2707 } 2708 return rc; 2709 } 2710 2711 struct nvme_ctrlr_op_rpc_ctx { 2712 struct nvme_ctrlr *nvme_ctrlr; 2713 struct spdk_thread *orig_thread; 2714 enum nvme_ctrlr_op op; 2715 int rc; 2716 bdev_nvme_ctrlr_op_cb cb_fn; 2717 void *cb_arg; 2718 }; 2719 
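/* Usage sketch for the RPC helpers below. This is illustrative only; the caller,
 * callback name, and log message are hypothetical and not part of this file.
 *
 *     static void
 *     my_reset_done(void *cb_arg, int rc)
 *     {
 *             SPDK_NOTICELOG("controller reset finished, rc = %d\n", rc);
 *     }
 *
 *     nvme_ctrlr_op_rpc(nvme_ctrlr, NVME_CTRLR_OP_RESET, my_reset_done, NULL);
 *
 * nvme_ctrlr_op_rpc() records the calling thread in ctx->orig_thread, starts the
 * requested controller operation, and uses spdk_thread_send_msg() to run the callback
 * back on that original thread once the operation completes.
 */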
2720 static void 2721 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2722 { 2723 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2724 2725 assert(ctx != NULL); 2726 assert(ctx->cb_fn != NULL); 2727 2728 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2729 2730 free(ctx); 2731 } 2732 2733 static void 2734 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2735 { 2736 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2737 2738 ctx->rc = rc; 2739 2740 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2741 } 2742 2743 void 2744 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2745 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2746 { 2747 struct nvme_ctrlr_op_rpc_ctx *ctx; 2748 int rc; 2749 2750 assert(cb_fn != NULL); 2751 2752 ctx = calloc(1, sizeof(*ctx)); 2753 if (ctx == NULL) { 2754 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2755 cb_fn(cb_arg, -ENOMEM); 2756 return; 2757 } 2758 2759 ctx->orig_thread = spdk_get_thread(); 2760 ctx->cb_fn = cb_fn; 2761 ctx->cb_arg = cb_arg; 2762 2763 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2764 if (rc == 0) { 2765 return; 2766 } else if (rc == -EALREADY) { 2767 rc = 0; 2768 } 2769 2770 nvme_ctrlr_op_rpc_complete(ctx, rc); 2771 } 2772 2773 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2774 2775 static void 2776 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2777 { 2778 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2779 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2780 int rc; 2781 2782 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2783 ctx->nvme_ctrlr = NULL; 2784 2785 if (ctx->rc != 0) { 2786 goto complete; 2787 } 2788 2789 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2790 if (next_nvme_ctrlr == NULL) { 2791 goto complete; 2792 } 2793 2794 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2795 if (rc == 0) { 2796 ctx->nvme_ctrlr = next_nvme_ctrlr; 2797 return; 2798 } else if (rc == -EALREADY) { 2799 ctx->nvme_ctrlr = next_nvme_ctrlr; 2800 rc = 0; 2801 } 2802 2803 ctx->rc = rc; 2804 2805 complete: 2806 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2807 free(ctx); 2808 } 2809 2810 static void 2811 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2812 { 2813 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2814 2815 ctx->rc = rc; 2816 2817 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2818 } 2819 2820 void 2821 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2822 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2823 { 2824 struct nvme_ctrlr_op_rpc_ctx *ctx; 2825 struct nvme_ctrlr *nvme_ctrlr; 2826 int rc; 2827 2828 assert(cb_fn != NULL); 2829 2830 ctx = calloc(1, sizeof(*ctx)); 2831 if (ctx == NULL) { 2832 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2833 cb_fn(cb_arg, -ENOMEM); 2834 return; 2835 } 2836 2837 ctx->orig_thread = spdk_get_thread(); 2838 ctx->op = op; 2839 ctx->cb_fn = cb_fn; 2840 ctx->cb_arg = cb_arg; 2841 2842 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2843 assert(nvme_ctrlr != NULL); 2844 2845 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2846 if (rc == 0) { 2847 ctx->nvme_ctrlr = nvme_ctrlr; 2848 return; 2849 } else if (rc == -EALREADY) { 2850 ctx->nvme_ctrlr = nvme_ctrlr; 2851 rc = 0; 2852 } 2853 2854 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2855 } 2856 2857 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2858 2859 static void 2860 bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void 
*ctx, int status) 2861 { 2862 struct nvme_bdev_io *bio = ctx; 2863 enum spdk_bdev_io_status io_status; 2864 2865 if (bio->cpl.cdw0 == 0) { 2866 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2867 } else { 2868 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2869 } 2870 2871 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2872 } 2873 2874 static void 2875 bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i, 2876 struct nvme_bdev *nbdev, 2877 struct nvme_bdev_channel *nbdev_ch, void *ctx) 2878 { 2879 bdev_nvme_abort_retry_ios(nbdev_ch); 2880 nbdev_ch->resetting = false; 2881 2882 nvme_bdev_for_each_channel_continue(i, 0); 2883 } 2884 2885 static void 2886 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2887 { 2888 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2889 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2890 2891 /* Abort all queued I/Os for retry. */ 2892 nvme_bdev_for_each_channel(nbdev, 2893 bdev_nvme_unfreeze_bdev_channel, 2894 bio, 2895 bdev_nvme_unfreeze_bdev_channel_done); 2896 } 2897 2898 static void 2899 _bdev_nvme_reset_io_continue(void *ctx) 2900 { 2901 struct nvme_bdev_io *bio = ctx; 2902 struct nvme_io_path *prev_io_path, *next_io_path; 2903 int rc; 2904 2905 prev_io_path = bio->io_path; 2906 bio->io_path = NULL; 2907 2908 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 2909 if (next_io_path == NULL) { 2910 goto complete; 2911 } 2912 2913 rc = _bdev_nvme_reset_io(next_io_path, bio); 2914 if (rc == 0) { 2915 return; 2916 } 2917 2918 complete: 2919 bdev_nvme_reset_io_complete(bio); 2920 } 2921 2922 static void 2923 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 2924 { 2925 struct nvme_bdev_io *bio = cb_arg; 2926 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2927 2928 /* Reset status is initialized as "failed". Set to "success" once we have at least one 2929 * successfully reset nvme_ctrlr. 2930 */ 2931 if (rc == 0) { 2932 bio->cpl.cdw0 = 0; 2933 } 2934 2935 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 2936 } 2937 2938 static int 2939 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 2940 { 2941 struct nvme_ctrlr_channel *ctrlr_ch; 2942 int rc; 2943 2944 rc = nvme_ctrlr_op(io_path->qpair->ctrlr, NVME_CTRLR_OP_RESET, 2945 bdev_nvme_reset_io_continue, bio); 2946 if (rc != 0 && rc != -EBUSY) { 2947 return rc; 2948 } 2949 2950 assert(bio->io_path == NULL); 2951 bio->io_path = io_path; 2952 2953 if (rc == -EBUSY) { 2954 ctrlr_ch = io_path->qpair->ctrlr_ch; 2955 assert(ctrlr_ch != NULL); 2956 /* 2957 * Reset call is queued only if it is from the app framework. This is on purpose so that 2958 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 2959 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 2960 */ 2961 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 2962 } 2963 2964 return 0; 2965 } 2966 2967 static void 2968 bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 2969 { 2970 struct nvme_bdev_io *bio = ctx; 2971 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2972 struct nvme_bdev_channel *nbdev_ch; 2973 struct nvme_io_path *io_path; 2974 int rc; 2975 2976 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2977 2978 /* Initialize with failed status. With multipath it is enough to have at least one successful 2979 * nvme_ctrlr reset. 
If there is none, reset status will remain failed. 2980 */ 2981 bio->cpl.cdw0 = 1; 2982 2983 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 2984 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 2985 assert(io_path != NULL); 2986 2987 rc = _bdev_nvme_reset_io(io_path, bio); 2988 if (rc != 0) { 2989 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 2990 rc = (rc == -EALREADY) ? 0 : rc; 2991 2992 bdev_nvme_reset_io_continue(bio, rc); 2993 } 2994 } 2995 2996 static void 2997 bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i, 2998 struct nvme_bdev *nbdev, 2999 struct nvme_bdev_channel *nbdev_ch, void *ctx) 3000 { 3001 nbdev_ch->resetting = true; 3002 3003 nvme_bdev_for_each_channel_continue(i, 0); 3004 } 3005 3006 static void 3007 bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio) 3008 { 3009 nvme_bdev_for_each_channel(nbdev, 3010 bdev_nvme_freeze_bdev_channel, 3011 bio, 3012 bdev_nvme_freeze_bdev_channel_done); 3013 } 3014 3015 static int 3016 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 3017 { 3018 if (nvme_ctrlr->destruct) { 3019 /* Don't bother resetting if the controller is in the process of being destructed. */ 3020 return -ENXIO; 3021 } 3022 3023 if (nvme_ctrlr->resetting) { 3024 if (!nvme_ctrlr->in_failover) { 3025 SPDK_NOTICELOG("Reset is already in progress. Defer failover until reset completes.\n"); 3026 3027 /* Defer failover until reset completes. */ 3028 nvme_ctrlr->pending_failover = true; 3029 return -EINPROGRESS; 3030 } else { 3031 SPDK_NOTICELOG("Unable to perform failover, already in progress.\n"); 3032 return -EBUSY; 3033 } 3034 } 3035 3036 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 3037 3038 if (nvme_ctrlr->reconnect_is_delayed) { 3039 SPDK_NOTICELOG("Reconnect is already scheduled.\n"); 3040 3041 /* We rely on the next reconnect for the failover. */ 3042 return -EALREADY; 3043 } 3044 3045 if (nvme_ctrlr->disabled) { 3046 SPDK_NOTICELOG("Controller is disabled.\n"); 3047 3048 /* We rely on the enablement for the failover. 
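 *
 * (If bdev_nvme_failover_trid() above selected an alternate trid, active_path_id has
 * already been updated, so the new trid is used when the controller is enabled again.)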
*/ 3049 return -EALREADY; 3050 } 3051 3052 nvme_ctrlr->resetting = true; 3053 nvme_ctrlr->in_failover = true; 3054 3055 assert(nvme_ctrlr->reset_start_tsc == 0); 3056 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 3057 3058 return 0; 3059 } 3060 3061 static int 3062 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 3063 { 3064 int rc; 3065 3066 pthread_mutex_lock(&nvme_ctrlr->mutex); 3067 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 3068 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3069 3070 if (rc == 0) { 3071 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 3072 } else if (rc == -EALREADY) { 3073 rc = 0; 3074 } 3075 3076 return rc; 3077 } 3078 3079 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3080 uint64_t num_blocks); 3081 3082 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3083 uint64_t num_blocks); 3084 3085 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 3086 uint64_t src_offset_blocks, 3087 uint64_t num_blocks); 3088 3089 static void 3090 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3091 bool success) 3092 { 3093 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3094 int ret; 3095 3096 if (!success) { 3097 ret = -EINVAL; 3098 goto exit; 3099 } 3100 3101 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 3102 ret = -ENXIO; 3103 goto exit; 3104 } 3105 3106 ret = bdev_nvme_readv(bio, 3107 bdev_io->u.bdev.iovs, 3108 bdev_io->u.bdev.iovcnt, 3109 bdev_io->u.bdev.md_buf, 3110 bdev_io->u.bdev.num_blocks, 3111 bdev_io->u.bdev.offset_blocks, 3112 bdev_io->u.bdev.dif_check_flags, 3113 bdev_io->u.bdev.memory_domain, 3114 bdev_io->u.bdev.memory_domain_ctx, 3115 bdev_io->u.bdev.accel_sequence); 3116 3117 exit: 3118 if (spdk_unlikely(ret != 0)) { 3119 bdev_nvme_io_complete(bio, ret); 3120 } 3121 } 3122 3123 static inline void 3124 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 3125 { 3126 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3127 struct spdk_bdev *bdev = bdev_io->bdev; 3128 struct nvme_bdev_io *nbdev_io_to_abort; 3129 int rc = 0; 3130 3131 switch (bdev_io->type) { 3132 case SPDK_BDEV_IO_TYPE_READ: 3133 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 3134 3135 rc = bdev_nvme_readv(nbdev_io, 3136 bdev_io->u.bdev.iovs, 3137 bdev_io->u.bdev.iovcnt, 3138 bdev_io->u.bdev.md_buf, 3139 bdev_io->u.bdev.num_blocks, 3140 bdev_io->u.bdev.offset_blocks, 3141 bdev_io->u.bdev.dif_check_flags, 3142 bdev_io->u.bdev.memory_domain, 3143 bdev_io->u.bdev.memory_domain_ctx, 3144 bdev_io->u.bdev.accel_sequence); 3145 } else { 3146 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3147 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3148 rc = 0; 3149 } 3150 break; 3151 case SPDK_BDEV_IO_TYPE_WRITE: 3152 rc = bdev_nvme_writev(nbdev_io, 3153 bdev_io->u.bdev.iovs, 3154 bdev_io->u.bdev.iovcnt, 3155 bdev_io->u.bdev.md_buf, 3156 bdev_io->u.bdev.num_blocks, 3157 bdev_io->u.bdev.offset_blocks, 3158 bdev_io->u.bdev.dif_check_flags, 3159 bdev_io->u.bdev.memory_domain, 3160 bdev_io->u.bdev.memory_domain_ctx, 3161 bdev_io->u.bdev.accel_sequence, 3162 bdev_io->u.bdev.nvme_cdw12, 3163 bdev_io->u.bdev.nvme_cdw13); 3164 break; 3165 case SPDK_BDEV_IO_TYPE_COMPARE: 3166 rc = bdev_nvme_comparev(nbdev_io, 3167 bdev_io->u.bdev.iovs, 3168 bdev_io->u.bdev.iovcnt, 3169 bdev_io->u.bdev.md_buf, 3170 bdev_io->u.bdev.num_blocks, 3171 
bdev_io->u.bdev.offset_blocks, 3172 bdev_io->u.bdev.dif_check_flags); 3173 break; 3174 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3175 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3176 bdev_io->u.bdev.iovs, 3177 bdev_io->u.bdev.iovcnt, 3178 bdev_io->u.bdev.fused_iovs, 3179 bdev_io->u.bdev.fused_iovcnt, 3180 bdev_io->u.bdev.md_buf, 3181 bdev_io->u.bdev.num_blocks, 3182 bdev_io->u.bdev.offset_blocks, 3183 bdev_io->u.bdev.dif_check_flags); 3184 break; 3185 case SPDK_BDEV_IO_TYPE_UNMAP: 3186 rc = bdev_nvme_unmap(nbdev_io, 3187 bdev_io->u.bdev.offset_blocks, 3188 bdev_io->u.bdev.num_blocks); 3189 break; 3190 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3191 rc = bdev_nvme_write_zeroes(nbdev_io, 3192 bdev_io->u.bdev.offset_blocks, 3193 bdev_io->u.bdev.num_blocks); 3194 break; 3195 case SPDK_BDEV_IO_TYPE_RESET: 3196 nbdev_io->io_path = NULL; 3197 bdev_nvme_reset_io(bdev->ctxt, nbdev_io); 3198 return; 3199 3200 case SPDK_BDEV_IO_TYPE_FLUSH: 3201 bdev_nvme_io_complete(nbdev_io, 0); 3202 return; 3203 3204 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3205 rc = bdev_nvme_zone_appendv(nbdev_io, 3206 bdev_io->u.bdev.iovs, 3207 bdev_io->u.bdev.iovcnt, 3208 bdev_io->u.bdev.md_buf, 3209 bdev_io->u.bdev.num_blocks, 3210 bdev_io->u.bdev.offset_blocks, 3211 bdev_io->u.bdev.dif_check_flags); 3212 break; 3213 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3214 rc = bdev_nvme_get_zone_info(nbdev_io, 3215 bdev_io->u.zone_mgmt.zone_id, 3216 bdev_io->u.zone_mgmt.num_zones, 3217 bdev_io->u.zone_mgmt.buf); 3218 break; 3219 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3220 rc = bdev_nvme_zone_management(nbdev_io, 3221 bdev_io->u.zone_mgmt.zone_id, 3222 bdev_io->u.zone_mgmt.zone_action); 3223 break; 3224 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3225 nbdev_io->io_path = NULL; 3226 bdev_nvme_admin_passthru(nbdev_ch, 3227 nbdev_io, 3228 &bdev_io->u.nvme_passthru.cmd, 3229 bdev_io->u.nvme_passthru.buf, 3230 bdev_io->u.nvme_passthru.nbytes); 3231 return; 3232 3233 case SPDK_BDEV_IO_TYPE_NVME_IO: 3234 rc = bdev_nvme_io_passthru(nbdev_io, 3235 &bdev_io->u.nvme_passthru.cmd, 3236 bdev_io->u.nvme_passthru.buf, 3237 bdev_io->u.nvme_passthru.nbytes); 3238 break; 3239 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3240 rc = bdev_nvme_io_passthru_md(nbdev_io, 3241 &bdev_io->u.nvme_passthru.cmd, 3242 bdev_io->u.nvme_passthru.buf, 3243 bdev_io->u.nvme_passthru.nbytes, 3244 bdev_io->u.nvme_passthru.md_buf, 3245 bdev_io->u.nvme_passthru.md_len); 3246 break; 3247 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3248 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3249 &bdev_io->u.nvme_passthru.cmd, 3250 bdev_io->u.nvme_passthru.iovs, 3251 bdev_io->u.nvme_passthru.iovcnt, 3252 bdev_io->u.nvme_passthru.nbytes, 3253 bdev_io->u.nvme_passthru.md_buf, 3254 bdev_io->u.nvme_passthru.md_len); 3255 break; 3256 case SPDK_BDEV_IO_TYPE_ABORT: 3257 nbdev_io->io_path = NULL; 3258 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3259 bdev_nvme_abort(nbdev_ch, 3260 nbdev_io, 3261 nbdev_io_to_abort); 3262 return; 3263 3264 case SPDK_BDEV_IO_TYPE_COPY: 3265 rc = bdev_nvme_copy(nbdev_io, 3266 bdev_io->u.bdev.offset_blocks, 3267 bdev_io->u.bdev.copy.src_offset_blocks, 3268 bdev_io->u.bdev.num_blocks); 3269 break; 3270 default: 3271 rc = -EINVAL; 3272 break; 3273 } 3274 3275 if (spdk_unlikely(rc != 0)) { 3276 bdev_nvme_io_complete(nbdev_io, rc); 3277 } 3278 } 3279 3280 static void 3281 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3282 { 3283 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3284 struct nvme_bdev_io *nbdev_io = (struct 
nvme_bdev_io *)bdev_io->driver_ctx; 3285 3286 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3287 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3288 } else { 3289 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3290 * We need to update submit_tsc here. 3291 */ 3292 nbdev_io->submit_tsc = spdk_get_ticks(); 3293 } 3294 3295 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3296 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3297 if (spdk_unlikely(!nbdev_io->io_path)) { 3298 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3299 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3300 return; 3301 } 3302 3303 /* Admin commands do not use the optimal I/O path. 3304 * Simply fall through even if it is not found. 3305 */ 3306 } 3307 3308 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3309 } 3310 3311 static bool 3312 bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi) 3313 { 3314 switch (csi) { 3315 case SPDK_NVME_CSI_NVM: 3316 return true; 3317 case SPDK_NVME_CSI_ZNS: 3318 return true; 3319 default: 3320 return false; 3321 } 3322 } 3323 3324 static bool 3325 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3326 { 3327 struct nvme_bdev *nbdev = ctx; 3328 struct nvme_ns *nvme_ns; 3329 struct spdk_nvme_ns *ns; 3330 struct spdk_nvme_ctrlr *ctrlr; 3331 const struct spdk_nvme_ctrlr_data *cdata; 3332 3333 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3334 assert(nvme_ns != NULL); 3335 ns = nvme_ns->ns; 3336 if (ns == NULL) { 3337 return false; 3338 } 3339 3340 if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) { 3341 switch (io_type) { 3342 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3343 case SPDK_BDEV_IO_TYPE_NVME_IO: 3344 return true; 3345 3346 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3347 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3348 3349 default: 3350 return false; 3351 } 3352 } 3353 3354 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3355 3356 switch (io_type) { 3357 case SPDK_BDEV_IO_TYPE_READ: 3358 case SPDK_BDEV_IO_TYPE_WRITE: 3359 case SPDK_BDEV_IO_TYPE_RESET: 3360 case SPDK_BDEV_IO_TYPE_FLUSH: 3361 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3362 case SPDK_BDEV_IO_TYPE_NVME_IO: 3363 case SPDK_BDEV_IO_TYPE_ABORT: 3364 return true; 3365 3366 case SPDK_BDEV_IO_TYPE_COMPARE: 3367 return spdk_nvme_ns_supports_compare(ns); 3368 3369 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3370 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3371 3372 case SPDK_BDEV_IO_TYPE_UNMAP: 3373 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3374 return cdata->oncs.dsm; 3375 3376 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3377 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3378 return cdata->oncs.write_zeroes; 3379 3380 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3381 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3382 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3383 return true; 3384 } 3385 return false; 3386 3387 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3388 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3389 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3390 3391 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3392 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3393 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3394 3395 case SPDK_BDEV_IO_TYPE_COPY: 3396 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3397 return cdata->oncs.copy; 3398 3399 default: 3400 return false; 3401 } 3402 } 3403 3404 static int 3405 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3406 { 3407 struct nvme_qpair *nvme_qpair; 3408 struct spdk_io_channel *pg_ch; 3409 int rc; 3410 3411 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3412 if (!nvme_qpair) { 3413 SPDK_ERRLOG("Failed to alloc nvme_qpair.\n"); 3414 return -1; 3415 } 3416 3417 TAILQ_INIT(&nvme_qpair->io_path_list); 3418 3419 nvme_qpair->ctrlr = nvme_ctrlr; 3420 nvme_qpair->ctrlr_ch = ctrlr_ch; 3421 3422 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3423 if (!pg_ch) { 3424 free(nvme_qpair); 3425 return -1; 3426 } 3427 3428 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3429 3430 #ifdef SPDK_CONFIG_VTUNE 3431 nvme_qpair->group->collect_spin_stat = true; 3432 #else 3433 nvme_qpair->group->collect_spin_stat = false; 3434 #endif 3435 3436 if (!nvme_ctrlr->disabled) { 3437 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3438 * be created when it's enabled. 3439 */ 3440 rc = bdev_nvme_create_qpair(nvme_qpair); 3441 if (rc != 0) { 3442 /* nvme_ctrlr can't create IO qpair if connection is down. 3443 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3444 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3445 * submitted IO will be queued until IO qpair is successfully created. 3446 * 3447 * Hence, if both are satisfied, ignore the failure. 
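 *
 * (That is the condition checked right below: qpair creation is only failed outright
 * when reconnect_delay_sec or bdev_retry_count is zero, because then neither a delayed
 * reconnect nor a bdev-level retry would recover the queued I/O.)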
3448 */ 3449 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3450 spdk_put_io_channel(pg_ch); 3451 free(nvme_qpair); 3452 return rc; 3453 } 3454 } 3455 } 3456 3457 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3458 3459 ctrlr_ch->qpair = nvme_qpair; 3460 3461 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3462 nvme_qpair->ctrlr->ref++; 3463 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3464 3465 return 0; 3466 } 3467 3468 static int 3469 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3470 { 3471 struct nvme_ctrlr *nvme_ctrlr = io_device; 3472 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3473 3474 TAILQ_INIT(&ctrlr_ch->pending_resets); 3475 3476 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3477 } 3478 3479 static void 3480 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3481 { 3482 struct nvme_io_path *io_path, *next; 3483 3484 assert(nvme_qpair->group != NULL); 3485 3486 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3487 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3488 nvme_io_path_free(io_path); 3489 } 3490 3491 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3492 3493 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3494 3495 nvme_ctrlr_release(nvme_qpair->ctrlr); 3496 3497 free(nvme_qpair); 3498 } 3499 3500 static void 3501 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3502 { 3503 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3504 struct nvme_qpair *nvme_qpair; 3505 3506 nvme_qpair = ctrlr_ch->qpair; 3507 assert(nvme_qpair != NULL); 3508 3509 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3510 3511 if (nvme_qpair->qpair != NULL) { 3512 if (ctrlr_ch->reset_iter == NULL) { 3513 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3514 } else { 3515 /* Skip current ctrlr_channel in a full reset sequence because 3516 * it is being deleted now. The qpair is already being disconnected. 3517 * We do not have to restart disconnecting it. 3518 */ 3519 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3520 } 3521 3522 /* We cannot release a reference to the poll group now. 3523 * The qpair may be disconnected asynchronously later. 3524 * We need to poll it until it is actually disconnected. 3525 * Just detach the qpair from the deleting ctrlr_channel. 
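 *
 * (The detached nvme_qpair is freed later by the poll group's disconnected-qpair
 * callback, which calls nvme_qpair_delete() once the qpair has actually finished
 * disconnecting and no ctrlr_channel references it anymore.)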
3526 */ 3527 nvme_qpair->ctrlr_ch = NULL; 3528 } else { 3529 assert(ctrlr_ch->reset_iter == NULL); 3530 3531 nvme_qpair_delete(nvme_qpair); 3532 } 3533 } 3534 3535 static inline struct spdk_io_channel * 3536 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3537 { 3538 if (spdk_unlikely(!group->accel_channel)) { 3539 group->accel_channel = spdk_accel_get_io_channel(); 3540 if (!group->accel_channel) { 3541 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3542 group); 3543 return NULL; 3544 } 3545 } 3546 3547 return group->accel_channel; 3548 } 3549 3550 static void 3551 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3552 { 3553 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3554 } 3555 3556 static void 3557 bdev_nvme_abort_sequence(void *seq) 3558 { 3559 spdk_accel_sequence_abort(seq); 3560 } 3561 3562 static void 3563 bdev_nvme_reverse_sequence(void *seq) 3564 { 3565 spdk_accel_sequence_reverse(seq); 3566 } 3567 3568 static int 3569 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3570 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3571 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3572 { 3573 struct spdk_io_channel *ch; 3574 struct nvme_poll_group *group = ctx; 3575 3576 ch = bdev_nvme_get_accel_channel(group); 3577 if (spdk_unlikely(ch == NULL)) { 3578 return -ENOMEM; 3579 } 3580 3581 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3582 domain, domain_ctx, seed, cb_fn, cb_arg); 3583 } 3584 3585 static int 3586 bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt, 3587 struct spdk_memory_domain *dst_domain, void *dst_domain_ctx, 3588 struct iovec *src_iovs, uint32_t src_iovcnt, 3589 struct spdk_memory_domain *src_domain, void *src_domain_ctx, 3590 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3591 { 3592 struct spdk_io_channel *ch; 3593 struct nvme_poll_group *group = ctx; 3594 3595 ch = bdev_nvme_get_accel_channel(group); 3596 if (spdk_unlikely(ch == NULL)) { 3597 return -ENOMEM; 3598 } 3599 3600 return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch, 3601 dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx, 3602 src_iovs, src_iovcnt, src_domain, src_domain_ctx, 3603 cb_fn, cb_arg); 3604 } 3605 3606 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3607 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3608 .append_crc32c = bdev_nvme_append_crc32c, 3609 .append_copy = bdev_nvme_append_copy, 3610 .finish_sequence = bdev_nvme_finish_sequence, 3611 .reverse_sequence = bdev_nvme_reverse_sequence, 3612 .abort_sequence = bdev_nvme_abort_sequence, 3613 }; 3614 3615 static int 3616 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3617 { 3618 struct nvme_poll_group *group = ctx_buf; 3619 3620 TAILQ_INIT(&group->qpair_list); 3621 3622 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3623 if (group->group == NULL) { 3624 return -1; 3625 } 3626 3627 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3628 3629 if (group->poller == NULL) { 3630 spdk_nvme_poll_group_destroy(group->group); 3631 return -1; 3632 } 3633 3634 return 0; 3635 } 3636 3637 static void 3638 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3639 { 3640 struct nvme_poll_group *group = ctx_buf; 3641 3642 assert(TAILQ_EMPTY(&group->qpair_list)); 3643 3644 if 
(group->accel_channel) { 3645 spdk_put_io_channel(group->accel_channel); 3646 } 3647 3648 spdk_poller_unregister(&group->poller); 3649 if (spdk_nvme_poll_group_destroy(group->group)) { 3650 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3651 assert(false); 3652 } 3653 } 3654 3655 static struct spdk_io_channel * 3656 bdev_nvme_get_io_channel(void *ctx) 3657 { 3658 struct nvme_bdev *nvme_bdev = ctx; 3659 3660 return spdk_get_io_channel(nvme_bdev); 3661 } 3662 3663 static void * 3664 bdev_nvme_get_module_ctx(void *ctx) 3665 { 3666 struct nvme_bdev *nvme_bdev = ctx; 3667 struct nvme_ns *nvme_ns; 3668 3669 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3670 return NULL; 3671 } 3672 3673 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3674 if (!nvme_ns) { 3675 return NULL; 3676 } 3677 3678 return nvme_ns->ns; 3679 } 3680 3681 static const char * 3682 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3683 { 3684 switch (ana_state) { 3685 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3686 return "optimized"; 3687 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3688 return "non_optimized"; 3689 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3690 return "inaccessible"; 3691 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3692 return "persistent_loss"; 3693 case SPDK_NVME_ANA_CHANGE_STATE: 3694 return "change"; 3695 default: 3696 return NULL; 3697 } 3698 } 3699 3700 static int 3701 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3702 { 3703 struct spdk_memory_domain **_domains = NULL; 3704 struct nvme_bdev *nbdev = ctx; 3705 struct nvme_ns *nvme_ns; 3706 int i = 0, _array_size = array_size; 3707 int rc = 0; 3708 3709 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3710 if (domains && array_size >= i) { 3711 _domains = &domains[i]; 3712 } else { 3713 _domains = NULL; 3714 } 3715 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3716 if (rc > 0) { 3717 i += rc; 3718 if (_array_size >= rc) { 3719 _array_size -= rc; 3720 } else { 3721 _array_size = 0; 3722 } 3723 } else if (rc < 0) { 3724 return rc; 3725 } 3726 } 3727 3728 return i; 3729 } 3730 3731 static const char * 3732 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3733 { 3734 if (nvme_ctrlr->destruct) { 3735 return "deleting"; 3736 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3737 return "failed"; 3738 } else if (nvme_ctrlr->resetting) { 3739 return "resetting"; 3740 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3741 return "reconnect_is_delayed"; 3742 } else if (nvme_ctrlr->disabled) { 3743 return "disabled"; 3744 } else { 3745 return "enabled"; 3746 } 3747 } 3748 3749 void 3750 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3751 { 3752 struct spdk_nvme_transport_id *trid; 3753 const struct spdk_nvme_ctrlr_opts *opts; 3754 const struct spdk_nvme_ctrlr_data *cdata; 3755 struct nvme_path_id *path_id; 3756 int32_t numa_id; 3757 3758 spdk_json_write_object_begin(w); 3759 3760 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3761 3762 #ifdef SPDK_CONFIG_NVME_CUSE 3763 size_t cuse_name_size = 128; 3764 char cuse_name[cuse_name_size]; 3765 3766 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3767 if (rc == 0) { 3768 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3769 } 3770 #endif 3771 trid = &nvme_ctrlr->active_path_id->trid; 3772 spdk_json_write_named_object_begin(w, "trid"); 3773 nvme_bdev_dump_trid_json(trid, 
w); 3774 spdk_json_write_object_end(w); 3775 3776 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3777 if (path_id != NULL) { 3778 spdk_json_write_named_array_begin(w, "alternate_trids"); 3779 do { 3780 trid = &path_id->trid; 3781 spdk_json_write_object_begin(w); 3782 nvme_bdev_dump_trid_json(trid, w); 3783 spdk_json_write_object_end(w); 3784 3785 path_id = TAILQ_NEXT(path_id, link); 3786 } while (path_id != NULL); 3787 spdk_json_write_array_end(w); 3788 } 3789 3790 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3791 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3792 3793 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3794 spdk_json_write_named_object_begin(w, "host"); 3795 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3796 spdk_json_write_named_string(w, "addr", opts->src_addr); 3797 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3798 spdk_json_write_object_end(w); 3799 3800 numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr); 3801 if (numa_id != SPDK_ENV_NUMA_ID_ANY) { 3802 spdk_json_write_named_uint32(w, "numa_id", numa_id); 3803 } 3804 spdk_json_write_object_end(w); 3805 } 3806 3807 static void 3808 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3809 struct nvme_ns *nvme_ns) 3810 { 3811 struct spdk_nvme_ns *ns; 3812 struct spdk_nvme_ctrlr *ctrlr; 3813 const struct spdk_nvme_ctrlr_data *cdata; 3814 const struct spdk_nvme_transport_id *trid; 3815 union spdk_nvme_vs_register vs; 3816 const struct spdk_nvme_ns_data *nsdata; 3817 char buf[128]; 3818 3819 ns = nvme_ns->ns; 3820 if (ns == NULL) { 3821 return; 3822 } 3823 3824 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3825 3826 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3827 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3828 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3829 3830 spdk_json_write_object_begin(w); 3831 3832 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3833 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3834 } 3835 3836 spdk_json_write_named_object_begin(w, "trid"); 3837 3838 nvme_bdev_dump_trid_json(trid, w); 3839 3840 spdk_json_write_object_end(w); 3841 3842 #ifdef SPDK_CONFIG_NVME_CUSE 3843 size_t cuse_name_size = 128; 3844 char cuse_name[cuse_name_size]; 3845 3846 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3847 cuse_name, &cuse_name_size); 3848 if (rc == 0) { 3849 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3850 } 3851 #endif 3852 3853 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3854 3855 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3856 3857 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3858 3859 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3860 spdk_str_trim(buf); 3861 spdk_json_write_named_string(w, "model_number", buf); 3862 3863 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3864 spdk_str_trim(buf); 3865 spdk_json_write_named_string(w, "serial_number", buf); 3866 3867 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3868 spdk_str_trim(buf); 3869 spdk_json_write_named_string(w, "firmware_revision", buf); 3870 3871 if (cdata->subnqn[0] != '\0') { 3872 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3873 } 3874 3875 spdk_json_write_named_object_begin(w, "oacs"); 3876 3877 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3878 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 3879 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 3880 spdk_json_write_named_uint32(w, "ns_manage", 
cdata->oacs.ns_manage); 3881 3882 spdk_json_write_object_end(w); 3883 3884 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 3885 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 3886 3887 spdk_json_write_object_end(w); 3888 3889 spdk_json_write_named_object_begin(w, "vs"); 3890 3891 spdk_json_write_name(w, "nvme_version"); 3892 if (vs.bits.ter) { 3893 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 3894 } else { 3895 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 3896 } 3897 3898 spdk_json_write_object_end(w); 3899 3900 nsdata = spdk_nvme_ns_get_data(ns); 3901 3902 spdk_json_write_named_object_begin(w, "ns_data"); 3903 3904 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 3905 3906 if (cdata->cmic.ana_reporting) { 3907 spdk_json_write_named_string(w, "ana_state", 3908 _nvme_ana_state_str(nvme_ns->ana_state)); 3909 } 3910 3911 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 3912 3913 spdk_json_write_object_end(w); 3914 3915 if (cdata->oacs.security) { 3916 spdk_json_write_named_object_begin(w, "security"); 3917 3918 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 3919 3920 spdk_json_write_object_end(w); 3921 } 3922 3923 spdk_json_write_object_end(w); 3924 } 3925 3926 static const char * 3927 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 3928 { 3929 switch (nbdev->mp_policy) { 3930 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 3931 return "active_passive"; 3932 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 3933 return "active_active"; 3934 default: 3935 assert(false); 3936 return "invalid"; 3937 } 3938 } 3939 3940 static const char * 3941 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 3942 { 3943 switch (nbdev->mp_selector) { 3944 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 3945 return "round_robin"; 3946 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 3947 return "queue_depth"; 3948 default: 3949 assert(false); 3950 return "invalid"; 3951 } 3952 } 3953 3954 static int 3955 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 3956 { 3957 struct nvme_bdev *nvme_bdev = ctx; 3958 struct nvme_ns *nvme_ns; 3959 3960 pthread_mutex_lock(&nvme_bdev->mutex); 3961 spdk_json_write_named_array_begin(w, "nvme"); 3962 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 3963 nvme_namespace_info_json(w, nvme_ns); 3964 } 3965 spdk_json_write_array_end(w); 3966 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 3967 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 3968 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 3969 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 3970 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 3971 } 3972 } 3973 pthread_mutex_unlock(&nvme_bdev->mutex); 3974 3975 return 0; 3976 } 3977 3978 static void 3979 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3980 { 3981 /* No config per bdev needed */ 3982 } 3983 3984 static uint64_t 3985 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 3986 { 3987 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3988 struct nvme_io_path *io_path; 3989 struct nvme_poll_group *group; 3990 uint64_t spin_time = 0; 3991 3992 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 3993 group = io_path->qpair->group; 3994 3995 if (!group || !group->collect_spin_stat) { 3996 continue; 3997 } 3998 3999 if (group->end_ticks 
!= 0) { 4000 group->spin_ticks += (group->end_ticks - group->start_ticks); 4001 group->end_ticks = 0; 4002 } 4003 4004 spin_time += group->spin_ticks; 4005 group->start_ticks = 0; 4006 group->spin_ticks = 0; 4007 } 4008 4009 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 4010 } 4011 4012 static void 4013 bdev_nvme_reset_device_stat(void *ctx) 4014 { 4015 struct nvme_bdev *nbdev = ctx; 4016 4017 if (nbdev->err_stat != NULL) { 4018 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 4019 } 4020 } 4021 4022 /* JSON string should be lowercases and underscore delimited string. */ 4023 static void 4024 bdev_nvme_format_nvme_status(char *dst, const char *src) 4025 { 4026 char tmp[256]; 4027 4028 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 4029 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 4030 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 4031 spdk_strlwr(dst); 4032 } 4033 4034 static void 4035 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 4036 { 4037 struct nvme_bdev *nbdev = ctx; 4038 struct spdk_nvme_status status = {}; 4039 uint16_t sct, sc; 4040 char status_json[256]; 4041 const char *status_str; 4042 4043 if (nbdev->err_stat == NULL) { 4044 return; 4045 } 4046 4047 spdk_json_write_named_object_begin(w, "nvme_error"); 4048 4049 spdk_json_write_named_object_begin(w, "status_type"); 4050 for (sct = 0; sct < 8; sct++) { 4051 if (nbdev->err_stat->status_type[sct] == 0) { 4052 continue; 4053 } 4054 status.sct = sct; 4055 4056 status_str = spdk_nvme_cpl_get_status_type_string(&status); 4057 assert(status_str != NULL); 4058 bdev_nvme_format_nvme_status(status_json, status_str); 4059 4060 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 4061 } 4062 spdk_json_write_object_end(w); 4063 4064 spdk_json_write_named_object_begin(w, "status_code"); 4065 for (sct = 0; sct < 4; sct++) { 4066 status.sct = sct; 4067 for (sc = 0; sc < 256; sc++) { 4068 if (nbdev->err_stat->status[sct][sc] == 0) { 4069 continue; 4070 } 4071 status.sc = sc; 4072 4073 status_str = spdk_nvme_cpl_get_status_string(&status); 4074 assert(status_str != NULL); 4075 bdev_nvme_format_nvme_status(status_json, status_str); 4076 4077 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 4078 } 4079 } 4080 spdk_json_write_object_end(w); 4081 4082 spdk_json_write_object_end(w); 4083 } 4084 4085 static bool 4086 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 4087 { 4088 struct nvme_bdev *nbdev = ctx; 4089 struct spdk_nvme_ctrlr *ctrlr; 4090 4091 if (!g_opts.allow_accel_sequence) { 4092 return false; 4093 } 4094 4095 switch (type) { 4096 case SPDK_BDEV_IO_TYPE_WRITE: 4097 case SPDK_BDEV_IO_TYPE_READ: 4098 break; 4099 default: 4100 return false; 4101 } 4102 4103 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 4104 assert(ctrlr != NULL); 4105 4106 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 4107 } 4108 4109 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 4110 .destruct = bdev_nvme_destruct, 4111 .submit_request = bdev_nvme_submit_request, 4112 .io_type_supported = bdev_nvme_io_type_supported, 4113 .get_io_channel = bdev_nvme_get_io_channel, 4114 .dump_info_json = bdev_nvme_dump_info_json, 4115 .write_config_json = bdev_nvme_write_config_json, 4116 .get_spin_time = bdev_nvme_get_spin_time, 4117 .get_module_ctx = bdev_nvme_get_module_ctx, 4118 .get_memory_domains = bdev_nvme_get_memory_domains, 4119 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 4120 
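/* The two device-stat callbacks below only have an effect when the nvme_error_stat option is enabled, since nbdev->err_stat is allocated only in that case. */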
.reset_device_stat = bdev_nvme_reset_device_stat, 4121 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 4122 }; 4123 4124 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 4125 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 4126 4127 static int 4128 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4129 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 4130 { 4131 struct spdk_nvme_ana_group_descriptor *copied_desc; 4132 uint8_t *orig_desc; 4133 uint32_t i, desc_size, copy_len; 4134 int rc = 0; 4135 4136 if (nvme_ctrlr->ana_log_page == NULL) { 4137 return -EINVAL; 4138 } 4139 4140 copied_desc = nvme_ctrlr->copied_ana_desc; 4141 4142 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 4143 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 4144 4145 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 4146 memcpy(copied_desc, orig_desc, copy_len); 4147 4148 rc = cb_fn(copied_desc, cb_arg); 4149 if (rc != 0) { 4150 break; 4151 } 4152 4153 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 4154 copied_desc->num_of_nsid * sizeof(uint32_t); 4155 orig_desc += desc_size; 4156 copy_len -= desc_size; 4157 } 4158 4159 return rc; 4160 } 4161 4162 static int 4163 nvme_ns_ana_transition_timedout(void *ctx) 4164 { 4165 struct nvme_ns *nvme_ns = ctx; 4166 4167 spdk_poller_unregister(&nvme_ns->anatt_timer); 4168 nvme_ns->ana_transition_timedout = true; 4169 4170 return SPDK_POLLER_BUSY; 4171 } 4172 4173 static void 4174 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4175 const struct spdk_nvme_ana_group_descriptor *desc) 4176 { 4177 const struct spdk_nvme_ctrlr_data *cdata; 4178 4179 nvme_ns->ana_group_id = desc->ana_group_id; 4180 nvme_ns->ana_state = desc->ana_state; 4181 nvme_ns->ana_state_updating = false; 4182 4183 switch (nvme_ns->ana_state) { 4184 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4185 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4186 nvme_ns->ana_transition_timedout = false; 4187 spdk_poller_unregister(&nvme_ns->anatt_timer); 4188 break; 4189 4190 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4191 case SPDK_NVME_ANA_CHANGE_STATE: 4192 if (nvme_ns->anatt_timer != NULL) { 4193 break; 4194 } 4195 4196 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4197 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4198 nvme_ns, 4199 cdata->anatt * SPDK_SEC_TO_USEC); 4200 break; 4201 default: 4202 break; 4203 } 4204 } 4205 4206 static int 4207 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4208 { 4209 struct nvme_ns *nvme_ns = cb_arg; 4210 uint32_t i; 4211 4212 assert(nvme_ns->ns != NULL); 4213 4214 for (i = 0; i < desc->num_of_nsid; i++) { 4215 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4216 continue; 4217 } 4218 4219 _nvme_ns_set_ana_state(nvme_ns, desc); 4220 return 1; 4221 } 4222 4223 return 0; 4224 } 4225 4226 static int 4227 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4228 { 4229 int rc = 0; 4230 struct spdk_uuid new_uuid, namespace_uuid; 4231 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4232 /* This namespace UUID was generated using uuid_generate() method. 
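* It is used as the fixed namespace input for the SHA-1 based UUID generation below, so a given serial number and NSID pair always yields the same bdev UUID.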
*/ 4233 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4234 int size; 4235 4236 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4237 4238 spdk_uuid_set_null(&new_uuid); 4239 spdk_uuid_set_null(&namespace_uuid); 4240 4241 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4242 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4243 return -EINVAL; 4244 } 4245 4246 spdk_uuid_parse(&namespace_uuid, namespace_str); 4247 4248 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4249 if (rc == 0) { 4250 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4251 } 4252 4253 return rc; 4254 } 4255 4256 static int 4257 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4258 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4259 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx) 4260 { 4261 const struct spdk_uuid *uuid; 4262 const uint8_t *nguid; 4263 const struct spdk_nvme_ctrlr_data *cdata; 4264 const struct spdk_nvme_ns_data *nsdata; 4265 const struct spdk_nvme_ctrlr_opts *opts; 4266 enum spdk_nvme_csi csi; 4267 uint32_t atomic_bs, phys_bs, bs; 4268 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4269 int rc; 4270 4271 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4272 csi = spdk_nvme_ns_get_csi(ns); 4273 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4274 4275 switch (csi) { 4276 case SPDK_NVME_CSI_NVM: 4277 disk->product_name = "NVMe disk"; 4278 break; 4279 case SPDK_NVME_CSI_ZNS: 4280 disk->product_name = "NVMe ZNS disk"; 4281 disk->zoned = true; 4282 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4283 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4284 spdk_nvme_ns_get_extended_sector_size(ns); 4285 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4286 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4287 break; 4288 default: 4289 if (bdev_opts->allow_unrecognized_csi) { 4290 disk->product_name = "NVMe Passthrough disk"; 4291 break; 4292 } 4293 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4294 return -ENOTSUP; 4295 } 4296 4297 nguid = spdk_nvme_ns_get_nguid(ns); 4298 if (!nguid) { 4299 uuid = spdk_nvme_ns_get_uuid(ns); 4300 if (uuid) { 4301 disk->uuid = *uuid; 4302 } else if (g_opts.generate_uuids) { 4303 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4304 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4305 if (rc < 0) { 4306 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4307 return rc; 4308 } 4309 } 4310 } else { 4311 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4312 } 4313 4314 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4315 if (!disk->name) { 4316 return -ENOMEM; 4317 } 4318 4319 disk->write_cache = 0; 4320 if (cdata->vwc.present) { 4321 /* Enable if the Volatile Write Cache exists */ 4322 disk->write_cache = 1; 4323 } 4324 if (cdata->oncs.write_zeroes) { 4325 disk->max_write_zeroes = UINT16_MAX + 1; 4326 } 4327 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4328 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4329 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4330 disk->ctratt.raw = cdata->ctratt.raw; 4331 /* NVMe driver will split one request into multiple requests 4332 * based on MDTS and stripe boundary, the bdev layer will use 4333 * max_segment_size and max_num_segments to split one big IO 4334 * into multiple requests, then small request can't run out 4335 * of NVMe internal 
requests data structure. 4336 */ 4337 if (opts && opts->io_queue_requests) { 4338 disk->max_num_segments = opts->io_queue_requests / 2; 4339 } 4340 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4341 /* The nvme driver will try to split I/O that have too many 4342 * SGEs, but it doesn't work if that last SGE doesn't end on 4343 * an aggregate total that is block aligned. The bdev layer has 4344 * a more robust splitting framework, so use that instead for 4345 * this case. (See issue #3269.) 4346 */ 4347 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4348 4349 if (disk->max_num_segments == 0) { 4350 disk->max_num_segments = max_sges; 4351 } else { 4352 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4353 } 4354 } 4355 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4356 4357 nsdata = spdk_nvme_ns_get_data(ns); 4358 bs = spdk_nvme_ns_get_sector_size(ns); 4359 atomic_bs = bs; 4360 phys_bs = bs; 4361 if (nsdata->nabo == 0) { 4362 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4363 atomic_bs = bs * (1 + nsdata->nawupf); 4364 } else { 4365 atomic_bs = bs * (1 + cdata->awupf); 4366 } 4367 } 4368 if (nsdata->nsfeat.optperf) { 4369 phys_bs = bs * (1 + nsdata->npwg); 4370 } 4371 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4372 4373 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4374 if (disk->md_len != 0) { 4375 disk->md_interleave = nsdata->flbas.extended; 4376 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4377 if (disk->dif_type != SPDK_DIF_DISABLE) { 4378 disk->dif_is_head_of_md = nsdata->dps.md_start; 4379 disk->dif_check_flags = bdev_opts->prchk_flags; 4380 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4381 } 4382 } 4383 4384 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4385 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4386 disk->acwu = 0; 4387 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4388 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4389 } else { 4390 disk->acwu = cdata->acwu + 1; /* 0-based */ 4391 } 4392 4393 if (cdata->oncs.copy) { 4394 /* For now bdev interface allows only single segment copy */ 4395 disk->max_copy = nsdata->mssrl; 4396 } 4397 4398 disk->ctxt = ctx; 4399 disk->fn_table = &nvmelib_fn_table; 4400 disk->module = &nvme_if; 4401 4402 disk->numa.id_valid = 1; 4403 disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 4404 4405 return 0; 4406 } 4407 4408 static struct nvme_bdev * 4409 nvme_bdev_alloc(void) 4410 { 4411 struct nvme_bdev *bdev; 4412 int rc; 4413 4414 bdev = calloc(1, sizeof(*bdev)); 4415 if (!bdev) { 4416 SPDK_ERRLOG("bdev calloc() failed\n"); 4417 return NULL; 4418 } 4419 4420 if (g_opts.nvme_error_stat) { 4421 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4422 if (!bdev->err_stat) { 4423 SPDK_ERRLOG("err_stat calloc() failed\n"); 4424 free(bdev); 4425 return NULL; 4426 } 4427 } 4428 4429 rc = pthread_mutex_init(&bdev->mutex, NULL); 4430 if (rc != 0) { 4431 free(bdev->err_stat); 4432 free(bdev); 4433 return NULL; 4434 } 4435 4436 bdev->ref = 1; 4437 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4438 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4439 bdev->rr_min_io = UINT32_MAX; 4440 TAILQ_INIT(&bdev->nvme_ns_list); 4441 4442 return bdev; 4443 } 4444 4445 static int 4446 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4447 { 4448 struct nvme_bdev *bdev; 4449 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4450 int rc; 4451 4452 bdev = 
nvme_bdev_alloc(); 4453 if (bdev == NULL) { 4454 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4455 return -ENOMEM; 4456 } 4457 4458 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4459 4460 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4461 nvme_ns->ns, &nvme_ctrlr->opts, bdev); 4462 if (rc != 0) { 4463 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4464 nvme_bdev_free(bdev); 4465 return rc; 4466 } 4467 4468 spdk_io_device_register(bdev, 4469 bdev_nvme_create_bdev_channel_cb, 4470 bdev_nvme_destroy_bdev_channel_cb, 4471 sizeof(struct nvme_bdev_channel), 4472 bdev->disk.name); 4473 4474 nvme_ns->bdev = bdev; 4475 bdev->nsid = nvme_ns->id; 4476 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4477 4478 bdev->nbdev_ctrlr = nbdev_ctrlr; 4479 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4480 4481 rc = spdk_bdev_register(&bdev->disk); 4482 if (rc != 0) { 4483 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4484 spdk_io_device_unregister(bdev, NULL); 4485 nvme_ns->bdev = NULL; 4486 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4487 nvme_bdev_free(bdev); 4488 return rc; 4489 } 4490 4491 return 0; 4492 } 4493 4494 static bool 4495 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4496 { 4497 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4498 const struct spdk_uuid *uuid1, *uuid2; 4499 4500 nsdata1 = spdk_nvme_ns_get_data(ns1); 4501 nsdata2 = spdk_nvme_ns_get_data(ns2); 4502 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4503 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4504 4505 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4506 nsdata1->eui64 == nsdata2->eui64 && 4507 ((uuid1 == NULL && uuid2 == NULL) || 4508 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4509 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4510 } 4511 4512 static bool 4513 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4514 struct spdk_nvme_ctrlr_opts *opts) 4515 { 4516 struct nvme_probe_skip_entry *entry; 4517 4518 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4519 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4520 return false; 4521 } 4522 } 4523 4524 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4525 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4526 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4527 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4528 opts->disable_read_ana_log_page = true; 4529 4530 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4531 4532 return true; 4533 } 4534 4535 static void 4536 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4537 { 4538 struct nvme_ctrlr *nvme_ctrlr = ctx; 4539 4540 if (spdk_nvme_cpl_is_error(cpl)) { 4541 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 4542 cpl->status.sct); 4543 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4544 } else if (cpl->cdw0 & 0x1) { 4545 SPDK_WARNLOG("Specified command could not be aborted.\n"); 4546 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4547 } 4548 } 4549 4550 static void 4551 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4552 struct spdk_nvme_qpair *qpair, uint16_t cid) 4553 { 4554 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4555 union spdk_nvme_csts_register csts; 4556 int rc; 4557 4558 assert(nvme_ctrlr->ctrlr == ctrlr); 4559 4560 SPDK_WARNLOG("Warning: Detected a timeout. 
ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 4561 4562 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4563 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4564 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4565 * completion recursively. 4566 */ 4567 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4568 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4569 if (csts.bits.cfs) { 4570 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 4571 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4572 return; 4573 } 4574 } 4575 4576 switch (g_opts.action_on_timeout) { 4577 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4578 if (qpair) { 4579 /* Don't send abort to ctrlr when ctrlr is not available. */ 4580 pthread_mutex_lock(&nvme_ctrlr->mutex); 4581 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4582 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4583 SPDK_NOTICELOG("Quit abort. Ctrlr is not available.\n"); 4584 return; 4585 } 4586 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4587 4588 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4589 nvme_abort_cpl, nvme_ctrlr); 4590 if (rc == 0) { 4591 return; 4592 } 4593 4594 SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc); 4595 } 4596 4597 /* FALLTHROUGH */ 4598 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4599 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4600 break; 4601 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4602 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 4603 break; 4604 default: 4605 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 4606 break; 4607 } 4608 } 4609 4610 static struct nvme_ns * 4611 nvme_ns_alloc(void) 4612 { 4613 struct nvme_ns *nvme_ns; 4614 4615 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4616 if (nvme_ns == NULL) { 4617 return NULL; 4618 } 4619 4620 if (g_opts.io_path_stat) { 4621 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4622 if (nvme_ns->stat == NULL) { 4623 free(nvme_ns); 4624 return NULL; 4625 } 4626 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4627 } 4628 4629 return nvme_ns; 4630 } 4631 4632 static void 4633 nvme_ns_free(struct nvme_ns *nvme_ns) 4634 { 4635 free(nvme_ns->stat); 4636 free(nvme_ns); 4637 } 4638 4639 static void 4640 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4641 { 4642 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4643 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4644 4645 if (rc == 0) { 4646 nvme_ns->probe_ctx = NULL; 4647 pthread_mutex_lock(&nvme_ctrlr->mutex); 4648 nvme_ctrlr->ref++; 4649 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4650 } else { 4651 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4652 nvme_ns_free(nvme_ns); 4653 } 4654 4655 if (ctx) { 4656 ctx->populates_in_progress--; 4657 if (ctx->populates_in_progress == 0) { 4658 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4659 } 4660 } 4661 } 4662 4663 static void 4664 bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i, 4665 struct nvme_bdev *nbdev, 4666 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4667 { 4668 struct nvme_ns *nvme_ns = ctx; 4669 int rc; 4670 4671 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4672 if (rc != 0) { 4673 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4674 } 4675 4676 nvme_bdev_for_each_channel_continue(i, rc); 4677 } 4678 4679 static void 4680 bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i, 4681 struct nvme_bdev *nbdev, 4682 
struct nvme_bdev_channel *nbdev_ch, void *ctx) 4683 { 4684 struct nvme_ns *nvme_ns = ctx; 4685 struct nvme_io_path *io_path; 4686 4687 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4688 if (io_path != NULL) { 4689 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4690 } 4691 4692 nvme_bdev_for_each_channel_continue(i, 0); 4693 } 4694 4695 static void 4696 bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status) 4697 { 4698 struct nvme_ns *nvme_ns = ctx; 4699 4700 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4701 } 4702 4703 static void 4704 bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4705 { 4706 struct nvme_ns *nvme_ns = ctx; 4707 4708 if (status == 0) { 4709 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4710 } else { 4711 /* Delete the added io_paths and fail populating the namespace. */ 4712 nvme_bdev_for_each_channel(nbdev, 4713 bdev_nvme_delete_io_path, 4714 nvme_ns, 4715 bdev_nvme_add_io_path_failed); 4716 } 4717 } 4718 4719 static int 4720 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4721 { 4722 struct nvme_ns *tmp_ns; 4723 const struct spdk_nvme_ns_data *nsdata; 4724 4725 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4726 if (!nsdata->nmic.can_share) { 4727 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4728 return -EINVAL; 4729 } 4730 4731 pthread_mutex_lock(&bdev->mutex); 4732 4733 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4734 assert(tmp_ns != NULL); 4735 4736 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4737 pthread_mutex_unlock(&bdev->mutex); 4738 SPDK_ERRLOG("Namespaces are not identical.\n"); 4739 return -EINVAL; 4740 } 4741 4742 bdev->ref++; 4743 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4744 nvme_ns->bdev = bdev; 4745 4746 pthread_mutex_unlock(&bdev->mutex); 4747 4748 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
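* Each existing nvme_bdev_channel gets an io_path for the new namespace via bdev_nvme_add_io_path(); bdev_nvme_add_io_path_done() reports the result and rolls the added paths back on failure.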
*/ 4749 nvme_bdev_for_each_channel(bdev, 4750 bdev_nvme_add_io_path, 4751 nvme_ns, 4752 bdev_nvme_add_io_path_done); 4753 4754 return 0; 4755 } 4756 4757 static void 4758 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4759 { 4760 struct spdk_nvme_ns *ns; 4761 struct nvme_bdev *bdev; 4762 int rc = 0; 4763 4764 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4765 if (!ns) { 4766 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 4767 rc = -EINVAL; 4768 goto done; 4769 } 4770 4771 nvme_ns->ns = ns; 4772 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4773 4774 if (nvme_ctrlr->ana_log_page != NULL) { 4775 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4776 } 4777 4778 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4779 if (bdev == NULL) { 4780 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4781 } else { 4782 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4783 if (rc == 0) { 4784 return; 4785 } 4786 } 4787 done: 4788 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4789 } 4790 4791 static void 4792 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4793 { 4794 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4795 4796 assert(nvme_ctrlr != NULL); 4797 4798 pthread_mutex_lock(&nvme_ctrlr->mutex); 4799 4800 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4801 4802 if (nvme_ns->bdev != NULL) { 4803 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4804 return; 4805 } 4806 4807 nvme_ns_free(nvme_ns); 4808 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4809 4810 nvme_ctrlr_release(nvme_ctrlr); 4811 } 4812 4813 static void 4814 bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4815 { 4816 struct nvme_ns *nvme_ns = ctx; 4817 4818 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4819 } 4820 4821 static void 4822 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4823 { 4824 struct nvme_bdev *bdev; 4825 4826 spdk_poller_unregister(&nvme_ns->anatt_timer); 4827 4828 bdev = nvme_ns->bdev; 4829 if (bdev != NULL) { 4830 pthread_mutex_lock(&bdev->mutex); 4831 4832 assert(bdev->ref > 0); 4833 bdev->ref--; 4834 if (bdev->ref == 0) { 4835 pthread_mutex_unlock(&bdev->mutex); 4836 4837 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4838 } else { 4839 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4840 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4841 * and clear nvme_ns->bdev here. 4842 */ 4843 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4844 nvme_ns->bdev = NULL; 4845 4846 pthread_mutex_unlock(&bdev->mutex); 4847 4848 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4849 * we call depopulate_namespace_done() to avoid use-after-free. 4850 */ 4851 nvme_bdev_for_each_channel(bdev, 4852 bdev_nvme_delete_io_path, 4853 nvme_ns, 4854 bdev_nvme_delete_io_path_done); 4855 return; 4856 } 4857 } 4858 4859 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4860 } 4861 4862 static void 4863 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4864 struct nvme_async_probe_ctx *ctx) 4865 { 4866 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4867 struct nvme_ns *nvme_ns, *next; 4868 struct spdk_nvme_ns *ns; 4869 struct nvme_bdev *bdev; 4870 uint32_t nsid; 4871 int rc; 4872 uint64_t num_sectors; 4873 4874 if (ctx) { 4875 /* Initialize this count to 1 to handle the populate functions 4876 * calling nvme_ctrlr_populate_namespace_done() immediately. 
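* The matching decrement near the end of this function drops this initial reference once all active namespaces have been iterated.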
4877 */ 4878 ctx->populates_in_progress = 1; 4879 } 4880 4881 /* First loop over our existing namespaces and see if they have been 4882 * removed. */ 4883 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 4884 while (nvme_ns != NULL) { 4885 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 4886 4887 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 4888 /* NS is still there or added again. Its attributes may have changed. */ 4889 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 4890 if (nvme_ns->ns != ns) { 4891 assert(nvme_ns->ns == NULL); 4892 nvme_ns->ns = ns; 4893 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was added\n", nvme_ns->id); 4894 } 4895 4896 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 4897 bdev = nvme_ns->bdev; 4898 assert(bdev != NULL); 4899 if (bdev->disk.blockcnt != num_sectors) { 4900 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 4901 nvme_ns->id, 4902 bdev->disk.name, 4903 bdev->disk.blockcnt, 4904 num_sectors); 4905 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 4906 if (rc != 0) { 4907 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 4908 bdev->disk.name, rc); 4909 } 4910 } 4911 } else { 4912 /* Namespace was removed */ 4913 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4914 } 4915 4916 nvme_ns = next; 4917 } 4918 4919 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 4920 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4921 while (nsid != 0) { 4922 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 4923 4924 if (nvme_ns == NULL) { 4925 /* Found a new one */ 4926 nvme_ns = nvme_ns_alloc(); 4927 if (nvme_ns == NULL) { 4928 SPDK_ERRLOG("Failed to allocate namespace\n"); 4929 /* This just fails to attach the namespace. It may work on a future attempt. */ 4930 continue; 4931 } 4932 4933 nvme_ns->id = nsid; 4934 nvme_ns->ctrlr = nvme_ctrlr; 4935 4936 nvme_ns->bdev = NULL; 4937 4938 if (ctx) { 4939 ctx->populates_in_progress++; 4940 } 4941 nvme_ns->probe_ctx = ctx; 4942 4943 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4944 4945 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 4946 } 4947 4948 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 4949 } 4950 4951 if (ctx) { 4952 /* Decrement this count now that the loop is over to account 4953 * for the one we started with. If the count is then 0, we 4954 * know any populate_namespace functions completed immediately, 4955 * so we'll kick the callback here. 
4956 */ 4957 ctx->populates_in_progress--; 4958 if (ctx->populates_in_progress == 0) { 4959 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4960 } 4961 } 4962 4963 } 4964 4965 static void 4966 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 4967 { 4968 struct nvme_ns *nvme_ns, *tmp; 4969 4970 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 4971 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 4972 } 4973 } 4974 4975 static uint32_t 4976 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 4977 { 4978 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4979 const struct spdk_nvme_ctrlr_data *cdata; 4980 uint32_t nsid, ns_count = 0; 4981 4982 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4983 4984 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 4985 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 4986 ns_count++; 4987 } 4988 4989 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 4990 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 4991 sizeof(uint32_t); 4992 } 4993 4994 static int 4995 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 4996 void *cb_arg) 4997 { 4998 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4999 struct nvme_ns *nvme_ns; 5000 uint32_t i, nsid; 5001 5002 for (i = 0; i < desc->num_of_nsid; i++) { 5003 nsid = desc->nsid[i]; 5004 if (nsid == 0) { 5005 continue; 5006 } 5007 5008 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5009 5010 if (nvme_ns == NULL) { 5011 /* Target told us that an inactive namespace had an ANA change */ 5012 continue; 5013 } 5014 5015 _nvme_ns_set_ana_state(nvme_ns, desc); 5016 } 5017 5018 return 0; 5019 } 5020 5021 static void 5022 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5023 { 5024 struct nvme_ns *nvme_ns; 5025 5026 spdk_free(nvme_ctrlr->ana_log_page); 5027 nvme_ctrlr->ana_log_page = NULL; 5028 5029 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5030 nvme_ns != NULL; 5031 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 5032 nvme_ns->ana_state_updating = false; 5033 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 5034 } 5035 } 5036 5037 static void 5038 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 5039 { 5040 struct nvme_ctrlr *nvme_ctrlr = ctx; 5041 5042 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 5043 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 5044 nvme_ctrlr); 5045 } else { 5046 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 5047 } 5048 5049 pthread_mutex_lock(&nvme_ctrlr->mutex); 5050 5051 assert(nvme_ctrlr->ana_log_page_updating == true); 5052 nvme_ctrlr->ana_log_page_updating = false; 5053 5054 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 5055 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5056 5057 nvme_ctrlr_unregister(nvme_ctrlr); 5058 } else { 5059 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5060 5061 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 5062 } 5063 } 5064 5065 static int 5066 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5067 { 5068 uint32_t ana_log_page_size; 5069 int rc; 5070 5071 if (nvme_ctrlr->ana_log_page == NULL) { 5072 return -EINVAL; 5073 } 5074 5075 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5076 5077 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5078 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5079 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5080 return -EINVAL; 5081 } 5082 
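/* Serialize ANA log page reads: if the controller is unavailable or an update is already in flight, bail out with -EBUSY. */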
5083 pthread_mutex_lock(&nvme_ctrlr->mutex); 5084 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 5085 nvme_ctrlr->ana_log_page_updating) { 5086 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5087 return -EBUSY; 5088 } 5089 5090 nvme_ctrlr->ana_log_page_updating = true; 5091 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5092 5093 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 5094 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5095 SPDK_NVME_GLOBAL_NS_TAG, 5096 nvme_ctrlr->ana_log_page, 5097 ana_log_page_size, 0, 5098 nvme_ctrlr_read_ana_log_page_done, 5099 nvme_ctrlr); 5100 if (rc != 0) { 5101 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 5102 } 5103 5104 return rc; 5105 } 5106 5107 static void 5108 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5109 { 5110 } 5111 5112 struct bdev_nvme_set_preferred_path_ctx { 5113 struct spdk_bdev_desc *desc; 5114 struct nvme_ns *nvme_ns; 5115 bdev_nvme_set_preferred_path_cb cb_fn; 5116 void *cb_arg; 5117 }; 5118 5119 static void 5120 bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5121 { 5122 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5123 5124 assert(ctx != NULL); 5125 assert(ctx->desc != NULL); 5126 assert(ctx->cb_fn != NULL); 5127 5128 spdk_bdev_close(ctx->desc); 5129 5130 ctx->cb_fn(ctx->cb_arg, status); 5131 5132 free(ctx); 5133 } 5134 5135 static void 5136 _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i, 5137 struct nvme_bdev *nbdev, 5138 struct nvme_bdev_channel *nbdev_ch, void *_ctx) 5139 { 5140 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5141 struct nvme_io_path *io_path, *prev; 5142 5143 prev = NULL; 5144 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5145 if (io_path->nvme_ns == ctx->nvme_ns) { 5146 break; 5147 } 5148 prev = io_path; 5149 } 5150 5151 if (io_path != NULL) { 5152 if (prev != NULL) { 5153 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 5154 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 5155 } 5156 5157 /* We can set io_path to nbdev_ch->current_io_path directly here. 5158 * However, it needs to be conditional. To simplify the code, 5159 * just clear nbdev_ch->current_io_path and let find_io_path() 5160 * fill it. 5161 * 5162 * Automatic failback may be disabled. Hence even if the io_path is 5163 * already at the head, clear nbdev_ch->current_io_path. 5164 */ 5165 bdev_nvme_clear_current_io_path(nbdev_ch); 5166 } 5167 5168 nvme_bdev_for_each_channel_continue(i, 0); 5169 } 5170 5171 static struct nvme_ns * 5172 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5173 { 5174 struct nvme_ns *nvme_ns, *prev; 5175 const struct spdk_nvme_ctrlr_data *cdata; 5176 5177 prev = NULL; 5178 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5179 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5180 5181 if (cdata->cntlid == cntlid) { 5182 break; 5183 } 5184 prev = nvme_ns; 5185 } 5186 5187 if (nvme_ns != NULL && prev != NULL) { 5188 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5189 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5190 } 5191 5192 return nvme_ns; 5193 } 5194 5195 /* This function supports only multipath mode. There is only a single I/O path 5196 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5197 * head of the I/O path list for each NVMe bdev channel. 5198 * 5199 * NVMe bdev channel may be acquired after completing this function. 
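* Channels created after that build their I/O path lists from the bdev's namespace list, so to keep them consistent with the existing channels,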
move the 5200 * matched namespace to the head of the namespace list for the NVMe bdev too. 5201 */ 5202 void 5203 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5204 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5205 { 5206 struct bdev_nvme_set_preferred_path_ctx *ctx; 5207 struct spdk_bdev *bdev; 5208 struct nvme_bdev *nbdev; 5209 int rc = 0; 5210 5211 assert(cb_fn != NULL); 5212 5213 ctx = calloc(1, sizeof(*ctx)); 5214 if (ctx == NULL) { 5215 SPDK_ERRLOG("Failed to alloc context.\n"); 5216 rc = -ENOMEM; 5217 goto err_alloc; 5218 } 5219 5220 ctx->cb_fn = cb_fn; 5221 ctx->cb_arg = cb_arg; 5222 5223 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5224 if (rc != 0) { 5225 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5226 goto err_open; 5227 } 5228 5229 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5230 5231 if (bdev->module != &nvme_if) { 5232 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5233 rc = -ENODEV; 5234 goto err_bdev; 5235 } 5236 5237 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5238 5239 pthread_mutex_lock(&nbdev->mutex); 5240 5241 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5242 if (ctx->nvme_ns == NULL) { 5243 pthread_mutex_unlock(&nbdev->mutex); 5244 5245 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5246 rc = -ENODEV; 5247 goto err_bdev; 5248 } 5249 5250 pthread_mutex_unlock(&nbdev->mutex); 5251 5252 nvme_bdev_for_each_channel(nbdev, 5253 _bdev_nvme_set_preferred_path, 5254 ctx, 5255 bdev_nvme_set_preferred_path_done); 5256 return; 5257 5258 err_bdev: 5259 spdk_bdev_close(ctx->desc); 5260 err_open: 5261 free(ctx); 5262 err_alloc: 5263 cb_fn(cb_arg, rc); 5264 } 5265 5266 struct bdev_nvme_set_multipath_policy_ctx { 5267 struct spdk_bdev_desc *desc; 5268 spdk_bdev_nvme_set_multipath_policy_cb cb_fn; 5269 void *cb_arg; 5270 }; 5271 5272 static void 5273 bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5274 { 5275 struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx; 5276 5277 assert(ctx != NULL); 5278 assert(ctx->desc != NULL); 5279 assert(ctx->cb_fn != NULL); 5280 5281 spdk_bdev_close(ctx->desc); 5282 5283 ctx->cb_fn(ctx->cb_arg, status); 5284 5285 free(ctx); 5286 } 5287 5288 static void 5289 _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i, 5290 struct nvme_bdev *nbdev, 5291 struct nvme_bdev_channel *nbdev_ch, void *ctx) 5292 { 5293 nbdev_ch->mp_policy = nbdev->mp_policy; 5294 nbdev_ch->mp_selector = nbdev->mp_selector; 5295 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5296 bdev_nvme_clear_current_io_path(nbdev_ch); 5297 5298 nvme_bdev_for_each_channel_continue(i, 0); 5299 } 5300 5301 void 5302 spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy, 5303 enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5304 spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5305 { 5306 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5307 struct spdk_bdev *bdev; 5308 struct nvme_bdev *nbdev; 5309 int rc; 5310 5311 assert(cb_fn != NULL); 5312 5313 switch (policy) { 5314 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5315 break; 5316 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5317 switch (selector) { 5318 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5319 if (rr_min_io == UINT32_MAX) { 5320 rr_min_io = 1; 5321 } else if (rr_min_io == 0) { 5322 rc = -EINVAL; 5323 goto exit; 5324 } 5325 break; 5326 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5327 break; 5328 default: 
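/* Unknown selector for the active_active policy. */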
5329 rc = -EINVAL; 5330 goto exit; 5331 } 5332 break; 5333 default: 5334 rc = -EINVAL; 5335 goto exit; 5336 } 5337 5338 ctx = calloc(1, sizeof(*ctx)); 5339 if (ctx == NULL) { 5340 SPDK_ERRLOG("Failed to alloc context.\n"); 5341 rc = -ENOMEM; 5342 goto exit; 5343 } 5344 5345 ctx->cb_fn = cb_fn; 5346 ctx->cb_arg = cb_arg; 5347 5348 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5349 if (rc != 0) { 5350 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5351 rc = -ENODEV; 5352 goto err_open; 5353 } 5354 5355 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5356 if (bdev->module != &nvme_if) { 5357 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5358 rc = -ENODEV; 5359 goto err_module; 5360 } 5361 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5362 5363 pthread_mutex_lock(&nbdev->mutex); 5364 nbdev->mp_policy = policy; 5365 nbdev->mp_selector = selector; 5366 nbdev->rr_min_io = rr_min_io; 5367 pthread_mutex_unlock(&nbdev->mutex); 5368 5369 nvme_bdev_for_each_channel(nbdev, 5370 _bdev_nvme_set_multipath_policy, 5371 ctx, 5372 bdev_nvme_set_multipath_policy_done); 5373 return; 5374 5375 err_module: 5376 spdk_bdev_close(ctx->desc); 5377 err_open: 5378 free(ctx); 5379 exit: 5380 cb_fn(cb_arg, rc); 5381 } 5382 5383 static void 5384 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5385 { 5386 struct nvme_ctrlr *nvme_ctrlr = arg; 5387 union spdk_nvme_async_event_completion event; 5388 5389 if (spdk_nvme_cpl_is_error(cpl)) { 5390 SPDK_WARNLOG("AER request execute failed\n"); 5391 return; 5392 } 5393 5394 event.raw = cpl->cdw0; 5395 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5396 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5397 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5398 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5399 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5400 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5401 } 5402 } 5403 5404 static void 5405 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5406 { 5407 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5408 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5409 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5410 free(ctx); 5411 } 5412 5413 static void 5414 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5415 { 5416 if (ctx->cb_fn) { 5417 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5418 } 5419 5420 ctx->namespaces_populated = true; 5421 if (ctx->probe_done) { 5422 /* The probe was already completed, so we need to free the context 5423 * here. This can happen for cases like OCSSD, where we need to 5424 * send additional commands to the SSD after attach. 
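* If the probe has not completed yet, the context remains owned by the probe path, which frees it after it sets probe_done and sees namespaces_populated.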
5425 */ 5426 free_nvme_async_probe_ctx(ctx); 5427 } 5428 } 5429 5430 static int 5431 bdev_nvme_remove_poller(void *ctx) 5432 { 5433 struct spdk_nvme_transport_id trid_pcie; 5434 5435 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5436 spdk_poller_unregister(&g_hotplug_poller); 5437 return SPDK_POLLER_IDLE; 5438 } 5439 5440 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5441 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5442 5443 if (spdk_nvme_scan_attached(&trid_pcie)) { 5444 SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n"); 5445 } 5446 5447 return SPDK_POLLER_BUSY; 5448 } 5449 5450 static void 5451 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5452 struct nvme_async_probe_ctx *ctx) 5453 { 5454 spdk_io_device_register(nvme_ctrlr, 5455 bdev_nvme_create_ctrlr_channel_cb, 5456 bdev_nvme_destroy_ctrlr_channel_cb, 5457 sizeof(struct nvme_ctrlr_channel), 5458 nvme_ctrlr->nbdev_ctrlr->name); 5459 5460 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5461 5462 if (g_hotplug_poller == NULL) { 5463 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5464 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5465 } 5466 } 5467 5468 static void 5469 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5470 { 5471 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5472 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5473 5474 nvme_ctrlr->probe_ctx = NULL; 5475 5476 if (spdk_nvme_cpl_is_error(cpl)) { 5477 nvme_ctrlr_delete(nvme_ctrlr); 5478 5479 if (ctx != NULL) { 5480 ctx->reported_bdevs = 0; 5481 populate_namespaces_cb(ctx, -1); 5482 } 5483 return; 5484 } 5485 5486 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5487 } 5488 5489 static int 5490 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5491 struct nvme_async_probe_ctx *ctx) 5492 { 5493 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5494 const struct spdk_nvme_ctrlr_data *cdata; 5495 uint32_t ana_log_page_size; 5496 5497 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5498 5499 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5500 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5501 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5502 sizeof(uint32_t); 5503 5504 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5505 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 5506 if (nvme_ctrlr->ana_log_page == NULL) { 5507 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 5508 return -ENXIO; 5509 } 5510 5511 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5512 * Hence copy each descriptor to a temporary area when parsing it. 5513 * 5514 * Allocate a buffer whose size is as large as ANA log page buffer because 5515 * we do not know the size of a descriptor until actually reading it. 5516 */ 5517 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5518 if (nvme_ctrlr->copied_ana_desc == NULL) { 5519 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 5520 return -ENOMEM; 5521 } 5522 5523 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5524 5525 nvme_ctrlr->probe_ctx = ctx; 5526 5527 /* Then, set the read size only to include the current active namespaces. 
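* The buffer allocated above keeps its maximum size; only the length requested from the controller is reduced here.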
*/ 5528 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5529 5530 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5531 SPDK_ERRLOG("ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5532 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5533 return -EINVAL; 5534 } 5535 5536 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5537 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5538 SPDK_NVME_GLOBAL_NS_TAG, 5539 nvme_ctrlr->ana_log_page, 5540 ana_log_page_size, 0, 5541 nvme_ctrlr_init_ana_log_page_done, 5542 nvme_ctrlr); 5543 } 5544 5545 /* hostnqn and subnqn were already verified before attaching a controller. 5546 * Hence check only the multipath capability and cntlid here. 5547 */ 5548 static bool 5549 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5550 { 5551 struct nvme_ctrlr *tmp; 5552 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5553 5554 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5555 5556 if (!cdata->cmic.multi_ctrlr) { 5557 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5558 return false; 5559 } 5560 5561 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5562 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5563 5564 if (!tmp_cdata->cmic.multi_ctrlr) { 5565 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5566 return false; 5567 } 5568 if (cdata->cntlid == tmp_cdata->cntlid) { 5569 SPDK_ERRLOG("cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5570 return false; 5571 } 5572 } 5573 5574 return true; 5575 } 5576 5577 5578 static int 5579 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5580 { 5581 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5582 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5583 struct nvme_ctrlr *nctrlr; 5584 int rc = 0; 5585 5586 pthread_mutex_lock(&g_bdev_nvme_mutex); 5587 5588 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5589 if (nbdev_ctrlr != NULL) { 5590 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5591 rc = -EINVAL; 5592 goto exit; 5593 } 5594 TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5595 if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) { 5596 /* All controllers with the same name must be configured the same 5597 * way, either for multipath or failover. If the configuration doesn't 5598 * match - report error. 
5599 */ 5600 rc = -EINVAL; 5601 goto exit; 5602 } 5603 } 5604 } else { 5605 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5606 if (nbdev_ctrlr == NULL) { 5607 SPDK_ERRLOG("Failed to allocate nvme_bdev_ctrlr.\n"); 5608 rc = -ENOMEM; 5609 goto exit; 5610 } 5611 nbdev_ctrlr->name = strdup(name); 5612 if (nbdev_ctrlr->name == NULL) { 5613 SPDK_ERRLOG("Failed to allocate name of nvme_bdev_ctrlr.\n"); 5614 free(nbdev_ctrlr); rc = -ENOMEM; 5615 goto exit; 5616 } 5617 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5618 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5619 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5620 } 5621 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5622 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5623 exit: 5624 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5625 return rc; 5626 } 5627 5628 static int 5629 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5630 const char *name, 5631 const struct spdk_nvme_transport_id *trid, 5632 struct nvme_async_probe_ctx *ctx) 5633 { 5634 struct nvme_ctrlr *nvme_ctrlr; 5635 struct nvme_path_id *path_id; 5636 const struct spdk_nvme_ctrlr_data *cdata; 5637 int rc; 5638 5639 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5640 if (nvme_ctrlr == NULL) { 5641 SPDK_ERRLOG("Failed to allocate device struct\n"); 5642 return -ENOMEM; 5643 } 5644 5645 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5646 if (rc != 0) { 5647 free(nvme_ctrlr); 5648 return rc; 5649 } 5650 5651 TAILQ_INIT(&nvme_ctrlr->trids); 5652 RB_INIT(&nvme_ctrlr->namespaces); 5653 5654 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5655 if (ctx != NULL) { 5656 if (ctx->drv_opts.tls_psk != NULL) { 5657 nvme_ctrlr->psk = spdk_keyring_get_key( 5658 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5659 if (nvme_ctrlr->psk == NULL) { 5660 /* Could only happen if the key was removed in the meantime */ 5661 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5662 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5663 rc = -ENOKEY; 5664 goto err; 5665 } 5666 } 5667 5668 if (ctx->drv_opts.dhchap_key != NULL) { 5669 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5670 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5671 if (nvme_ctrlr->dhchap_key == NULL) { 5672 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5673 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5674 rc = -ENOKEY; 5675 goto err; 5676 } 5677 } 5678 5679 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5680 nvme_ctrlr->dhchap_ctrlr_key = 5681 spdk_keyring_get_key( 5682 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5683 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5684 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5685 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5686 rc = -ENOKEY; 5687 goto err; 5688 } 5689 } 5690 } 5691 5692 path_id = calloc(1, sizeof(*path_id)); 5693 if (path_id == NULL) { 5694 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5695 rc = -ENOMEM; 5696 goto err; 5697 } 5698 5699 path_id->trid = *trid; 5700 if (ctx != NULL) { 5701 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5702 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5703 } 5704 nvme_ctrlr->active_path_id = path_id; 5705 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5706 5707 nvme_ctrlr->thread = spdk_get_thread(); 5708 nvme_ctrlr->ctrlr = ctrlr; 5709 nvme_ctrlr->ref = 1; 5710 5711 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5712 SPDK_ERRLOG("OCSSDs are not supported\n"); 5713 rc = -ENOTSUP; 5714 goto err;
5715 } 5716 5717 if (ctx != NULL) { 5718 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5719 } else { 5720 spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5721 } 5722 5723 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5724 g_opts.nvme_adminq_poll_period_us); 5725 5726 if (g_opts.timeout_us > 0) { 5727 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5728 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5729 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5730 g_opts.timeout_us : g_opts.timeout_admin_us; 5731 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5732 adm_timeout_us, timeout_cb, nvme_ctrlr); 5733 } 5734 5735 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5736 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5737 5738 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5739 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5740 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5741 } 5742 5743 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5744 if (rc != 0) { 5745 goto err; 5746 } 5747 5748 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5749 5750 if (cdata->cmic.ana_reporting) { 5751 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5752 if (rc == 0) { 5753 return 0; 5754 } 5755 } else { 5756 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5757 return 0; 5758 } 5759 5760 err: 5761 nvme_ctrlr_delete(nvme_ctrlr); 5762 return rc; 5763 } 5764 5765 void 5766 spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts) 5767 { 5768 opts->prchk_flags = 0; 5769 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5770 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5771 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5772 opts->multipath = true; 5773 } 5774 5775 static void 5776 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5777 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5778 { 5779 char *name; 5780 5781 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5782 if (!name) { 5783 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5784 return; 5785 } 5786 5787 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5788 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5789 } else { 5790 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5791 } 5792 5793 free(name); 5794 } 5795 5796 static void 5797 _nvme_ctrlr_destruct(void *ctx) 5798 { 5799 struct nvme_ctrlr *nvme_ctrlr = ctx; 5800 5801 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5802 nvme_ctrlr_release(nvme_ctrlr); 5803 } 5804 5805 static int 5806 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5807 { 5808 struct nvme_probe_skip_entry *entry; 5809 5810 /* The controller's destruction was already started */ 5811 if (nvme_ctrlr->destruct) { 5812 return -EALREADY; 5813 } 5814 5815 if (!hotplug && 5816 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5817 entry = calloc(1, sizeof(*entry)); 5818 if (!entry) { 5819 return -ENOMEM; 5820 } 5821 entry->trid = nvme_ctrlr->active_path_id->trid; 5822 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5823 } 5824 5825 nvme_ctrlr->destruct = true; 5826 return 0; 5827 } 5828 5829 static int 5830 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5831 { 5832 int rc; 5833 5834 
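/* Mark the controller for destruction while holding its mutex, then perform the actual teardown outside the lock. */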
pthread_mutex_lock(&nvme_ctrlr->mutex); 5835 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5836 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5837 5838 if (rc == 0) { 5839 _nvme_ctrlr_destruct(nvme_ctrlr); 5840 } else if (rc == -EALREADY) { 5841 rc = 0; 5842 } 5843 5844 return rc; 5845 } 5846 5847 static void 5848 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5849 { 5850 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5851 5852 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5853 } 5854 5855 static int 5856 bdev_nvme_hotplug_probe(void *arg) 5857 { 5858 if (g_hotplug_probe_ctx == NULL) { 5859 spdk_poller_unregister(&g_hotplug_probe_poller); 5860 return SPDK_POLLER_IDLE; 5861 } 5862 5863 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5864 g_hotplug_probe_ctx = NULL; 5865 spdk_poller_unregister(&g_hotplug_probe_poller); 5866 } 5867 5868 return SPDK_POLLER_BUSY; 5869 } 5870 5871 static int 5872 bdev_nvme_hotplug(void *arg) 5873 { 5874 struct spdk_nvme_transport_id trid_pcie; 5875 5876 if (g_hotplug_probe_ctx) { 5877 return SPDK_POLLER_BUSY; 5878 } 5879 5880 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5881 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5882 5883 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 5884 hotplug_probe_cb, attach_cb, NULL); 5885 5886 if (g_hotplug_probe_ctx) { 5887 assert(g_hotplug_probe_poller == NULL); 5888 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 5889 } 5890 5891 return SPDK_POLLER_BUSY; 5892 } 5893 5894 void 5895 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 5896 { 5897 *opts = g_opts; 5898 } 5899 5900 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 5901 uint32_t reconnect_delay_sec, 5902 uint32_t fast_io_fail_timeout_sec); 5903 5904 static int 5905 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 5906 { 5907 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 5908 /* Can't set timeout_admin_us without also setting timeout_us */ 5909 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 5910 return -EINVAL; 5911 } 5912 5913 if (opts->bdev_retry_count < -1) { 5914 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 5915 return -EINVAL; 5916 } 5917 5918 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 5919 opts->reconnect_delay_sec, 5920 opts->fast_io_fail_timeout_sec)) { 5921 return -EINVAL; 5922 } 5923 5924 return 0; 5925 } 5926 5927 int 5928 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 5929 { 5930 int ret; 5931 5932 ret = bdev_nvme_validate_opts(opts); 5933 if (ret) { 5934 SPDK_WARNLOG("Failed to set nvme opts.\n"); 5935 return ret; 5936 } 5937 5938 if (g_bdev_nvme_init_thread != NULL) { 5939 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5940 return -EPERM; 5941 } 5942 } 5943 5944 if (opts->rdma_srq_size != 0 || 5945 opts->rdma_max_cq_size != 0 || 5946 opts->rdma_cm_event_timeout_ms != 0) { 5947 struct spdk_nvme_transport_opts drv_opts; 5948 5949 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 5950 if (opts->rdma_srq_size != 0) { 5951 drv_opts.rdma_srq_size = opts->rdma_srq_size; 5952 } 5953 if (opts->rdma_max_cq_size != 0) { 5954 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 5955 } 5956 if (opts->rdma_cm_event_timeout_ms != 0) { 5957 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 5958 } 5959 5960 ret = spdk_nvme_transport_set_opts(&drv_opts, 
						   sizeof(drv_opts));
		if (ret) {
			SPDK_ERRLOG("Failed to set NVMe transport opts.\n");
			return ret;
		}
	}

	g_opts = *opts;

	return 0;
}

struct set_nvme_hotplug_ctx {
	uint64_t period_us;
	bool enabled;
	spdk_msg_fn fn;
	void *fn_ctx;
};

static void
set_nvme_hotplug_period_cb(void *_ctx)
{
	struct set_nvme_hotplug_ctx *ctx = _ctx;

	spdk_poller_unregister(&g_hotplug_poller);
	if (ctx->enabled) {
		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
	} else {
		g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
							NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
	}

	g_nvme_hotplug_poll_period_us = ctx->period_us;
	g_nvme_hotplug_enabled = ctx->enabled;
	if (ctx->fn) {
		ctx->fn(ctx->fn_ctx);
	}

	free(ctx);
}

int
bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
{
	struct set_nvme_hotplug_ctx *ctx;

	if (enabled == true && !spdk_process_is_primary()) {
		return -EPERM;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
	ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
	ctx->enabled = enabled;
	ctx->fn = cb;
	ctx->fn_ctx = cb_ctx;

	spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
	return 0;
}

static void
nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
				    struct nvme_async_probe_ctx *ctx)
{
	struct nvme_ns *nvme_ns;
	struct nvme_bdev *nvme_bdev;
	size_t j;

	assert(nvme_ctrlr != NULL);

	if (ctx->names == NULL) {
		ctx->reported_bdevs = 0;
		populate_namespaces_cb(ctx, 0);
		return;
	}

	/*
	 * Report the new bdevs that were created in this call.
	 * There can be more than one bdev per NVMe controller.
	 */
	j = 0;
	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		nvme_bdev = nvme_ns->bdev;
		if (j < ctx->max_bdevs) {
			ctx->names[j] = nvme_bdev->disk.name;
			j++;
		} else {
			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
				    ctx->max_bdevs);
			ctx->reported_bdevs = 0;
			populate_namespaces_cb(ctx, -ERANGE);
			return;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	ctx->reported_bdevs = j;
	populate_namespaces_cb(ctx, 0);
}

static int
bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			       struct spdk_nvme_ctrlr *new_ctrlr,
			       struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *tmp_trid;

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		SPDK_ERRLOG("PCIe failover is not supported.\n");
		return -ENOTSUP;
	}

	/* Currently we only support failover to the same transport type.
	 */
	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
		SPDK_WARNLOG("Failover from trtype: %s to a different trtype: %s is not supported currently\n",
			     spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
			     spdk_nvme_transport_id_trtype_str(trid->trtype));
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		SPDK_WARNLOG("Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
			     nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			SPDK_WARNLOG("This path (traddr: %s subnqn: %s) is already registered\n", trid->traddr,
				     trid->subnqn);
			return -EALREADY;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid so that it is not replaced until it has failed. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is considered failed if its last failed timestamp is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This handles the case where a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking that it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
6169 */ 6170 static int 6171 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6172 struct spdk_nvme_ctrlr *new_ctrlr, 6173 struct spdk_nvme_transport_id *trid) 6174 { 6175 int rc; 6176 6177 assert(nvme_ctrlr != NULL); 6178 6179 pthread_mutex_lock(&nvme_ctrlr->mutex); 6180 6181 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 6182 if (rc != 0) { 6183 goto exit; 6184 } 6185 6186 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 6187 if (rc != 0) { 6188 goto exit; 6189 } 6190 6191 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 6192 6193 exit: 6194 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6195 6196 spdk_nvme_detach(new_ctrlr); 6197 6198 return rc; 6199 } 6200 6201 static void 6202 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6203 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6204 { 6205 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6206 struct nvme_async_probe_ctx *ctx; 6207 int rc; 6208 6209 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6210 ctx->ctrlr_attached = true; 6211 6212 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6213 if (rc != 0) { 6214 ctx->reported_bdevs = 0; 6215 populate_namespaces_cb(ctx, rc); 6216 } 6217 } 6218 6219 6220 static void 6221 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6222 struct spdk_nvme_ctrlr *ctrlr, 6223 const struct spdk_nvme_ctrlr_opts *opts) 6224 { 6225 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6226 struct nvme_ctrlr *nvme_ctrlr; 6227 struct nvme_async_probe_ctx *ctx; 6228 int rc; 6229 6230 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6231 ctx->ctrlr_attached = true; 6232 6233 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6234 if (nvme_ctrlr) { 6235 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6236 } else { 6237 rc = -ENODEV; 6238 } 6239 6240 ctx->reported_bdevs = 0; 6241 populate_namespaces_cb(ctx, rc); 6242 } 6243 6244 static int 6245 bdev_nvme_async_poll(void *arg) 6246 { 6247 struct nvme_async_probe_ctx *ctx = arg; 6248 int rc; 6249 6250 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6251 if (spdk_unlikely(rc != -EAGAIN)) { 6252 ctx->probe_done = true; 6253 spdk_poller_unregister(&ctx->poller); 6254 if (!ctx->ctrlr_attached) { 6255 /* The probe is done, but no controller was attached. 6256 * That means we had a failure, so report -EIO back to 6257 * the caller (usually the RPC). populate_namespaces_cb() 6258 * will take care of freeing the nvme_async_probe_ctx. 6259 */ 6260 ctx->reported_bdevs = 0; 6261 populate_namespaces_cb(ctx, -EIO); 6262 } else if (ctx->namespaces_populated) { 6263 /* The namespaces for the attached controller were all 6264 * populated and the response was already sent to the 6265 * caller (usually the RPC). So free the context here. 
			 */
			free_nvme_async_probe_ctx(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		      const char *base_name,
		      const char **names,
		      uint32_t count,
		      spdk_bdev_nvme_create_cb cb_fn,
		      void *cb_ctx,
		      struct spdk_nvme_ctrlr_opts *drv_opts,
		      struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	struct nvme_ctrlr *nvme_ctrlr;
	int len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
6333 */ 6334 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6335 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6336 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6337 return -EEXIST; 6338 } 6339 6340 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6341 6342 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6343 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6344 return -EINVAL; 6345 } 6346 6347 if (bdev_opts != NULL && 6348 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6349 bdev_opts->reconnect_delay_sec, 6350 bdev_opts->fast_io_fail_timeout_sec)) { 6351 return -EINVAL; 6352 } 6353 6354 ctx = calloc(1, sizeof(*ctx)); 6355 if (!ctx) { 6356 return -ENOMEM; 6357 } 6358 ctx->base_name = base_name; 6359 ctx->names = names; 6360 ctx->max_bdevs = count; 6361 ctx->cb_fn = cb_fn; 6362 ctx->cb_ctx = cb_ctx; 6363 ctx->trid = *trid; 6364 6365 if (bdev_opts) { 6366 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6367 } else { 6368 spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6369 } 6370 6371 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6372 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6373 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6374 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6375 free(entry); 6376 break; 6377 } 6378 } 6379 } 6380 6381 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6382 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6383 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6384 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6385 ctx->drv_opts.disable_read_ana_log_page = true; 6386 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6387 6388 if (ctx->bdev_opts.psk != NULL) { 6389 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6390 if (ctx->drv_opts.tls_psk == NULL) { 6391 SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk); 6392 free_nvme_async_probe_ctx(ctx); 6393 return -ENOKEY; 6394 } 6395 } 6396 6397 if (ctx->bdev_opts.dhchap_key != NULL) { 6398 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6399 if (ctx->drv_opts.dhchap_key == NULL) { 6400 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6401 ctx->bdev_opts.dhchap_key); 6402 free_nvme_async_probe_ctx(ctx); 6403 return -ENOKEY; 6404 } 6405 6406 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6407 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6408 } 6409 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6410 ctx->drv_opts.dhchap_ctrlr_key = 6411 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6412 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6413 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6414 ctx->bdev_opts.dhchap_ctrlr_key); 6415 free_nvme_async_probe_ctx(ctx); 6416 return -ENOKEY; 6417 } 6418 } 6419 6420 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) { 6421 attach_cb = connect_attach_cb; 6422 } else { 6423 attach_cb = connect_set_failover_cb; 6424 } 6425 6426 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6427 if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) { 6428 /* All controllers with the same name must be configured the same 6429 * way, either for multipath or failover. If the configuration doesn't 6430 * match - report error. 
6431 */ 6432 free_nvme_async_probe_ctx(ctx); 6433 return -EINVAL; 6434 } 6435 6436 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6437 if (ctx->probe_ctx == NULL) { 6438 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6439 free_nvme_async_probe_ctx(ctx); 6440 return -ENODEV; 6441 } 6442 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6443 6444 return 0; 6445 } 6446 6447 struct bdev_nvme_delete_ctx { 6448 char *name; 6449 struct nvme_path_id path_id; 6450 bdev_nvme_delete_done_fn delete_done; 6451 void *delete_done_ctx; 6452 uint64_t timeout_ticks; 6453 struct spdk_poller *poller; 6454 }; 6455 6456 static void 6457 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6458 { 6459 if (ctx != NULL) { 6460 free(ctx->name); 6461 free(ctx); 6462 } 6463 } 6464 6465 static bool 6466 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6467 { 6468 if (path_id->trid.trtype != 0) { 6469 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6470 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6471 return false; 6472 } 6473 } else { 6474 if (path_id->trid.trtype != p->trid.trtype) { 6475 return false; 6476 } 6477 } 6478 } 6479 6480 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6481 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6482 return false; 6483 } 6484 } 6485 6486 if (path_id->trid.adrfam != 0) { 6487 if (path_id->trid.adrfam != p->trid.adrfam) { 6488 return false; 6489 } 6490 } 6491 6492 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6493 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6494 return false; 6495 } 6496 } 6497 6498 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6499 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6500 return false; 6501 } 6502 } 6503 6504 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6505 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6506 return false; 6507 } 6508 } 6509 6510 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6511 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6512 return false; 6513 } 6514 } 6515 6516 return true; 6517 } 6518 6519 static bool 6520 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6521 { 6522 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6523 struct nvme_ctrlr *ctrlr; 6524 struct nvme_path_id *p; 6525 6526 pthread_mutex_lock(&g_bdev_nvme_mutex); 6527 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6528 if (!nbdev_ctrlr) { 6529 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6530 return false; 6531 } 6532 6533 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6534 pthread_mutex_lock(&ctrlr->mutex); 6535 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6536 if (nvme_path_id_compare(p, path_id)) { 6537 pthread_mutex_unlock(&ctrlr->mutex); 6538 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6539 return true; 6540 } 6541 } 6542 pthread_mutex_unlock(&ctrlr->mutex); 6543 } 6544 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6545 6546 return false; 6547 } 6548 6549 static int 6550 bdev_nvme_delete_complete_poll(void *arg) 6551 { 6552 struct bdev_nvme_delete_ctx *ctx = arg; 6553 int rc = 0; 6554 6555 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6556 if (ctx->timeout_ticks > spdk_get_ticks()) { 6557 return SPDK_POLLER_BUSY; 6558 } 6559 6560 
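		/* The path still exists and the timeout has expired, so stop polling
		 * and report -ETIMEDOUT to the delete_done callback below.
		 */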
SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6561 rc = -ETIMEDOUT; 6562 } 6563 6564 spdk_poller_unregister(&ctx->poller); 6565 6566 ctx->delete_done(ctx->delete_done_ctx, rc); 6567 free_bdev_nvme_delete_ctx(ctx); 6568 6569 return SPDK_POLLER_BUSY; 6570 } 6571 6572 static int 6573 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6574 { 6575 struct nvme_path_id *p, *t; 6576 spdk_msg_fn msg_fn; 6577 int rc = -ENXIO; 6578 6579 pthread_mutex_lock(&nvme_ctrlr->mutex); 6580 6581 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6582 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6583 break; 6584 } 6585 6586 if (!nvme_path_id_compare(p, path_id)) { 6587 continue; 6588 } 6589 6590 /* We are not using the specified path. */ 6591 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6592 free(p); 6593 rc = 0; 6594 } 6595 6596 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6597 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6598 return rc; 6599 } 6600 6601 /* If we made it here, then this path is a match! Now we need to remove it. */ 6602 6603 /* This is the active path in use right now. The active path is always the first in the list. */ 6604 assert(p == nvme_ctrlr->active_path_id); 6605 6606 if (!TAILQ_NEXT(p, link)) { 6607 /* The current path is the only path. */ 6608 msg_fn = _nvme_ctrlr_destruct; 6609 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6610 } else { 6611 /* There is an alternative path. */ 6612 msg_fn = _bdev_nvme_reset_ctrlr; 6613 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6614 } 6615 6616 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6617 6618 if (rc == 0) { 6619 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6620 } else if (rc == -EALREADY) { 6621 rc = 0; 6622 } 6623 6624 return rc; 6625 } 6626 6627 int 6628 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6629 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6630 { 6631 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6632 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6633 struct bdev_nvme_delete_ctx *ctx = NULL; 6634 int rc = -ENXIO, _rc; 6635 6636 if (name == NULL || path_id == NULL) { 6637 rc = -EINVAL; 6638 goto exit; 6639 } 6640 6641 pthread_mutex_lock(&g_bdev_nvme_mutex); 6642 6643 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6644 if (nbdev_ctrlr == NULL) { 6645 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6646 6647 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6648 rc = -ENODEV; 6649 goto exit; 6650 } 6651 6652 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6653 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6654 if (_rc < 0 && _rc != -ENXIO) { 6655 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6656 rc = _rc; 6657 goto exit; 6658 } else if (_rc == 0) { 6659 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6660 * was deleted successfully. To remember the successful deletion, 6661 * overwrite rc only if _rc is zero. 
6662 */ 6663 rc = 0; 6664 } 6665 } 6666 6667 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6668 6669 if (rc != 0 || delete_done == NULL) { 6670 goto exit; 6671 } 6672 6673 ctx = calloc(1, sizeof(*ctx)); 6674 if (ctx == NULL) { 6675 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6676 rc = -ENOMEM; 6677 goto exit; 6678 } 6679 6680 ctx->name = strdup(name); 6681 if (ctx->name == NULL) { 6682 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6683 rc = -ENOMEM; 6684 goto exit; 6685 } 6686 6687 ctx->delete_done = delete_done; 6688 ctx->delete_done_ctx = delete_done_ctx; 6689 ctx->path_id = *path_id; 6690 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6691 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6692 if (ctx->poller == NULL) { 6693 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6694 rc = -ENOMEM; 6695 goto exit; 6696 } 6697 6698 exit: 6699 if (rc != 0) { 6700 free_bdev_nvme_delete_ctx(ctx); 6701 } 6702 6703 return rc; 6704 } 6705 6706 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6707 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6708 6709 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6710 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6711 6712 struct discovery_entry_ctx { 6713 char name[128]; 6714 struct spdk_nvme_transport_id trid; 6715 struct spdk_nvme_ctrlr_opts drv_opts; 6716 struct spdk_nvmf_discovery_log_page_entry entry; 6717 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6718 struct discovery_ctx *ctx; 6719 }; 6720 6721 struct discovery_ctx { 6722 char *name; 6723 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6724 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6725 void *cb_ctx; 6726 struct spdk_nvme_probe_ctx *probe_ctx; 6727 struct spdk_nvme_detach_ctx *detach_ctx; 6728 struct spdk_nvme_ctrlr *ctrlr; 6729 struct spdk_nvme_transport_id trid; 6730 struct discovery_entry_ctx *entry_ctx_in_use; 6731 struct spdk_poller *poller; 6732 struct spdk_nvme_ctrlr_opts drv_opts; 6733 struct spdk_bdev_nvme_ctrlr_opts bdev_opts; 6734 struct spdk_nvmf_discovery_log_page *log_page; 6735 TAILQ_ENTRY(discovery_ctx) tailq; 6736 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6737 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6738 int rc; 6739 bool wait_for_attach; 6740 uint64_t timeout_ticks; 6741 /* Denotes that the discovery service is being started. We're waiting 6742 * for the initial connection to the discovery controller to be 6743 * established and attach discovered NVM ctrlrs. 6744 */ 6745 bool initializing; 6746 /* Denotes if a discovery is currently in progress for this context. 6747 * That includes connecting to newly discovered subsystems. Used to 6748 * ensure we do not start a new discovery until an existing one is 6749 * complete. 6750 */ 6751 bool in_progress; 6752 6753 /* Denotes if another discovery is needed after the one in progress 6754 * completes. Set when we receive an AER completion while a discovery 6755 * is already in progress. 6756 */ 6757 bool pending; 6758 6759 /* Signal to the discovery context poller that it should stop the 6760 * discovery service, including detaching from the current discovery 6761 * controller. 6762 */ 6763 bool stop; 6764 6765 struct spdk_thread *calling_thread; 6766 uint32_t index; 6767 uint32_t attach_in_progress; 6768 char *hostnqn; 6769 6770 /* Denotes if the discovery service was started by the mdns discovery. 
6771 */ 6772 bool from_mdns_discovery_service; 6773 }; 6774 6775 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6776 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6777 6778 static void get_discovery_log_page(struct discovery_ctx *ctx); 6779 6780 static void 6781 free_discovery_ctx(struct discovery_ctx *ctx) 6782 { 6783 free(ctx->log_page); 6784 free(ctx->hostnqn); 6785 free(ctx->name); 6786 free(ctx); 6787 } 6788 6789 static void 6790 discovery_complete(struct discovery_ctx *ctx) 6791 { 6792 ctx->initializing = false; 6793 ctx->in_progress = false; 6794 if (ctx->pending) { 6795 ctx->pending = false; 6796 get_discovery_log_page(ctx); 6797 } 6798 } 6799 6800 static void 6801 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6802 struct spdk_nvmf_discovery_log_page_entry *entry) 6803 { 6804 char *space; 6805 6806 trid->trtype = entry->trtype; 6807 trid->adrfam = entry->adrfam; 6808 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6809 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6810 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6811 * before call to this function trid->subnqn is zeroed out, we need 6812 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6813 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6814 */ 6815 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6816 6817 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6818 * But the log page entries typically pad them with spaces, not zeroes. 6819 * So add a NULL terminator to each of these fields at the appropriate 6820 * location. 6821 */ 6822 space = strchr(trid->traddr, ' '); 6823 if (space) { 6824 *space = 0; 6825 } 6826 space = strchr(trid->trsvcid, ' '); 6827 if (space) { 6828 *space = 0; 6829 } 6830 space = strchr(trid->subnqn, ' '); 6831 if (space) { 6832 *space = 0; 6833 } 6834 } 6835 6836 static void 6837 _stop_discovery(void *_ctx) 6838 { 6839 struct discovery_ctx *ctx = _ctx; 6840 6841 if (ctx->attach_in_progress > 0) { 6842 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6843 return; 6844 } 6845 6846 ctx->stop = true; 6847 6848 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6849 struct discovery_entry_ctx *entry_ctx; 6850 struct nvme_path_id path = {}; 6851 6852 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6853 path.trid = entry_ctx->trid; 6854 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6855 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6856 free(entry_ctx); 6857 } 6858 6859 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6860 struct discovery_entry_ctx *entry_ctx; 6861 6862 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 6863 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 6864 free(entry_ctx); 6865 } 6866 6867 free(ctx->entry_ctx_in_use); 6868 ctx->entry_ctx_in_use = NULL; 6869 } 6870 6871 static void 6872 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 6873 { 6874 ctx->stop_cb_fn = cb_fn; 6875 ctx->cb_ctx = cb_ctx; 6876 6877 if (ctx->attach_in_progress > 0) { 6878 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 6879 ctx->attach_in_progress); 6880 } 6881 6882 _stop_discovery(ctx); 6883 } 6884 6885 static void 6886 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 6887 { 6888 struct discovery_ctx *d_ctx; 6889 struct nvme_path_id *path_id; 6890 struct spdk_nvme_transport_id 
trid = {}; 6891 struct discovery_entry_ctx *entry_ctx, *tmp; 6892 6893 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 6894 6895 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 6896 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 6897 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 6898 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 6899 continue; 6900 } 6901 6902 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 6903 free(entry_ctx); 6904 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 6905 trid.subnqn, trid.traddr, trid.trsvcid); 6906 6907 /* Fail discovery ctrlr to force reattach attempt */ 6908 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 6909 } 6910 } 6911 } 6912 6913 static void 6914 discovery_remove_controllers(struct discovery_ctx *ctx) 6915 { 6916 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 6917 struct discovery_entry_ctx *entry_ctx, *tmp; 6918 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 6919 struct spdk_nvme_transport_id old_trid = {}; 6920 uint64_t numrec, i; 6921 bool found; 6922 6923 numrec = from_le64(&log_page->numrec); 6924 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 6925 found = false; 6926 old_entry = &entry_ctx->entry; 6927 build_trid_from_log_page_entry(&old_trid, old_entry); 6928 for (i = 0; i < numrec; i++) { 6929 new_entry = &log_page->entries[i]; 6930 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 6931 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 6932 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6933 found = true; 6934 break; 6935 } 6936 } 6937 if (!found) { 6938 struct nvme_path_id path = {}; 6939 6940 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 6941 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 6942 6943 path.trid = entry_ctx->trid; 6944 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6945 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6946 free(entry_ctx); 6947 } 6948 } 6949 free(log_page); 6950 ctx->log_page = NULL; 6951 discovery_complete(ctx); 6952 } 6953 6954 static void 6955 complete_discovery_start(struct discovery_ctx *ctx, int status) 6956 { 6957 ctx->timeout_ticks = 0; 6958 ctx->rc = status; 6959 if (ctx->start_cb_fn) { 6960 ctx->start_cb_fn(ctx->cb_ctx, status); 6961 ctx->start_cb_fn = NULL; 6962 ctx->cb_ctx = NULL; 6963 } 6964 } 6965 6966 static void 6967 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 6968 { 6969 struct discovery_entry_ctx *entry_ctx = cb_ctx; 6970 struct discovery_ctx *ctx = entry_ctx->ctx; 6971 6972 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 6973 ctx->attach_in_progress--; 6974 if (ctx->attach_in_progress == 0) { 6975 complete_discovery_start(ctx, ctx->rc); 6976 if (ctx->initializing && ctx->rc != 0) { 6977 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 6978 stop_discovery(ctx, NULL, ctx->cb_ctx); 6979 } else { 6980 discovery_remove_controllers(ctx); 6981 } 6982 } 6983 } 6984 6985 static struct discovery_entry_ctx * 6986 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 6987 { 6988 struct discovery_entry_ctx *new_ctx; 6989 6990 new_ctx = calloc(1, sizeof(*new_ctx)); 6991 if (new_ctx == NULL) { 6992 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 6993 return NULL; 6994 } 6995 6996 new_ctx->ctx = ctx; 6997 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 6998 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
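	/* Connections made for this entry use the same hostnqn as the discovery
	 * service itself.
	 */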
6999 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7000 return new_ctx; 7001 } 7002 7003 static void 7004 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 7005 struct spdk_nvmf_discovery_log_page *log_page) 7006 { 7007 struct discovery_ctx *ctx = cb_arg; 7008 struct discovery_entry_ctx *entry_ctx, *tmp; 7009 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7010 uint64_t numrec, i; 7011 bool found; 7012 7013 if (rc || spdk_nvme_cpl_is_error(cpl)) { 7014 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7015 return; 7016 } 7017 7018 ctx->log_page = log_page; 7019 assert(ctx->attach_in_progress == 0); 7020 numrec = from_le64(&log_page->numrec); 7021 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 7022 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7023 free(entry_ctx); 7024 } 7025 for (i = 0; i < numrec; i++) { 7026 found = false; 7027 new_entry = &log_page->entries[i]; 7028 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 7029 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 7030 struct discovery_entry_ctx *new_ctx; 7031 struct spdk_nvme_transport_id trid = {}; 7032 7033 build_trid_from_log_page_entry(&trid, new_entry); 7034 new_ctx = create_discovery_entry_ctx(ctx, &trid); 7035 if (new_ctx == NULL) { 7036 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7037 break; 7038 } 7039 7040 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 7041 continue; 7042 } 7043 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 7044 old_entry = &entry_ctx->entry; 7045 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 7046 found = true; 7047 break; 7048 } 7049 } 7050 if (!found) { 7051 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 7052 struct discovery_ctx *d_ctx; 7053 7054 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7055 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 7056 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 7057 sizeof(new_entry->subnqn))) { 7058 break; 7059 } 7060 } 7061 if (subnqn_ctx) { 7062 break; 7063 } 7064 } 7065 7066 new_ctx = calloc(1, sizeof(*new_ctx)); 7067 if (new_ctx == NULL) { 7068 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7069 break; 7070 } 7071 7072 new_ctx->ctx = ctx; 7073 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 7074 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 7075 if (subnqn_ctx) { 7076 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 7077 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 7078 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7079 new_ctx->name); 7080 } else { 7081 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 7082 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 7083 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7084 new_ctx->name); 7085 } 7086 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 7087 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7088 rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 7089 discovery_attach_controller_done, new_ctx, 7090 &new_ctx->drv_opts, &ctx->bdev_opts); 7091 if (rc == 0) { 7092 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 7093 ctx->attach_in_progress++; 7094 } else { 7095 DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 7096 } 7097 } 7098 } 7099 7100 if (ctx->attach_in_progress == 0) { 7101 discovery_remove_controllers(ctx); 7102 } 7103 } 7104 7105 static void 7106 get_discovery_log_page(struct discovery_ctx *ctx) 7107 { 7108 int rc; 7109 7110 assert(ctx->in_progress == false); 7111 ctx->in_progress = true; 7112 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 7113 if (rc != 0) { 7114 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7115 } 7116 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 7117 } 7118 7119 static void 7120 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 7121 { 7122 struct discovery_ctx *ctx = arg; 7123 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 7124 7125 if (spdk_nvme_cpl_is_error(cpl)) { 7126 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 7127 return; 7128 } 7129 7130 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 7131 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 7132 return; 7133 } 7134 7135 DISCOVERY_INFOLOG(ctx, "got aer\n"); 7136 if (ctx->in_progress) { 7137 ctx->pending = true; 7138 return; 7139 } 7140 7141 get_discovery_log_page(ctx); 7142 } 7143 7144 static void 7145 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 7146 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 7147 { 7148 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 7149 struct discovery_ctx *ctx; 7150 7151 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 7152 7153 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 7154 ctx->probe_ctx = NULL; 7155 ctx->ctrlr = ctrlr; 7156 7157 if (ctx->rc != 0) { 7158 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 7159 ctx->rc); 7160 return; 7161 } 7162 7163 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 7164 } 7165 7166 static int 7167 discovery_poller(void *arg) 7168 { 7169 struct discovery_ctx *ctx = arg; 7170 struct spdk_nvme_transport_id *trid; 7171 int rc; 7172 7173 if (ctx->detach_ctx) { 7174 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7175 if (rc != -EAGAIN) { 7176 ctx->detach_ctx = NULL; 7177 ctx->ctrlr = NULL; 7178 } 7179 } else if (ctx->stop) { 7180 if (ctx->ctrlr != NULL) { 7181 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7182 if (rc == 0) { 7183 return SPDK_POLLER_BUSY; 7184 } 7185 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7186 } 7187 spdk_poller_unregister(&ctx->poller); 7188 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7189 assert(ctx->start_cb_fn == NULL); 7190 if (ctx->stop_cb_fn != NULL) { 7191 ctx->stop_cb_fn(ctx->cb_ctx); 7192 } 7193 free_discovery_ctx(ctx); 7194 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7195 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7196 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7197 assert(ctx->initializing); 7198 spdk_poller_unregister(&ctx->poller); 7199 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7200 complete_discovery_start(ctx, -ETIMEDOUT); 7201 stop_discovery(ctx, NULL, NULL); 7202 free_discovery_ctx(ctx); 7203 return SPDK_POLLER_BUSY; 7204 } 7205 7206 assert(ctx->entry_ctx_in_use == NULL); 7207 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7208 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7209 trid = &ctx->entry_ctx_in_use->trid; 7210 7211 /* All controllers must be configured explicitely either for multipath or failover. 
7212 * While discovery use multipath mode, we need to set this in bdev options as well. 7213 */ 7214 ctx->bdev_opts.multipath = true; 7215 7216 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7217 if (ctx->probe_ctx) { 7218 spdk_poller_unregister(&ctx->poller); 7219 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7220 } else { 7221 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7222 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7223 ctx->entry_ctx_in_use = NULL; 7224 } 7225 } else if (ctx->probe_ctx) { 7226 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7227 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7228 complete_discovery_start(ctx, -ETIMEDOUT); 7229 return SPDK_POLLER_BUSY; 7230 } 7231 7232 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7233 if (rc != -EAGAIN) { 7234 if (ctx->rc != 0) { 7235 assert(ctx->initializing); 7236 stop_discovery(ctx, NULL, ctx->cb_ctx); 7237 } else { 7238 assert(rc == 0); 7239 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7240 ctx->rc = rc; 7241 get_discovery_log_page(ctx); 7242 } 7243 } 7244 } else { 7245 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7246 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7247 complete_discovery_start(ctx, -ETIMEDOUT); 7248 /* We need to wait until all NVM ctrlrs are attached before we stop the 7249 * discovery service to make sure we don't detach a ctrlr that is still 7250 * being attached. 7251 */ 7252 if (ctx->attach_in_progress == 0) { 7253 stop_discovery(ctx, NULL, ctx->cb_ctx); 7254 return SPDK_POLLER_BUSY; 7255 } 7256 } 7257 7258 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7259 if (rc < 0) { 7260 spdk_poller_unregister(&ctx->poller); 7261 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7262 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7263 ctx->entry_ctx_in_use = NULL; 7264 7265 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7266 if (rc != 0) { 7267 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7268 ctx->ctrlr = NULL; 7269 } 7270 } 7271 } 7272 7273 return SPDK_POLLER_BUSY; 7274 } 7275 7276 static void 7277 start_discovery_poller(void *arg) 7278 { 7279 struct discovery_ctx *ctx = arg; 7280 7281 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7282 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7283 } 7284 7285 int 7286 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7287 const char *base_name, 7288 struct spdk_nvme_ctrlr_opts *drv_opts, 7289 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 7290 uint64_t attach_timeout, 7291 bool from_mdns, 7292 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7293 { 7294 struct discovery_ctx *ctx; 7295 struct discovery_entry_ctx *discovery_entry_ctx; 7296 7297 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7298 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7299 if (strcmp(ctx->name, base_name) == 0) { 7300 return -EEXIST; 7301 } 7302 7303 if (ctx->entry_ctx_in_use != NULL) { 7304 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7305 return -EEXIST; 7306 } 7307 } 7308 7309 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7310 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7311 return -EEXIST; 7312 } 7313 } 7314 } 7315 7316 ctx = calloc(1, 
sizeof(*ctx)); 7317 if (ctx == NULL) { 7318 return -ENOMEM; 7319 } 7320 7321 ctx->name = strdup(base_name); 7322 if (ctx->name == NULL) { 7323 free_discovery_ctx(ctx); 7324 return -ENOMEM; 7325 } 7326 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 7327 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7328 ctx->from_mdns_discovery_service = from_mdns; 7329 ctx->bdev_opts.from_discovery_service = true; 7330 ctx->calling_thread = spdk_get_thread(); 7331 ctx->start_cb_fn = cb_fn; 7332 ctx->cb_ctx = cb_ctx; 7333 ctx->initializing = true; 7334 if (ctx->start_cb_fn) { 7335 /* We can use this when dumping json to denote if this RPC parameter 7336 * was specified or not. 7337 */ 7338 ctx->wait_for_attach = true; 7339 } 7340 if (attach_timeout != 0) { 7341 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7342 spdk_get_ticks_hz() / 1000ull; 7343 } 7344 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7345 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7346 memcpy(&ctx->trid, trid, sizeof(*trid)); 7347 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7348 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7349 if (ctx->hostnqn == NULL) { 7350 free_discovery_ctx(ctx); 7351 return -ENOMEM; 7352 } 7353 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7354 if (discovery_entry_ctx == NULL) { 7355 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7356 free_discovery_ctx(ctx); 7357 return -ENOMEM; 7358 } 7359 7360 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7361 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7362 return 0; 7363 } 7364 7365 int 7366 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7367 { 7368 struct discovery_ctx *ctx; 7369 7370 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7371 if (strcmp(name, ctx->name) == 0) { 7372 if (ctx->stop) { 7373 return -EALREADY; 7374 } 7375 /* If we're still starting the discovery service and ->rc is non-zero, we're 7376 * going to stop it as soon as we can 7377 */ 7378 if (ctx->initializing && ctx->rc != 0) { 7379 return -EALREADY; 7380 } 7381 stop_discovery(ctx, cb_fn, cb_ctx); 7382 return 0; 7383 } 7384 } 7385 7386 return -ENOENT; 7387 } 7388 7389 static int 7390 bdev_nvme_library_init(void) 7391 { 7392 g_bdev_nvme_init_thread = spdk_get_thread(); 7393 7394 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7395 bdev_nvme_destroy_poll_group_cb, 7396 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7397 7398 return 0; 7399 } 7400 7401 static void 7402 bdev_nvme_fini_destruct_ctrlrs(void) 7403 { 7404 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7405 struct nvme_ctrlr *nvme_ctrlr; 7406 7407 pthread_mutex_lock(&g_bdev_nvme_mutex); 7408 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7409 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7410 pthread_mutex_lock(&nvme_ctrlr->mutex); 7411 if (nvme_ctrlr->destruct) { 7412 /* This controller's destruction was already started 7413 * before the application started shutting down 7414 */ 7415 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7416 continue; 7417 } 7418 nvme_ctrlr->destruct = true; 7419 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7420 7421 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7422 nvme_ctrlr); 7423 } 7424 } 7425 7426 g_bdev_nvme_module_finish = true; 7427 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7428 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7429 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7430 
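		/* No NVMe bdev controllers remain, so the module teardown can be
		 * completed immediately.
		 */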
spdk_bdev_module_fini_done(); 7431 return; 7432 } 7433 7434 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7435 } 7436 7437 static void 7438 check_discovery_fini(void *arg) 7439 { 7440 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7441 bdev_nvme_fini_destruct_ctrlrs(); 7442 } 7443 } 7444 7445 static void 7446 bdev_nvme_library_fini(void) 7447 { 7448 struct nvme_probe_skip_entry *entry, *entry_tmp; 7449 struct discovery_ctx *ctx; 7450 7451 spdk_poller_unregister(&g_hotplug_poller); 7452 free(g_hotplug_probe_ctx); 7453 g_hotplug_probe_ctx = NULL; 7454 7455 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7456 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7457 free(entry); 7458 } 7459 7460 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7461 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7462 bdev_nvme_fini_destruct_ctrlrs(); 7463 } else { 7464 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7465 stop_discovery(ctx, check_discovery_fini, NULL); 7466 } 7467 } 7468 } 7469 7470 static void 7471 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7472 { 7473 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7474 struct spdk_bdev *bdev = bdev_io->bdev; 7475 struct spdk_dif_ctx dif_ctx; 7476 struct spdk_dif_error err_blk = {}; 7477 int rc; 7478 struct spdk_dif_ctx_init_ext_opts dif_opts; 7479 7480 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7481 dif_opts.dif_pi_format = bdev->dif_pi_format; 7482 rc = spdk_dif_ctx_init(&dif_ctx, 7483 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7484 bdev->dif_is_head_of_md, bdev->dif_type, 7485 bdev_io->u.bdev.dif_check_flags, 7486 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7487 if (rc != 0) { 7488 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7489 return; 7490 } 7491 7492 if (bdev->md_interleave) { 7493 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7494 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7495 } else { 7496 struct iovec md_iov = { 7497 .iov_base = bdev_io->u.bdev.md_buf, 7498 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7499 }; 7500 7501 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7502 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7503 } 7504 7505 if (rc != 0) { 7506 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7507 err_blk.err_type, err_blk.err_offset); 7508 } else { 7509 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7510 } 7511 } 7512 7513 static void 7514 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7515 { 7516 struct nvme_bdev_io *bio = ref; 7517 7518 if (spdk_nvme_cpl_is_success(cpl)) { 7519 /* Run PI verification for read data buffer. */ 7520 bdev_nvme_verify_pi_error(bio); 7521 } 7522 7523 /* Return original completion status */ 7524 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7525 } 7526 7527 static void 7528 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7529 { 7530 struct nvme_bdev_io *bio = ref; 7531 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7532 int ret; 7533 7534 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7535 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7536 cpl->status.sct, cpl->status.sc); 7537 7538 /* Save completion status to use after verifying PI error. */ 7539 bio->cpl = *cpl; 7540 7541 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7542 /* Read without PI checking to verify PI error. 
*/ 7543 ret = bdev_nvme_no_pi_readv(bio, 7544 bdev_io->u.bdev.iovs, 7545 bdev_io->u.bdev.iovcnt, 7546 bdev_io->u.bdev.md_buf, 7547 bdev_io->u.bdev.num_blocks, 7548 bdev_io->u.bdev.offset_blocks); 7549 if (ret == 0) { 7550 return; 7551 } 7552 } 7553 } 7554 7555 bdev_nvme_io_complete_nvme_status(bio, cpl); 7556 } 7557 7558 static void 7559 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7560 { 7561 struct nvme_bdev_io *bio = ref; 7562 7563 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7564 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7565 cpl->status.sct, cpl->status.sc); 7566 /* Run PI verification for write data buffer if PI error is detected. */ 7567 bdev_nvme_verify_pi_error(bio); 7568 } 7569 7570 bdev_nvme_io_complete_nvme_status(bio, cpl); 7571 } 7572 7573 static void 7574 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7575 { 7576 struct nvme_bdev_io *bio = ref; 7577 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7578 7579 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7580 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7581 */ 7582 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7583 7584 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7585 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7586 cpl->status.sct, cpl->status.sc); 7587 /* Run PI verification for zone append data buffer if PI error is detected. */ 7588 bdev_nvme_verify_pi_error(bio); 7589 } 7590 7591 bdev_nvme_io_complete_nvme_status(bio, cpl); 7592 } 7593 7594 static void 7595 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7596 { 7597 struct nvme_bdev_io *bio = ref; 7598 7599 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7600 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7601 cpl->status.sct, cpl->status.sc); 7602 /* Run PI verification for compare data buffer if PI error is detected. */ 7603 bdev_nvme_verify_pi_error(bio); 7604 } 7605 7606 bdev_nvme_io_complete_nvme_status(bio, cpl); 7607 } 7608 7609 static void 7610 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7611 { 7612 struct nvme_bdev_io *bio = ref; 7613 7614 /* Compare operation completion */ 7615 if (!bio->first_fused_completed) { 7616 /* Save compare result for write callback */ 7617 bio->cpl = *cpl; 7618 bio->first_fused_completed = true; 7619 return; 7620 } 7621 7622 /* Write operation completion */ 7623 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7624 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7625 * complete the IO with the compare operation's status. 
7626 */ 7627 if (!spdk_nvme_cpl_is_error(cpl)) { 7628 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7629 } 7630 7631 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7632 } else { 7633 bdev_nvme_io_complete_nvme_status(bio, cpl); 7634 } 7635 } 7636 7637 static void 7638 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7639 { 7640 struct nvme_bdev_io *bio = ref; 7641 7642 bdev_nvme_io_complete_nvme_status(bio, cpl); 7643 } 7644 7645 static int 7646 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7647 { 7648 switch (desc->zt) { 7649 case SPDK_NVME_ZONE_TYPE_SEQWR: 7650 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7651 break; 7652 default: 7653 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7654 return -EIO; 7655 } 7656 7657 switch (desc->zs) { 7658 case SPDK_NVME_ZONE_STATE_EMPTY: 7659 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7660 break; 7661 case SPDK_NVME_ZONE_STATE_IOPEN: 7662 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7663 break; 7664 case SPDK_NVME_ZONE_STATE_EOPEN: 7665 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7666 break; 7667 case SPDK_NVME_ZONE_STATE_CLOSED: 7668 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7669 break; 7670 case SPDK_NVME_ZONE_STATE_RONLY: 7671 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7672 break; 7673 case SPDK_NVME_ZONE_STATE_FULL: 7674 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7675 break; 7676 case SPDK_NVME_ZONE_STATE_OFFLINE: 7677 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7678 break; 7679 default: 7680 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7681 return -EIO; 7682 } 7683 7684 info->zone_id = desc->zslba; 7685 info->write_pointer = desc->wp; 7686 info->capacity = desc->zcap; 7687 7688 return 0; 7689 } 7690 7691 static void 7692 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7693 { 7694 struct nvme_bdev_io *bio = ref; 7695 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7696 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7697 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7698 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7699 uint64_t max_zones_per_buf, i; 7700 uint32_t zone_report_bufsize; 7701 struct spdk_nvme_ns *ns; 7702 struct spdk_nvme_qpair *qpair; 7703 int ret; 7704 7705 if (spdk_nvme_cpl_is_error(cpl)) { 7706 goto out_complete_io_nvme_cpl; 7707 } 7708 7709 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7710 ret = -ENXIO; 7711 goto out_complete_io_ret; 7712 } 7713 7714 ns = bio->io_path->nvme_ns->ns; 7715 qpair = bio->io_path->qpair->qpair; 7716 7717 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7718 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7719 sizeof(bio->zone_report_buf->descs[0]); 7720 7721 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7722 ret = -EINVAL; 7723 goto out_complete_io_ret; 7724 } 7725 7726 if (!bio->zone_report_buf->nr_zones) { 7727 ret = -EINVAL; 7728 goto out_complete_io_ret; 7729 } 7730 7731 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7732 ret = fill_zone_from_report(&info[bio->handled_zones], 7733 &bio->zone_report_buf->descs[i]); 7734 if (ret) { 7735 goto out_complete_io_ret; 7736 } 7737 bio->handled_zones++; 7738 } 7739 7740 if (bio->handled_zones < zones_to_copy) { 7741 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7742 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7743 
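		/* More zones were requested than fit in a single report buffer, so
		 * clear the buffer and issue another Report Zones command starting
		 * at slba.
		 */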
7744 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7745 ret = spdk_nvme_zns_report_zones(ns, qpair, 7746 bio->zone_report_buf, zone_report_bufsize, 7747 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7748 bdev_nvme_get_zone_info_done, bio); 7749 if (!ret) { 7750 return; 7751 } else { 7752 goto out_complete_io_ret; 7753 } 7754 } 7755 7756 out_complete_io_nvme_cpl: 7757 free(bio->zone_report_buf); 7758 bio->zone_report_buf = NULL; 7759 bdev_nvme_io_complete_nvme_status(bio, cpl); 7760 return; 7761 7762 out_complete_io_ret: 7763 free(bio->zone_report_buf); 7764 bio->zone_report_buf = NULL; 7765 bdev_nvme_io_complete(bio, ret); 7766 } 7767 7768 static void 7769 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7770 { 7771 struct nvme_bdev_io *bio = ref; 7772 7773 bdev_nvme_io_complete_nvme_status(bio, cpl); 7774 } 7775 7776 static void 7777 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7778 { 7779 struct nvme_bdev_io *bio = ctx; 7780 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7781 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7782 7783 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7784 7785 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7786 } 7787 7788 static void 7789 bdev_nvme_abort_complete(void *ctx) 7790 { 7791 struct nvme_bdev_io *bio = ctx; 7792 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7793 7794 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7795 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7796 } else { 7797 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7798 } 7799 } 7800 7801 static void 7802 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7803 { 7804 struct nvme_bdev_io *bio = ref; 7805 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7806 7807 bio->cpl = *cpl; 7808 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7809 } 7810 7811 static void 7812 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7813 { 7814 struct nvme_bdev_io *bio = ref; 7815 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7816 7817 bio->cpl = *cpl; 7818 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7819 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7820 } 7821 7822 static void 7823 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7824 { 7825 struct nvme_bdev_io *bio = ref; 7826 struct iovec *iov; 7827 7828 bio->iov_offset = sgl_offset; 7829 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7830 iov = &bio->iovs[bio->iovpos]; 7831 if (bio->iov_offset < iov->iov_len) { 7832 break; 7833 } 7834 7835 bio->iov_offset -= iov->iov_len; 7836 } 7837 } 7838 7839 static int 7840 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7841 { 7842 struct nvme_bdev_io *bio = ref; 7843 struct iovec *iov; 7844 7845 assert(bio->iovpos < bio->iovcnt); 7846 7847 iov = &bio->iovs[bio->iovpos]; 7848 7849 *address = iov->iov_base; 7850 *length = iov->iov_len; 7851 7852 if (bio->iov_offset) { 7853 assert(bio->iov_offset <= iov->iov_len); 7854 *address += bio->iov_offset; 7855 *length -= bio->iov_offset; 7856 } 7857 7858 bio->iov_offset += *length; 7859 if (bio->iov_offset == iov->iov_len) { 7860 bio->iovpos++; 7861 bio->iov_offset = 0; 7862 } 7863 7864 return 0; 7865 } 7866 7867 static void 7868 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 7869 { 7870 struct nvme_bdev_io *bio = ref; 7871 struct iovec *iov; 7872 7873 bio->fused_iov_offset = sgl_offset; 
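	/* Descriptive comment (added): skip whole iovecs until the requested SGL
	 * offset falls inside the current one; the remainder is kept in
	 * fused_iov_offset.
	 */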
7874 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 7875 iov = &bio->fused_iovs[bio->fused_iovpos]; 7876 if (bio->fused_iov_offset < iov->iov_len) { 7877 break; 7878 } 7879 7880 bio->fused_iov_offset -= iov->iov_len; 7881 } 7882 } 7883 7884 static int 7885 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 7886 { 7887 struct nvme_bdev_io *bio = ref; 7888 struct iovec *iov; 7889 7890 assert(bio->fused_iovpos < bio->fused_iovcnt); 7891 7892 iov = &bio->fused_iovs[bio->fused_iovpos]; 7893 7894 *address = iov->iov_base; 7895 *length = iov->iov_len; 7896 7897 if (bio->fused_iov_offset) { 7898 assert(bio->fused_iov_offset <= iov->iov_len); 7899 *address += bio->fused_iov_offset; 7900 *length -= bio->fused_iov_offset; 7901 } 7902 7903 bio->fused_iov_offset += *length; 7904 if (bio->fused_iov_offset == iov->iov_len) { 7905 bio->fused_iovpos++; 7906 bio->fused_iov_offset = 0; 7907 } 7908 7909 return 0; 7910 } 7911 7912 static int 7913 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7914 void *md, uint64_t lba_count, uint64_t lba) 7915 { 7916 int rc; 7917 7918 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 7919 lba_count, lba); 7920 7921 bio->iovs = iov; 7922 bio->iovcnt = iovcnt; 7923 bio->iovpos = 0; 7924 bio->iov_offset = 0; 7925 7926 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 7927 bio->io_path->qpair->qpair, 7928 lba, lba_count, 7929 bdev_nvme_no_pi_readv_done, bio, 0, 7930 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 7931 md, 0, 0); 7932 7933 if (rc != 0 && rc != -ENOMEM) { 7934 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 7935 } 7936 return rc; 7937 } 7938 7939 static int 7940 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7941 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7942 struct spdk_memory_domain *domain, void *domain_ctx, 7943 struct spdk_accel_sequence *seq) 7944 { 7945 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 7946 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 7947 int rc; 7948 7949 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 7950 lba_count, lba); 7951 7952 bio->iovs = iov; 7953 bio->iovcnt = iovcnt; 7954 bio->iovpos = 0; 7955 bio->iov_offset = 0; 7956 7957 if (domain != NULL || seq != NULL) { 7958 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 7959 bio->ext_opts.memory_domain = domain; 7960 bio->ext_opts.memory_domain_ctx = domain_ctx; 7961 bio->ext_opts.io_flags = flags; 7962 bio->ext_opts.metadata = md; 7963 bio->ext_opts.accel_sequence = seq; 7964 7965 if (iovcnt == 1) { 7966 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 7967 bio, &bio->ext_opts); 7968 } else { 7969 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 7970 bdev_nvme_readv_done, bio, 7971 bdev_nvme_queued_reset_sgl, 7972 bdev_nvme_queued_next_sge, 7973 &bio->ext_opts); 7974 } 7975 } else if (iovcnt == 1) { 7976 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 7977 md, lba, lba_count, bdev_nvme_readv_done, 7978 bio, flags, 0, 0); 7979 } else { 7980 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 7981 bdev_nvme_readv_done, bio, flags, 7982 bdev_nvme_queued_reset_sgl, 7983 bdev_nvme_queued_next_sge, md, 0, 0); 7984 } 7985 7986 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 7987 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 7988 } 7989 
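	/* Descriptive comment (added): -ENOMEM is excluded from the error log above,
	 * since it only indicates a transient lack of free request objects rather
	 * than a submission error.
	 */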
return rc; 7990 } 7991 7992 static int 7993 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 7994 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 7995 struct spdk_memory_domain *domain, void *domain_ctx, 7996 struct spdk_accel_sequence *seq, 7997 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 7998 { 7999 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8000 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8001 int rc; 8002 8003 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8004 lba_count, lba); 8005 8006 bio->iovs = iov; 8007 bio->iovcnt = iovcnt; 8008 bio->iovpos = 0; 8009 bio->iov_offset = 0; 8010 8011 if (domain != NULL || seq != NULL) { 8012 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8013 bio->ext_opts.memory_domain = domain; 8014 bio->ext_opts.memory_domain_ctx = domain_ctx; 8015 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 8016 bio->ext_opts.cdw13 = cdw13.raw; 8017 bio->ext_opts.metadata = md; 8018 bio->ext_opts.accel_sequence = seq; 8019 8020 if (iovcnt == 1) { 8021 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 8022 bio, &bio->ext_opts); 8023 } else { 8024 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 8025 bdev_nvme_writev_done, bio, 8026 bdev_nvme_queued_reset_sgl, 8027 bdev_nvme_queued_next_sge, 8028 &bio->ext_opts); 8029 } 8030 } else if (iovcnt == 1) { 8031 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 8032 md, lba, lba_count, bdev_nvme_writev_done, 8033 bio, flags, 0, 0); 8034 } else { 8035 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8036 bdev_nvme_writev_done, bio, flags, 8037 bdev_nvme_queued_reset_sgl, 8038 bdev_nvme_queued_next_sge, md, 0, 0); 8039 } 8040 8041 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8042 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 8043 } 8044 return rc; 8045 } 8046 8047 static int 8048 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8049 void *md, uint64_t lba_count, uint64_t zslba, 8050 uint32_t flags) 8051 { 8052 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8053 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8054 int rc; 8055 8056 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 8057 lba_count, zslba); 8058 8059 bio->iovs = iov; 8060 bio->iovcnt = iovcnt; 8061 bio->iovpos = 0; 8062 bio->iov_offset = 0; 8063 8064 if (iovcnt == 1) { 8065 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 8066 lba_count, 8067 bdev_nvme_zone_appendv_done, bio, 8068 flags, 8069 0, 0); 8070 } else { 8071 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 8072 bdev_nvme_zone_appendv_done, bio, flags, 8073 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8074 md, 0, 0); 8075 } 8076 8077 if (rc != 0 && rc != -ENOMEM) { 8078 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 8079 } 8080 return rc; 8081 } 8082 8083 static int 8084 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8085 void *md, uint64_t lba_count, uint64_t lba, 8086 uint32_t flags) 8087 { 8088 int rc; 8089 8090 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8091 lba_count, lba); 8092 8093 bio->iovs = iov; 8094 bio->iovcnt = iovcnt; 8095 bio->iovpos = 0; 8096 bio->iov_offset = 0; 8097 8098 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 8099 bio->io_path->qpair->qpair, 8100 lba, lba_count, 8101 bdev_nvme_comparev_done, bio, flags, 8102 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8103 md, 0, 0); 8104 8105 if (rc != 0 && rc != -ENOMEM) { 8106 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 8107 } 8108 return rc; 8109 } 8110 8111 static int 8112 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 8113 struct iovec *write_iov, int write_iovcnt, 8114 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 8115 { 8116 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8117 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8118 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8119 int rc; 8120 8121 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8122 lba_count, lba); 8123 8124 bio->iovs = cmp_iov; 8125 bio->iovcnt = cmp_iovcnt; 8126 bio->iovpos = 0; 8127 bio->iov_offset = 0; 8128 bio->fused_iovs = write_iov; 8129 bio->fused_iovcnt = write_iovcnt; 8130 bio->fused_iovpos = 0; 8131 bio->fused_iov_offset = 0; 8132 8133 if (bdev_io->num_retries == 0) { 8134 bio->first_fused_submitted = false; 8135 bio->first_fused_completed = false; 8136 } 8137 8138 if (!bio->first_fused_submitted) { 8139 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8140 memset(&bio->cpl, 0, sizeof(bio->cpl)); 8141 8142 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 8143 bdev_nvme_comparev_and_writev_done, bio, flags, 8144 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 8145 if (rc == 0) { 8146 bio->first_fused_submitted = true; 8147 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8148 } else { 8149 if (rc != -ENOMEM) { 8150 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 8151 } 8152 return rc; 8153 } 8154 } 8155 8156 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 8157 8158 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8159 bdev_nvme_comparev_and_writev_done, bio, flags, 8160 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 8161 if (rc != 0 && rc != -ENOMEM) { 8162 SPDK_ERRLOG("write failed: rc = %d\n", rc); 8163 rc = 0; 8164 } 8165 8166 return rc; 8167 } 8168 8169 static int 8170 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8171 { 8172 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 8173 struct spdk_nvme_dsm_range *range; 8174 uint64_t offset, remaining; 8175 uint64_t num_ranges_u64; 8176 uint16_t num_ranges; 8177 int rc; 8178 8179 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8180 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8181 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8182 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8183 return -EINVAL; 8184 } 8185 num_ranges = (uint16_t)num_ranges_u64; 8186 8187 offset = offset_blocks; 8188 remaining = num_blocks; 8189 range = &dsm_ranges[0]; 8190 8191 /* Fill max-size ranges until the remaining blocks fit into one range */ 8192 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8193 range->attributes.raw = 0; 8194 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8195 range->starting_lba = offset; 8196 8197 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8198 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8199 range++; 8200 } 8201 8202 /* Final range describes the remaining 
blocks */ 8203 range->attributes.raw = 0; 8204 range->length = remaining; 8205 range->starting_lba = offset; 8206 8207 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8208 bio->io_path->qpair->qpair, 8209 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8210 dsm_ranges, num_ranges, 8211 bdev_nvme_queued_done, bio); 8212 8213 return rc; 8214 } 8215 8216 static int 8217 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8218 { 8219 if (num_blocks > UINT16_MAX + 1) { 8220 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8221 return -EINVAL; 8222 } 8223 8224 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8225 bio->io_path->qpair->qpair, 8226 offset_blocks, num_blocks, 8227 bdev_nvme_queued_done, bio, 8228 0); 8229 } 8230 8231 static int 8232 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8233 struct spdk_bdev_zone_info *info) 8234 { 8235 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8236 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8237 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8238 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8239 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8240 8241 if (zone_id % zone_size != 0) { 8242 return -EINVAL; 8243 } 8244 8245 if (num_zones > total_zones || !num_zones) { 8246 return -EINVAL; 8247 } 8248 8249 assert(!bio->zone_report_buf); 8250 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8251 if (!bio->zone_report_buf) { 8252 return -ENOMEM; 8253 } 8254 8255 bio->handled_zones = 0; 8256 8257 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8258 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8259 bdev_nvme_get_zone_info_done, bio); 8260 } 8261 8262 static int 8263 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8264 enum spdk_bdev_zone_action action) 8265 { 8266 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8267 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8268 8269 switch (action) { 8270 case SPDK_BDEV_ZONE_CLOSE: 8271 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8272 bdev_nvme_zone_management_done, bio); 8273 case SPDK_BDEV_ZONE_FINISH: 8274 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8275 bdev_nvme_zone_management_done, bio); 8276 case SPDK_BDEV_ZONE_OPEN: 8277 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8278 bdev_nvme_zone_management_done, bio); 8279 case SPDK_BDEV_ZONE_RESET: 8280 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8281 bdev_nvme_zone_management_done, bio); 8282 case SPDK_BDEV_ZONE_OFFLINE: 8283 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8284 bdev_nvme_zone_management_done, bio); 8285 default: 8286 return -EINVAL; 8287 } 8288 } 8289 8290 static void 8291 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8292 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8293 { 8294 struct nvme_io_path *io_path; 8295 struct nvme_ctrlr *nvme_ctrlr; 8296 uint32_t max_xfer_size; 8297 int rc = -ENXIO; 8298 8299 /* Choose the first ctrlr which is not failed. */ 8300 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8301 nvme_ctrlr = io_path->qpair->ctrlr; 8302 8303 /* We should skip any unavailable nvme_ctrlr rather than checking 8304 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8305 */ 8306 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8307 continue; 8308 } 8309 8310 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8311 8312 if (nbytes > max_xfer_size) { 8313 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8314 rc = -EINVAL; 8315 goto err; 8316 } 8317 8318 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8319 bdev_nvme_admin_passthru_done, bio); 8320 if (rc == 0) { 8321 return; 8322 } 8323 } 8324 8325 err: 8326 bdev_nvme_admin_complete(bio, rc); 8327 } 8328 8329 static int 8330 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8331 void *buf, size_t nbytes) 8332 { 8333 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8334 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8335 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8336 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8337 8338 if (nbytes > max_xfer_size) { 8339 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8340 return -EINVAL; 8341 } 8342 8343 /* 8344 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8345 * so fill it out automatically. 8346 */ 8347 cmd->nsid = spdk_nvme_ns_get_id(ns); 8348 8349 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8350 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8351 } 8352 8353 static int 8354 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8355 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8356 { 8357 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8358 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8359 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8360 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8361 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8362 8363 if (nbytes > max_xfer_size) { 8364 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8365 return -EINVAL; 8366 } 8367 8368 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8369 SPDK_ERRLOG("invalid meta data buffer size\n"); 8370 return -EINVAL; 8371 } 8372 8373 /* 8374 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8375 * so fill it out automatically. 
8376 */ 8377 cmd->nsid = spdk_nvme_ns_get_id(ns); 8378 8379 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8380 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8381 } 8382 8383 static int 8384 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8385 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8386 size_t nbytes, void *md_buf, size_t md_len) 8387 { 8388 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8389 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8390 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8391 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8392 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8393 8394 bio->iovs = iov; 8395 bio->iovcnt = iovcnt; 8396 bio->iovpos = 0; 8397 bio->iov_offset = 0; 8398 8399 if (nbytes > max_xfer_size) { 8400 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8401 return -EINVAL; 8402 } 8403 8404 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8405 SPDK_ERRLOG("invalid meta data buffer size\n"); 8406 return -EINVAL; 8407 } 8408 8409 /* 8410 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8411 * require a nsid, so fill it out automatically. 8412 */ 8413 cmd->nsid = spdk_nvme_ns_get_id(ns); 8414 8415 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8416 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8417 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8418 } 8419 8420 static void 8421 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8422 struct nvme_bdev_io *bio_to_abort) 8423 { 8424 struct nvme_io_path *io_path; 8425 int rc = 0; 8426 8427 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8428 if (rc == 0) { 8429 bdev_nvme_admin_complete(bio, 0); 8430 return; 8431 } 8432 8433 io_path = bio_to_abort->io_path; 8434 if (io_path != NULL) { 8435 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8436 io_path->qpair->qpair, 8437 bio_to_abort, 8438 bdev_nvme_abort_done, bio); 8439 } else { 8440 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8441 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8442 NULL, 8443 bio_to_abort, 8444 bdev_nvme_abort_done, bio); 8445 8446 if (rc != -ENOENT) { 8447 break; 8448 } 8449 } 8450 } 8451 8452 if (rc != 0) { 8453 /* If no command was found or there was any error, complete the abort 8454 * request with failure. 
8455 */ 8456 bdev_nvme_admin_complete(bio, rc); 8457 } 8458 } 8459 8460 static int 8461 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8462 uint64_t num_blocks) 8463 { 8464 struct spdk_nvme_scc_source_range range = { 8465 .slba = src_offset_blocks, 8466 .nlb = num_blocks - 1 8467 }; 8468 8469 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8470 bio->io_path->qpair->qpair, 8471 &range, 1, dst_offset_blocks, 8472 bdev_nvme_queued_done, bio); 8473 } 8474 8475 static void 8476 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8477 { 8478 const char *action; 8479 uint32_t i; 8480 8481 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8482 action = "reset"; 8483 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8484 action = "abort"; 8485 } else { 8486 action = "none"; 8487 } 8488 8489 spdk_json_write_object_begin(w); 8490 8491 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8492 8493 spdk_json_write_named_object_begin(w, "params"); 8494 spdk_json_write_named_string(w, "action_on_timeout", action); 8495 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8496 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8497 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8498 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8499 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8500 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8501 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8502 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8503 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8504 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8505 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8506 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8507 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8508 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8509 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8510 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8511 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8512 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8513 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8514 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8515 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8516 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8517 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8518 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8519 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8520 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8521 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8522 for (i = 0; i < 32; ++i) { 8523 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8524 
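	/* Descriptive comment (added): translate each enabled bit position into its
			 * digest name for the JSON output.
			 */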
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8525 } 8526 } 8527 spdk_json_write_array_end(w); 8528 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8529 for (i = 0; i < 32; ++i) { 8530 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8531 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8532 } 8533 } 8534 8535 spdk_json_write_array_end(w); 8536 spdk_json_write_object_end(w); 8537 8538 spdk_json_write_object_end(w); 8539 } 8540 8541 static void 8542 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8543 { 8544 struct spdk_nvme_transport_id trid; 8545 8546 spdk_json_write_object_begin(w); 8547 8548 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8549 8550 spdk_json_write_named_object_begin(w, "params"); 8551 spdk_json_write_named_string(w, "name", ctx->name); 8552 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8553 8554 trid = ctx->trid; 8555 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8556 nvme_bdev_dump_trid_json(&trid, w); 8557 8558 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8559 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8560 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8561 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8562 ctx->bdev_opts.fast_io_fail_timeout_sec); 8563 spdk_json_write_object_end(w); 8564 8565 spdk_json_write_object_end(w); 8566 } 8567 8568 #ifdef SPDK_CONFIG_NVME_CUSE 8569 static void 8570 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8571 struct nvme_ctrlr *nvme_ctrlr) 8572 { 8573 size_t cuse_name_size = 128; 8574 char cuse_name[cuse_name_size]; 8575 8576 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8577 cuse_name, &cuse_name_size) != 0) { 8578 return; 8579 } 8580 8581 spdk_json_write_object_begin(w); 8582 8583 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8584 8585 spdk_json_write_named_object_begin(w, "params"); 8586 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8587 spdk_json_write_object_end(w); 8588 8589 spdk_json_write_object_end(w); 8590 } 8591 #endif 8592 8593 static void 8594 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8595 struct nvme_ctrlr *nvme_ctrlr, 8596 struct nvme_path_id *path_id) 8597 { 8598 struct spdk_nvme_transport_id *trid; 8599 const struct spdk_nvme_ctrlr_opts *opts; 8600 8601 if (nvme_ctrlr->opts.from_discovery_service) { 8602 /* Do not emit an RPC for this - it will be implicitly 8603 * covered by a separate bdev_nvme_start_discovery or 8604 * bdev_nvme_start_mdns_discovery RPC. 
8605 */ 8606 return; 8607 } 8608 8609 trid = &path_id->trid; 8610 8611 spdk_json_write_object_begin(w); 8612 8613 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8614 8615 spdk_json_write_named_object_begin(w, "params"); 8616 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8617 nvme_bdev_dump_trid_json(trid, w); 8618 spdk_json_write_named_bool(w, "prchk_reftag", 8619 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8620 spdk_json_write_named_bool(w, "prchk_guard", 8621 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8622 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8623 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8624 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8625 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8626 if (nvme_ctrlr->psk != NULL) { 8627 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8628 } 8629 if (nvme_ctrlr->dhchap_key != NULL) { 8630 spdk_json_write_named_string(w, "dhchap_key", 8631 spdk_key_get_name(nvme_ctrlr->dhchap_key)); 8632 } 8633 if (nvme_ctrlr->dhchap_ctrlr_key != NULL) { 8634 spdk_json_write_named_string(w, "dhchap_ctrlr_key", 8635 spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key)); 8636 } 8637 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8638 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8639 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8640 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8641 if (opts->src_addr[0] != '\0') { 8642 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8643 } 8644 if (opts->src_svcid[0] != '\0') { 8645 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8646 } 8647 8648 if (nvme_ctrlr->opts.multipath) { 8649 spdk_json_write_named_string(w, "multipath", "multipath"); 8650 } 8651 spdk_json_write_object_end(w); 8652 8653 spdk_json_write_object_end(w); 8654 } 8655 8656 static void 8657 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8658 { 8659 spdk_json_write_object_begin(w); 8660 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8661 8662 spdk_json_write_named_object_begin(w, "params"); 8663 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8664 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8665 spdk_json_write_object_end(w); 8666 8667 spdk_json_write_object_end(w); 8668 } 8669 8670 static int 8671 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8672 { 8673 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8674 struct nvme_ctrlr *nvme_ctrlr; 8675 struct discovery_ctx *ctx; 8676 struct nvme_path_id *path_id; 8677 8678 bdev_nvme_opts_config_json(w); 8679 8680 pthread_mutex_lock(&g_bdev_nvme_mutex); 8681 8682 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8683 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8684 path_id = nvme_ctrlr->active_path_id; 8685 assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 8686 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 8687 8688 path_id = TAILQ_NEXT(path_id, link); 8689 while (path_id != NULL) { 8690 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 8691 path_id = TAILQ_NEXT(path_id, link); 8692 } 8693 8694 #ifdef SPDK_CONFIG_NVME_CUSE 8695 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8696 #endif 8697 } 8698 } 8699 8700 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8701 if (!ctx->from_mdns_discovery_service) { 
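	/* Descriptive comment (added): discovery services started via mDNS are
			 * covered by bdev_nvme_mdns_discovery_config_json() below instead.
			 */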
8702 bdev_nvme_discovery_config_json(w, ctx); 8703 } 8704 } 8705 8706 bdev_nvme_mdns_discovery_config_json(w); 8707 8708 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8709 * before enabling hotplug poller. 8710 */ 8711 bdev_nvme_hotplug_config_json(w); 8712 8713 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8714 return 0; 8715 } 8716 8717 struct spdk_nvme_ctrlr * 8718 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8719 { 8720 struct nvme_bdev *nbdev; 8721 struct nvme_ns *nvme_ns; 8722 8723 if (!bdev || bdev->module != &nvme_if) { 8724 return NULL; 8725 } 8726 8727 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8728 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8729 assert(nvme_ns != NULL); 8730 8731 return nvme_ns->ctrlr->ctrlr; 8732 } 8733 8734 static bool 8735 nvme_io_path_is_current(struct nvme_io_path *io_path) 8736 { 8737 const struct nvme_bdev_channel *nbdev_ch; 8738 bool current; 8739 8740 if (!nvme_io_path_is_available(io_path)) { 8741 return false; 8742 } 8743 8744 nbdev_ch = io_path->nbdev_ch; 8745 if (nbdev_ch == NULL) { 8746 current = false; 8747 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8748 struct nvme_io_path *optimized_io_path = NULL; 8749 8750 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8751 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8752 break; 8753 } 8754 } 8755 8756 /* A non-optimized path is only current if there are no optimized paths. */ 8757 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 8758 (optimized_io_path == NULL); 8759 } else { 8760 if (nbdev_ch->current_io_path) { 8761 current = (io_path == nbdev_ch->current_io_path); 8762 } else { 8763 struct nvme_io_path *first_path; 8764 8765 /* We arrived here as there are no optimized paths for active-passive 8766 * mode. Check if this io_path is the first one available on the list. 8767 */ 8768 current = false; 8769 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 8770 if (nvme_io_path_is_available(first_path)) { 8771 current = (io_path == first_path); 8772 break; 8773 } 8774 } 8775 } 8776 } 8777 8778 return current; 8779 } 8780 8781 static struct nvme_ctrlr * 8782 bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev) 8783 { 8784 struct nvme_ctrlr *next; 8785 8786 /* Must be called under g_bdev_nvme_mutex */ 8787 next = prev != NULL ? 
TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 8788 while (next != NULL) { 8789 /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */ 8790 pthread_mutex_lock(&next->mutex); 8791 if (next->ref > 0) { 8792 next->ref++; 8793 pthread_mutex_unlock(&next->mutex); 8794 return next; 8795 } 8796 8797 pthread_mutex_unlock(&next->mutex); 8798 next = TAILQ_NEXT(next, tailq); 8799 } 8800 8801 return NULL; 8802 } 8803 8804 struct bdev_nvme_set_keys_ctx { 8805 struct nvme_ctrlr *nctrlr; 8806 struct spdk_key *dhchap_key; 8807 struct spdk_key *dhchap_ctrlr_key; 8808 struct spdk_thread *thread; 8809 bdev_nvme_set_keys_cb cb_fn; 8810 void *cb_ctx; 8811 int status; 8812 }; 8813 8814 static void 8815 bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx) 8816 { 8817 if (ctx == NULL) { 8818 return; 8819 } 8820 8821 spdk_keyring_put_key(ctx->dhchap_key); 8822 spdk_keyring_put_key(ctx->dhchap_ctrlr_key); 8823 free(ctx); 8824 } 8825 8826 static void 8827 _bdev_nvme_set_keys_done(void *_ctx) 8828 { 8829 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 8830 8831 ctx->cb_fn(ctx->cb_ctx, ctx->status); 8832 8833 if (ctx->nctrlr != NULL) { 8834 nvme_ctrlr_release(ctx->nctrlr); 8835 } 8836 bdev_nvme_free_set_keys_ctx(ctx); 8837 } 8838 8839 static void 8840 bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status) 8841 { 8842 ctx->status = status; 8843 spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx); 8844 } 8845 8846 static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx); 8847 8848 static void 8849 bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx) 8850 { 8851 struct nvme_ctrlr *next; 8852 8853 pthread_mutex_lock(&g_bdev_nvme_mutex); 8854 next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr); 8855 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8856 8857 nvme_ctrlr_release(ctx->nctrlr); 8858 ctx->nctrlr = next; 8859 8860 if (next == NULL) { 8861 bdev_nvme_set_keys_done(ctx, 0); 8862 } else { 8863 bdev_nvme_authenticate_ctrlr(ctx); 8864 } 8865 } 8866 8867 static void 8868 bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status) 8869 { 8870 struct bdev_nvme_set_keys_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 8871 8872 if (status != 0) { 8873 bdev_nvme_set_keys_done(ctx, status); 8874 return; 8875 } 8876 bdev_nvme_authenticate_ctrlr_continue(ctx); 8877 } 8878 8879 static void 8880 bdev_nvme_authenticate_qpair_done(void *ctx, int status) 8881 { 8882 spdk_for_each_channel_continue(ctx, status); 8883 } 8884 8885 static void 8886 bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i) 8887 { 8888 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8889 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 8890 struct nvme_qpair *qpair = ctrlr_ch->qpair; 8891 int rc; 8892 8893 if (!nvme_qpair_is_connected(qpair)) { 8894 spdk_for_each_channel_continue(i, 0); 8895 return; 8896 } 8897 8898 rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i); 8899 if (rc != 0) { 8900 spdk_for_each_channel_continue(i, rc); 8901 } 8902 } 8903 8904 static void 8905 bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status) 8906 { 8907 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 8908 8909 if (status != 0) { 8910 bdev_nvme_set_keys_done(ctx, status); 8911 return; 8912 } 8913 8914 spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx, 8915 bdev_nvme_authenticate_qpairs_done); 8916 } 8917 8918 static void 8919 bdev_nvme_authenticate_ctrlr(struct 
bdev_nvme_set_keys_ctx *ctx) 8920 { 8921 struct spdk_nvme_ctrlr_key_opts opts = {}; 8922 struct nvme_ctrlr *nctrlr = ctx->nctrlr; 8923 int rc; 8924 8925 opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key); 8926 opts.dhchap_key = ctx->dhchap_key; 8927 opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key; 8928 rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts); 8929 if (rc != 0) { 8930 bdev_nvme_set_keys_done(ctx, rc); 8931 return; 8932 } 8933 8934 if (ctx->dhchap_key != NULL) { 8935 rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr, 8936 bdev_nvme_authenticate_ctrlr_done, ctx); 8937 if (rc != 0) { 8938 bdev_nvme_set_keys_done(ctx, rc); 8939 } 8940 } else { 8941 bdev_nvme_authenticate_ctrlr_continue(ctx); 8942 } 8943 } 8944 8945 int 8946 bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key, 8947 bdev_nvme_set_keys_cb cb_fn, void *cb_ctx) 8948 { 8949 struct bdev_nvme_set_keys_ctx *ctx; 8950 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8951 struct nvme_ctrlr *nctrlr; 8952 8953 ctx = calloc(1, sizeof(*ctx)); 8954 if (ctx == NULL) { 8955 return -ENOMEM; 8956 } 8957 8958 if (dhchap_key != NULL) { 8959 ctx->dhchap_key = spdk_keyring_get_key(dhchap_key); 8960 if (ctx->dhchap_key == NULL) { 8961 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name); 8962 bdev_nvme_free_set_keys_ctx(ctx); 8963 return -ENOKEY; 8964 } 8965 } 8966 if (dhchap_ctrlr_key != NULL) { 8967 ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key); 8968 if (ctx->dhchap_ctrlr_key == NULL) { 8969 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name); 8970 bdev_nvme_free_set_keys_ctx(ctx); 8971 return -ENOKEY; 8972 } 8973 } 8974 8975 pthread_mutex_lock(&g_bdev_nvme_mutex); 8976 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 8977 if (nbdev_ctrlr == NULL) { 8978 SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name); 8979 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8980 bdev_nvme_free_set_keys_ctx(ctx); 8981 return -ENODEV; 8982 } 8983 nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL); 8984 if (nctrlr == NULL) { 8985 SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name); 8986 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8987 bdev_nvme_free_set_keys_ctx(ctx); 8988 return -ENODEV; 8989 } 8990 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8991 8992 ctx->nctrlr = nctrlr; 8993 ctx->cb_fn = cb_fn; 8994 ctx->cb_ctx = cb_ctx; 8995 ctx->thread = spdk_get_thread(); 8996 8997 bdev_nvme_authenticate_ctrlr(ctx); 8998 8999 return 0; 9000 } 9001 9002 void 9003 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 9004 { 9005 struct nvme_ns *nvme_ns = io_path->nvme_ns; 9006 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 9007 const struct spdk_nvme_ctrlr_data *cdata; 9008 const struct spdk_nvme_transport_id *trid; 9009 const char *adrfam_str; 9010 9011 spdk_json_write_object_begin(w); 9012 9013 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 9014 9015 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 9016 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 9017 9018 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 9019 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 9020 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 9021 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 9022 9023 spdk_json_write_named_object_begin(w, "transport"); 9024 spdk_json_write_named_string(w, "trtype", trid->trstring); 9025 
spdk_json_write_named_string(w, "traddr", trid->traddr); 9026 if (trid->trsvcid[0] != '\0') { 9027 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 9028 } 9029 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 9030 if (adrfam_str) { 9031 spdk_json_write_named_string(w, "adrfam", adrfam_str); 9032 } 9033 spdk_json_write_object_end(w); 9034 9035 spdk_json_write_object_end(w); 9036 } 9037 9038 void 9039 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 9040 { 9041 struct discovery_ctx *ctx; 9042 struct discovery_entry_ctx *entry_ctx; 9043 9044 spdk_json_write_array_begin(w); 9045 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9046 spdk_json_write_object_begin(w); 9047 spdk_json_write_named_string(w, "name", ctx->name); 9048 9049 spdk_json_write_named_object_begin(w, "trid"); 9050 nvme_bdev_dump_trid_json(&ctx->trid, w); 9051 spdk_json_write_object_end(w); 9052 9053 spdk_json_write_named_array_begin(w, "referrals"); 9054 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 9055 spdk_json_write_object_begin(w); 9056 spdk_json_write_named_object_begin(w, "trid"); 9057 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 9058 spdk_json_write_object_end(w); 9059 spdk_json_write_object_end(w); 9060 } 9061 spdk_json_write_array_end(w); 9062 9063 spdk_json_write_object_end(w); 9064 } 9065 spdk_json_write_array_end(w); 9066 } 9067 9068 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 9069 9070 static void 9071 bdev_nvme_trace(void) 9072 { 9073 struct spdk_trace_tpoint_opts opts[] = { 9074 { 9075 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 9076 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 9077 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9078 }, 9079 { 9080 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 9081 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 9082 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9083 } 9084 }; 9085 9086 9087 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 9088 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9089 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9090 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9091 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9092 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9093 } 9094 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 9095