/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define CTRLR_STRING(nvme_ctrlr) \
	(spdk_nvme_trtype_is_fabrics(nvme_ctrlr->active_path_id->trid.trtype) ? \
	nvme_ctrlr->active_path_id->trid.subnqn : nvme_ctrlr->active_path_id->trid.traddr)

#define CTRLR_ID(nvme_ctrlr)	(spdk_nvme_ctrlr_get_id(nvme_ctrlr->ctrlr))

#define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \
	SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \
	SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \
	SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#ifdef DEBUG
#define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \
	SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);
#else
#define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0)
#endif

#define BDEV_STRING(nbdev) (nbdev->disk.name)

#define NVME_BDEV_ERRLOG(nbdev, format, ...) \
	SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_WARNLOG(nbdev, format, ...) \
	SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_NOTICELOG(nbdev, format, ...) \
	SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_INFOLOG(nbdev, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** Offset in current fused iovec. */
	uint32_t fused_iov_offset;

	/** array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current fused iovec position. */
	int fused_iovpos;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;

	/* Used to put nvme_bdev_io into the list */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.transport_ack_timeout = 0,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq,
			    union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	const struct spdk_nvme_ctrlr_opts *opts;
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
		    strcmp(hostnqn, opts->hostnqn) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

struct nvme_ctrlr_channel_iter {
	nvme_ctrlr_for_each_channel_msg fn;
	nvme_ctrlr_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

void
nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static void
nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);

	iter->i = i;
	iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx);
}

static void
nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	iter->i = i;
	iter->cpl(nvme_ctrlr, iter->ctx, status);

	free(iter);
}

void
nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
			    nvme_ctrlr_for_each_channel_msg fn, void *ctx,
			    nvme_ctrlr_for_each_channel_done cpl)
{
	struct nvme_ctrlr_channel_iter *iter;

	assert(nvme_ctrlr != NULL && fn != NULL);

	iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg,
			      iter, nvme_ctrlr_each_channel_cpl);
}

struct nvme_bdev_channel_iter {
	nvme_bdev_for_each_channel_msg fn;
	nvme_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

void
nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static void
nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);

	iter->i = i;
	iter->fn(iter, nbdev, nbdev_ch, iter->ctx);
}

static void
nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);

	iter->i = i;
	iter->cpl(nbdev, iter->ctx, status);

	free(iter);
}

void
nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
			   nvme_bdev_for_each_channel_msg fn, void *ctx,
			   nvme_bdev_for_each_channel_done cpl)
{
	struct nvme_bdev_channel_iter *iter;

	assert(nbdev != NULL && fn != NULL);

	iter = calloc(1, sizeof(struct nvme_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter,
			      nvme_bdev_each_channel_cpl);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);
	spdk_keyring_put_key(nvme_ctrlr->psk);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
	free(nvme_ctrlr);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_active(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	if (spdk_unlikely(nvme_ns->ns == NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

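/* Descriptive comment added for clarity (not in the original source): an io_path is
 * usable for I/O submission only if its qpair is connected and its namespace is
 * accessible, i.e. the ANA state is optimized or non-optimized.
 */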
static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_available(io_path))) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				assert(false);
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache even non_optimized
		 * path for load balance across multiple non_optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	if (nbdev_ch->resetting) {
		return false;
	}

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct nvme_bdev_io *bio, *tmp_bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);

		bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bio != NULL) {
		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
					   retry_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_bdev_io *bio, *tmp_bio;

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
		__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
			__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
			nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

			bdev_nvme_clear_current_io_path(nbdev_ch);
			bio->io_path = NULL;

			if (any_io_path_may_become_available(nbdev_ch)) {
				bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
				return;
			}
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr,
				    void *ctx, int status)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

static void
bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i,
			      struct nvme_ctrlr *nvme_ctrlr,
			      struct nvme_ctrlr_channel *ctrlr_ch,
			      void *ctx)
{
	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	nvme_ctrlr_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_for_each_channel(nvme_ctrlr,
				    bdev_nvme_clear_io_path_cache,
				    NULL,
				    bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	nvme_ctrlr = nvme_qpair->ctrlr;
	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				NVME_CTRLR_INFOLOG(nvme_ctrlr,
						   "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
						   qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				NVME_CTRLR_INFOLOG(nvme_ctrlr,
						   "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
						   qpair);
				status = 0;
			}
			nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. reset controller.\n",
					   qpair);
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else {
		/* In this case, ctrlr_channel is already deleted. */
		NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. delete nvme_qpair.\n",
				   qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_release(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n",
			   qpair, spdk_nvme_qpair_get_id(qpair));

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);

static void
bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel_iter *i,
				  struct nvme_ctrlr *nvme_ctrlr,
				  struct nvme_ctrlr_channel *ctrlr_ch,
				  void *ctx)
{
	int rc = 0;
	struct nvme_bdev_io *bio;

	if (ctx != NULL) {
		rc = -1;
	}

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bio = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link);

bdev_nvme_reset_io_continue(bio, rc); 1994 } 1995 1996 nvme_ctrlr_for_each_channel_continue(i, 0); 1997 } 1998 1999 /* This function marks the current trid as failed by storing the current ticks 2000 * and then sets the next trid to the active trid within a controller if exists. 2001 * 2002 * The purpose of the boolean return value is to request the caller to disconnect 2003 * the current trid now to try connecting the next trid. 2004 */ 2005 static bool 2006 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 2007 { 2008 struct nvme_path_id *path_id, *next_path; 2009 int rc __attribute__((unused)); 2010 2011 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 2012 assert(path_id); 2013 assert(path_id == nvme_ctrlr->active_path_id); 2014 next_path = TAILQ_NEXT(path_id, link); 2015 2016 /* Update the last failed time. It means the trid is failed if its last 2017 * failed time is non-zero. 2018 */ 2019 path_id->last_failed_tsc = spdk_get_ticks(); 2020 2021 if (next_path == NULL) { 2022 /* There is no alternate trid within a controller. */ 2023 return false; 2024 } 2025 2026 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2027 /* Connect is not retried in a controller reset sequence. Connecting 2028 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 2029 */ 2030 return false; 2031 } 2032 2033 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 2034 2035 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n", 2036 path_id->trid.traddr, path_id->trid.trsvcid, 2037 next_path->trid.traddr, next_path->trid.trsvcid); 2038 2039 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2040 nvme_ctrlr->active_path_id = next_path; 2041 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 2042 assert(rc == 0); 2043 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 2044 if (!remove) { 2045 /** Shuffle the old trid to the end of the list and use the new one. 2046 * Allows for round robin through multiple connections. 2047 */ 2048 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 2049 } else { 2050 free(path_id); 2051 } 2052 2053 if (start || next_path->last_failed_tsc == 0) { 2054 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 2055 * or used yet. Try the next trid now. 2056 */ 2057 return true; 2058 } 2059 2060 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 2061 nvme_ctrlr->opts.reconnect_delay_sec) { 2062 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 2063 return true; 2064 } 2065 2066 /* The next trid will be tried after reconnect_delay_sec seconds. 
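* Returning false tells the caller not to disconnect the current connection right away.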
*/ 2067 return false; 2068 } 2069 2070 static bool 2071 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 2072 { 2073 int32_t elapsed; 2074 2075 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 2076 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 2077 return false; 2078 } 2079 2080 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2081 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 2082 return true; 2083 } else { 2084 return false; 2085 } 2086 } 2087 2088 static bool 2089 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 2090 { 2091 uint32_t elapsed; 2092 2093 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 2094 return false; 2095 } 2096 2097 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2098 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 2099 return true; 2100 } else { 2101 return false; 2102 } 2103 } 2104 2105 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 2106 2107 static void 2108 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 2109 { 2110 int rc; 2111 2112 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n"); 2113 2114 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 2115 if (rc != 0) { 2116 NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n"); 2117 2118 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 2119 * fail the reset sequence immediately. 2120 */ 2121 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2122 return; 2123 } 2124 2125 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 2126 * Set callback here to execute the specified operation after ctrlr is really disconnected. 2127 */ 2128 assert(nvme_ctrlr->disconnected_cb == NULL); 2129 nvme_ctrlr->disconnected_cb = cb_fn; 2130 2131 /* During disconnection, reduce the period to poll adminq more often. */ 2132 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 2133 } 2134 2135 enum bdev_nvme_op_after_reset { 2136 OP_NONE, 2137 OP_COMPLETE_PENDING_DESTRUCT, 2138 OP_DESTRUCT, 2139 OP_DELAYED_RECONNECT, 2140 OP_FAILOVER, 2141 }; 2142 2143 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 2144 2145 static _bdev_nvme_op_after_reset 2146 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 2147 { 2148 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 2149 /* Complete pending destruct after reset completes. 
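* This case takes precedence over the failover, destruct, and delayed reconnect cases handled below.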
*/ 2150 return OP_COMPLETE_PENDING_DESTRUCT; 2151 } else if (nvme_ctrlr->pending_failover) { 2152 nvme_ctrlr->pending_failover = false; 2153 nvme_ctrlr->reset_start_tsc = 0; 2154 return OP_FAILOVER; 2155 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2156 nvme_ctrlr->reset_start_tsc = 0; 2157 return OP_NONE; 2158 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2159 return OP_DESTRUCT; 2160 } else { 2161 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 2162 nvme_ctrlr->fast_io_fail_timedout = true; 2163 } 2164 return OP_DELAYED_RECONNECT; 2165 } 2166 } 2167 2168 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 2169 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 2170 2171 static int 2172 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2173 { 2174 struct nvme_ctrlr *nvme_ctrlr = ctx; 2175 2176 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2177 pthread_mutex_lock(&nvme_ctrlr->mutex); 2178 2179 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2180 2181 if (!nvme_ctrlr->reconnect_is_delayed) { 2182 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2183 return SPDK_POLLER_BUSY; 2184 } 2185 2186 nvme_ctrlr->reconnect_is_delayed = false; 2187 2188 if (nvme_ctrlr->destruct) { 2189 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2190 return SPDK_POLLER_BUSY; 2191 } 2192 2193 assert(nvme_ctrlr->resetting == false); 2194 nvme_ctrlr->resetting = true; 2195 2196 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2197 2198 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2199 2200 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2201 return SPDK_POLLER_BUSY; 2202 } 2203 2204 static void 2205 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2206 { 2207 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2208 2209 assert(nvme_ctrlr->reconnect_is_delayed == false); 2210 nvme_ctrlr->reconnect_is_delayed = true; 2211 2212 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2213 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2214 nvme_ctrlr, 2215 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2216 } 2217 2218 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2219 2220 static void 2221 _bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2222 { 2223 bool success = (ctx == NULL); 2224 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2225 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2226 enum bdev_nvme_op_after_reset op_after_reset; 2227 2228 assert(nvme_ctrlr->thread == spdk_get_thread()); 2229 2230 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2231 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2232 2233 if (!success) { 2234 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n"); 2235 } else { 2236 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n"); 2237 } 2238 2239 pthread_mutex_lock(&nvme_ctrlr->mutex); 2240 nvme_ctrlr->resetting = false; 2241 nvme_ctrlr->dont_retry = false; 2242 nvme_ctrlr->in_failover = false; 2243 2244 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2245 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2246 2247 /* Delay callbacks when the next operation is a failover. */ 2248 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2249 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 
0 : -1); 2250 } 2251 2252 switch (op_after_reset) { 2253 case OP_COMPLETE_PENDING_DESTRUCT: 2254 nvme_ctrlr_unregister(nvme_ctrlr); 2255 break; 2256 case OP_DESTRUCT: 2257 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2258 remove_discovery_entry(nvme_ctrlr); 2259 break; 2260 case OP_DELAYED_RECONNECT: 2261 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2262 break; 2263 case OP_FAILOVER: 2264 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2265 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2266 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2267 break; 2268 default: 2269 break; 2270 } 2271 } 2272 2273 static void 2274 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2275 { 2276 pthread_mutex_lock(&nvme_ctrlr->mutex); 2277 if (!success) { 2278 /* Connecting the active trid failed. Set the next alternate trid to the 2279 * active trid if it exists. 2280 */ 2281 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2282 /* The next alternate trid exists and is ready to try. Try it now. */ 2283 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2284 2285 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n", 2286 nvme_ctrlr->active_path_id->trid.traddr, 2287 nvme_ctrlr->active_path_id->trid.trsvcid); 2288 2289 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2290 return; 2291 } 2292 2293 /* We came here if there is no alternate trid or if the next trid exists but 2294 * is not ready to try. We will try the active trid after reconnect_delay_sec 2295 * seconds if it is non-zero or at the next reset call otherwise. 2296 */ 2297 } else { 2298 /* Connecting the active trid succeeded. Clear the last failed time because it 2299 * means the trid is failed if its last failed time is non-zero. 2300 */ 2301 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2302 } 2303 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2304 2305 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n"); 2306 2307 /* Make sure we clear any pending resets before returning. */ 2308 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2309 bdev_nvme_complete_pending_resets, 2310 success ? NULL : (void *)0x1, 2311 _bdev_nvme_reset_ctrlr_complete); 2312 } 2313 2314 static void 2315 bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2316 { 2317 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2318 } 2319 2320 static void 2321 bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i, 2322 struct nvme_ctrlr *nvme_ctrlr, 2323 struct nvme_ctrlr_channel *ctrlr_ch, void *ctx) 2324 { 2325 struct nvme_qpair *nvme_qpair; 2326 struct spdk_nvme_qpair *qpair; 2327 2328 nvme_qpair = ctrlr_ch->qpair; 2329 assert(nvme_qpair != NULL); 2330 2331 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2332 2333 qpair = nvme_qpair->qpair; 2334 if (qpair != NULL) { 2335 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n", 2336 qpair, spdk_nvme_qpair_get_id(qpair)); 2337 2338 if (nvme_qpair->ctrlr->dont_retry) { 2339 spdk_nvme_qpair_set_abort_dnr(qpair, true); 2340 } 2341 spdk_nvme_ctrlr_disconnect_io_qpair(qpair); 2342 2343 /* The current full reset sequence will move to the next 2344 * ctrlr_channel after the qpair is actually disconnected. 
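* Until then the iterator is parked in ctrlr_ch->reset_iter; the iteration is
* resumed with nvme_ctrlr_for_each_channel_continue() once the disconnect completes.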
2345 */ 2346 assert(ctrlr_ch->reset_iter == NULL); 2347 ctrlr_ch->reset_iter = i; 2348 } else { 2349 nvme_ctrlr_for_each_channel_continue(i, 0); 2350 } 2351 } 2352 2353 static void 2354 bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2355 { 2356 if (status == 0) { 2357 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n"); 2358 2359 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2360 } else { 2361 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were failed to create after ctrlr reset.\n"); 2362 2363 /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ 2364 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2365 bdev_nvme_reset_destroy_qpair, 2366 NULL, 2367 bdev_nvme_reset_create_qpairs_failed); 2368 } 2369 } 2370 2371 static int 2372 bdev_nvme_reset_check_qpair_connected(void *ctx) 2373 { 2374 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2375 struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair; 2376 struct spdk_nvme_qpair *qpair; 2377 2378 if (ctrlr_ch->reset_iter == NULL) { 2379 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2380 assert(ctrlr_ch->connect_poller == NULL); 2381 assert(nvme_qpair->qpair == NULL); 2382 2383 NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, 2384 "qpair was already failed to connect. reset is being aborted.\n"); 2385 return SPDK_POLLER_BUSY; 2386 } 2387 2388 qpair = nvme_qpair->qpair; 2389 assert(qpair != NULL); 2390 2391 if (!spdk_nvme_qpair_is_connected(qpair)) { 2392 return SPDK_POLLER_BUSY; 2393 } 2394 2395 NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n", 2396 qpair, spdk_nvme_qpair_get_id(qpair)); 2397 2398 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2399 2400 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2401 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2402 ctrlr_ch->reset_iter = NULL; 2403 2404 if (!g_opts.disable_auto_failback) { 2405 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2406 } 2407 2408 return SPDK_POLLER_BUSY; 2409 } 2410 2411 static void 2412 bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i, 2413 struct nvme_ctrlr *nvme_ctrlr, 2414 struct nvme_ctrlr_channel *ctrlr_ch, 2415 void *ctx) 2416 { 2417 struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair; 2418 struct spdk_nvme_qpair *qpair; 2419 int rc; 2420 2421 rc = bdev_nvme_create_qpair(nvme_qpair); 2422 if (rc == 0) { 2423 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2424 ctrlr_ch, 0); 2425 2426 qpair = nvme_qpair->qpair; 2427 2428 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking qpair %p:%u to be connected.\n", 2429 qpair, spdk_nvme_qpair_get_id(qpair)); 2430 2431 /* The current full reset sequence will move to the next 2432 * ctrlr_channel after the qpair is actually connected. 2433 */ 2434 assert(ctrlr_ch->reset_iter == NULL); 2435 ctrlr_ch->reset_iter = i; 2436 } else { 2437 nvme_ctrlr_for_each_channel_continue(i, rc); 2438 } 2439 } 2440 2441 static void 2442 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2443 { 2444 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2445 struct nvme_ns *nvme_ns; 2446 2447 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2448 nvme_ns != NULL; 2449 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2450 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2451 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2452 /* NS can be added again. Just nullify nvme_ns->ns. 
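* The nvme_ns structure itself is kept so the namespace can be re-populated if the same NSID becomes active again.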
*/ 2453 nvme_ns->ns = NULL; 2454 } 2455 } 2456 } 2457 2458 2459 static int 2460 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2461 { 2462 struct nvme_ctrlr *nvme_ctrlr = arg; 2463 struct spdk_nvme_transport_id *trid; 2464 int rc = -ETIMEDOUT; 2465 2466 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2467 /* Mark the ctrlr as failed. The next call to 2468 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2469 * do the necessary cleanup and return failure. 2470 */ 2471 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2472 } 2473 2474 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2475 if (rc == -EAGAIN) { 2476 return SPDK_POLLER_BUSY; 2477 } 2478 2479 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2480 if (rc == 0) { 2481 trid = &nvme_ctrlr->active_path_id->trid; 2482 2483 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 2484 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n", 2485 trid->traddr, trid->trsvcid); 2486 } else { 2487 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n"); 2488 } 2489 2490 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2491 2492 /* Recreate all of the I/O queue pairs */ 2493 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2494 bdev_nvme_reset_create_qpair, 2495 NULL, 2496 bdev_nvme_reset_create_qpairs_done); 2497 } else { 2498 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n"); 2499 2500 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2501 } 2502 return SPDK_POLLER_BUSY; 2503 } 2504 2505 static void 2506 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2507 { 2508 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n"); 2509 2510 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2511 2512 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2513 assert(nvme_ctrlr->reset_detach_poller == NULL); 2514 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2515 nvme_ctrlr, 0); 2516 } 2517 2518 static void 2519 bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2520 { 2521 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2522 assert(status == 0); 2523 2524 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n"); 2525 2526 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2527 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2528 } else { 2529 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2530 } 2531 } 2532 2533 static void 2534 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2535 { 2536 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n"); 2537 2538 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2539 bdev_nvme_reset_destroy_qpair, 2540 NULL, 2541 bdev_nvme_reset_destroy_qpair_done); 2542 } 2543 2544 static void 2545 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2546 { 2547 struct nvme_ctrlr *nvme_ctrlr = ctx; 2548 2549 assert(nvme_ctrlr->resetting == true); 2550 assert(nvme_ctrlr->thread == spdk_get_thread()); 2551 2552 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2553 2554 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2555 2556 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2557 } 2558 2559 static void 2560 _bdev_nvme_reset_ctrlr(void *ctx) 2561 { 2562 struct nvme_ctrlr *nvme_ctrlr = ctx; 2563 2564 assert(nvme_ctrlr->resetting == true); 2565 assert(nvme_ctrlr->thread == spdk_get_thread()); 2566 2567 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2568 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs); 2569 } 
else { 2570 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2571 } 2572 } 2573 2574 static int 2575 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2576 { 2577 spdk_msg_fn msg_fn; 2578 2579 pthread_mutex_lock(&nvme_ctrlr->mutex); 2580 if (nvme_ctrlr->destruct) { 2581 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2582 return -ENXIO; 2583 } 2584 2585 if (nvme_ctrlr->resetting) { 2586 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2587 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n"); 2588 return -EBUSY; 2589 } 2590 2591 if (nvme_ctrlr->disabled) { 2592 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2593 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n"); 2594 return -EALREADY; 2595 } 2596 2597 nvme_ctrlr->resetting = true; 2598 nvme_ctrlr->dont_retry = true; 2599 2600 if (nvme_ctrlr->reconnect_is_delayed) { 2601 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 2602 msg_fn = bdev_nvme_reconnect_ctrlr_now; 2603 nvme_ctrlr->reconnect_is_delayed = false; 2604 } else { 2605 msg_fn = _bdev_nvme_reset_ctrlr; 2606 assert(nvme_ctrlr->reset_start_tsc == 0); 2607 } 2608 2609 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2610 2611 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2612 2613 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2614 return 0; 2615 } 2616 2617 static int 2618 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2619 { 2620 pthread_mutex_lock(&nvme_ctrlr->mutex); 2621 if (nvme_ctrlr->destruct) { 2622 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2623 return -ENXIO; 2624 } 2625 2626 if (nvme_ctrlr->resetting) { 2627 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2628 return -EBUSY; 2629 } 2630 2631 if (!nvme_ctrlr->disabled) { 2632 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2633 return -EALREADY; 2634 } 2635 2636 nvme_ctrlr->disabled = false; 2637 nvme_ctrlr->resetting = true; 2638 2639 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2640 2641 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2642 2643 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2644 return 0; 2645 } 2646 2647 static void 2648 _bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2649 { 2650 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2651 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2652 enum bdev_nvme_op_after_reset op_after_disable; 2653 2654 assert(nvme_ctrlr->thread == spdk_get_thread()); 2655 2656 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2657 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2658 2659 pthread_mutex_lock(&nvme_ctrlr->mutex); 2660 2661 nvme_ctrlr->resetting = false; 2662 nvme_ctrlr->dont_retry = false; 2663 2664 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2665 2666 nvme_ctrlr->disabled = true; 2667 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2668 2669 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2670 2671 if (ctrlr_op_cb_fn) { 2672 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2673 } 2674 2675 switch (op_after_disable) { 2676 case OP_COMPLETE_PENDING_DESTRUCT: 2677 nvme_ctrlr_unregister(nvme_ctrlr); 2678 break; 2679 default: 2680 break; 2681 } 2682 2683 } 2684 2685 static void 2686 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2687 { 2688 /* Make sure we clear any pending resets before returning. 
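* The ctx argument is NULL here, so bdev_nvme_complete_pending_resets() completes the queued reset_ios with success.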
*/ 2689 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2690 bdev_nvme_complete_pending_resets, 2691 NULL, 2692 _bdev_nvme_disable_ctrlr_complete); 2693 } 2694 2695 static void 2696 bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2697 { 2698 assert(status == 0); 2699 2700 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2701 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2702 } else { 2703 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2704 } 2705 } 2706 2707 static void 2708 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2709 { 2710 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2711 bdev_nvme_reset_destroy_qpair, 2712 NULL, 2713 bdev_nvme_disable_destroy_qpairs_done); 2714 } 2715 2716 static void 2717 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2718 { 2719 struct nvme_ctrlr *nvme_ctrlr = ctx; 2720 2721 assert(nvme_ctrlr->resetting == true); 2722 assert(nvme_ctrlr->thread == spdk_get_thread()); 2723 2724 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2725 2726 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2727 } 2728 2729 static void 2730 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2731 { 2732 struct nvme_ctrlr *nvme_ctrlr = ctx; 2733 2734 assert(nvme_ctrlr->resetting == true); 2735 assert(nvme_ctrlr->thread == spdk_get_thread()); 2736 2737 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2738 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2739 } else { 2740 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2741 } 2742 } 2743 2744 static int 2745 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2746 { 2747 spdk_msg_fn msg_fn; 2748 2749 pthread_mutex_lock(&nvme_ctrlr->mutex); 2750 if (nvme_ctrlr->destruct) { 2751 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2752 return -ENXIO; 2753 } 2754 2755 if (nvme_ctrlr->resetting) { 2756 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2757 return -EBUSY; 2758 } 2759 2760 if (nvme_ctrlr->disabled) { 2761 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2762 return -EALREADY; 2763 } 2764 2765 nvme_ctrlr->resetting = true; 2766 nvme_ctrlr->dont_retry = true; 2767 2768 if (nvme_ctrlr->reconnect_is_delayed) { 2769 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2770 nvme_ctrlr->reconnect_is_delayed = false; 2771 } else { 2772 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2773 } 2774 2775 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2776 2777 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2778 2779 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2780 return 0; 2781 } 2782 2783 static int 2784 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2785 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2786 { 2787 int rc; 2788 2789 switch (op) { 2790 case NVME_CTRLR_OP_RESET: 2791 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2792 break; 2793 case NVME_CTRLR_OP_ENABLE: 2794 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2795 break; 2796 case NVME_CTRLR_OP_DISABLE: 2797 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2798 break; 2799 default: 2800 rc = -EINVAL; 2801 break; 2802 } 2803 2804 if (rc == 0) { 2805 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2806 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2807 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2808 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2809 } 2810 return rc; 2811 } 2812 2813 struct nvme_ctrlr_op_rpc_ctx { 2814 struct nvme_ctrlr *nvme_ctrlr; 2815 struct spdk_thread *orig_thread; 2816 enum nvme_ctrlr_op op; 2817 int rc; 2818 bdev_nvme_ctrlr_op_cb cb_fn; 2819 void *cb_arg; 2820 }; 2821 
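/* nvme_ctrlr_op_rpc_ctx carries an RPC-initiated controller operation across threads:
 * the context is created on the calling (RPC) thread, the operation itself runs on the
 * controller's thread, and the completion is bounced back to orig_thread before the
 * user callback is invoked.
 *
 * Usage sketch (hypothetical caller, not part of this file):
 *
 *     static void
 *     rpc_reset_done(void *cb_arg, int rc)
 *     {
 *         // Invoked on the same thread that called nvme_ctrlr_op_rpc().
 *         SPDK_NOTICELOG("controller reset completed, rc=%d\n", rc);
 *     }
 *
 *     nvme_ctrlr_op_rpc(nvme_ctrlr, NVME_CTRLR_OP_RESET, rpc_reset_done, NULL);
 */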
2822 static void 2823 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2824 { 2825 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2826 2827 assert(ctx != NULL); 2828 assert(ctx->cb_fn != NULL); 2829 2830 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2831 2832 free(ctx); 2833 } 2834 2835 static void 2836 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2837 { 2838 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2839 2840 ctx->rc = rc; 2841 2842 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2843 } 2844 2845 void 2846 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2847 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2848 { 2849 struct nvme_ctrlr_op_rpc_ctx *ctx; 2850 int rc; 2851 2852 assert(cb_fn != NULL); 2853 2854 ctx = calloc(1, sizeof(*ctx)); 2855 if (ctx == NULL) { 2856 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2857 cb_fn(cb_arg, -ENOMEM); 2858 return; 2859 } 2860 2861 ctx->orig_thread = spdk_get_thread(); 2862 ctx->cb_fn = cb_fn; 2863 ctx->cb_arg = cb_arg; 2864 2865 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2866 if (rc == 0) { 2867 return; 2868 } else if (rc == -EALREADY) { 2869 rc = 0; 2870 } 2871 2872 nvme_ctrlr_op_rpc_complete(ctx, rc); 2873 } 2874 2875 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2876 2877 static void 2878 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2879 { 2880 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2881 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2882 int rc; 2883 2884 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2885 ctx->nvme_ctrlr = NULL; 2886 2887 if (ctx->rc != 0) { 2888 goto complete; 2889 } 2890 2891 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2892 if (next_nvme_ctrlr == NULL) { 2893 goto complete; 2894 } 2895 2896 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2897 if (rc == 0) { 2898 ctx->nvme_ctrlr = next_nvme_ctrlr; 2899 return; 2900 } else if (rc == -EALREADY) { 2901 ctx->nvme_ctrlr = next_nvme_ctrlr; 2902 rc = 0; 2903 } 2904 2905 ctx->rc = rc; 2906 2907 complete: 2908 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2909 free(ctx); 2910 } 2911 2912 static void 2913 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2914 { 2915 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2916 2917 ctx->rc = rc; 2918 2919 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2920 } 2921 2922 void 2923 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2924 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2925 { 2926 struct nvme_ctrlr_op_rpc_ctx *ctx; 2927 struct nvme_ctrlr *nvme_ctrlr; 2928 int rc; 2929 2930 assert(cb_fn != NULL); 2931 2932 ctx = calloc(1, sizeof(*ctx)); 2933 if (ctx == NULL) { 2934 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2935 cb_fn(cb_arg, -ENOMEM); 2936 return; 2937 } 2938 2939 ctx->orig_thread = spdk_get_thread(); 2940 ctx->op = op; 2941 ctx->cb_fn = cb_fn; 2942 ctx->cb_arg = cb_arg; 2943 2944 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2945 assert(nvme_ctrlr != NULL); 2946 2947 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2948 if (rc == 0) { 2949 ctx->nvme_ctrlr = nvme_ctrlr; 2950 return; 2951 } else if (rc == -EALREADY) { 2952 ctx->nvme_ctrlr = nvme_ctrlr; 2953 rc = 0; 2954 } 2955 2956 nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2957 } 2958 2959 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2960 2961 static void 2962 bdev_nvme_unfreeze_bdev_channel_done(struct 
nvme_bdev *nbdev, void *ctx, int status) 2963 { 2964 struct nvme_bdev_io *bio = ctx; 2965 enum spdk_bdev_io_status io_status; 2966 2967 if (bio->cpl.cdw0 == 0) { 2968 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2969 } else { 2970 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2971 } 2972 2973 NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status); 2974 2975 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2976 } 2977 2978 static void 2979 bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i, 2980 struct nvme_bdev *nbdev, 2981 struct nvme_bdev_channel *nbdev_ch, void *ctx) 2982 { 2983 bdev_nvme_abort_retry_ios(nbdev_ch); 2984 nbdev_ch->resetting = false; 2985 2986 nvme_bdev_for_each_channel_continue(i, 0); 2987 } 2988 2989 static void 2990 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 2991 { 2992 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2993 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2994 2995 /* Abort all queued I/Os for retry. */ 2996 nvme_bdev_for_each_channel(nbdev, 2997 bdev_nvme_unfreeze_bdev_channel, 2998 bio, 2999 bdev_nvme_unfreeze_bdev_channel_done); 3000 } 3001 3002 static void 3003 _bdev_nvme_reset_io_continue(void *ctx) 3004 { 3005 struct nvme_bdev_io *bio = ctx; 3006 struct nvme_io_path *prev_io_path, *next_io_path; 3007 int rc; 3008 3009 prev_io_path = bio->io_path; 3010 bio->io_path = NULL; 3011 3012 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 3013 if (next_io_path == NULL) { 3014 goto complete; 3015 } 3016 3017 rc = _bdev_nvme_reset_io(next_io_path, bio); 3018 if (rc == 0) { 3019 return; 3020 } 3021 3022 complete: 3023 bdev_nvme_reset_io_complete(bio); 3024 } 3025 3026 static void 3027 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 3028 { 3029 struct nvme_bdev_io *bio = cb_arg; 3030 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3031 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3032 3033 NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc); 3034 3035 /* Reset status is initialized as "failed". Set to "success" once we have at least one 3036 * successfully reset nvme_ctrlr. 3037 */ 3038 if (rc == 0) { 3039 bio->cpl.cdw0 = 0; 3040 } 3041 3042 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 3043 } 3044 3045 static int 3046 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 3047 { 3048 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3049 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3050 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 3051 struct nvme_ctrlr_channel *ctrlr_ch; 3052 int rc; 3053 3054 assert(bio->io_path == NULL); 3055 bio->io_path = io_path; 3056 3057 rc = nvme_ctrlr_op(nvme_ctrlr, NVME_CTRLR_OP_RESET, 3058 bdev_nvme_reset_io_continue, bio); 3059 3060 if (rc == 0) { 3061 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n", 3062 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3063 } else if (rc == -EBUSY) { 3064 ctrlr_ch = io_path->qpair->ctrlr_ch; 3065 assert(ctrlr_ch != NULL); 3066 /* 3067 * Reset call is queued only if it is from the app framework. This is on purpose so that 3068 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 3069 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
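* The queued reset_io is completed later by bdev_nvme_complete_pending_resets()
* with the result of the reset that is already in progress.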
3070 */ 3071 TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link); 3072 3073 rc = 0; 3074 3075 NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n", 3076 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3077 } else { 3078 NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n", 3079 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc); 3080 } 3081 3082 return rc; 3083 } 3084 3085 static void 3086 bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 3087 { 3088 struct nvme_bdev_io *bio = ctx; 3089 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3090 struct nvme_bdev_channel *nbdev_ch; 3091 struct nvme_io_path *io_path; 3092 int rc; 3093 3094 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 3095 3096 /* Initialize with failed status. With multipath it is enough to have at least one successful 3097 * nvme_ctrlr reset. If there is none, reset status will remain failed. 3098 */ 3099 bio->cpl.cdw0 = 1; 3100 3101 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 3102 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 3103 assert(io_path != NULL); 3104 3105 rc = _bdev_nvme_reset_io(io_path, bio); 3106 if (rc != 0) { 3107 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 3108 rc = (rc == -EALREADY) ? 0 : rc; 3109 3110 bdev_nvme_reset_io_continue(bio, rc); 3111 } 3112 } 3113 3114 static void 3115 bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i, 3116 struct nvme_bdev *nbdev, 3117 struct nvme_bdev_channel *nbdev_ch, void *ctx) 3118 { 3119 nbdev_ch->resetting = true; 3120 3121 nvme_bdev_for_each_channel_continue(i, 0); 3122 } 3123 3124 static void 3125 bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio) 3126 { 3127 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio); 3128 3129 nvme_bdev_for_each_channel(nbdev, 3130 bdev_nvme_freeze_bdev_channel, 3131 bio, 3132 bdev_nvme_freeze_bdev_channel_done); 3133 } 3134 3135 static int 3136 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 3137 { 3138 if (nvme_ctrlr->destruct) { 3139 /* Don't bother resetting if the controller is in the process of being destructed. */ 3140 return -ENXIO; 3141 } 3142 3143 if (nvme_ctrlr->resetting) { 3144 if (!nvme_ctrlr->in_failover) { 3145 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 3146 "Reset is already in progress. Defer failover until reset completes.\n"); 3147 3148 /* Defer failover until reset completes. */ 3149 nvme_ctrlr->pending_failover = true; 3150 return -EINPROGRESS; 3151 } else { 3152 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n"); 3153 return -EBUSY; 3154 } 3155 } 3156 3157 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 3158 3159 if (nvme_ctrlr->reconnect_is_delayed) { 3160 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 3161 3162 /* We rely on the next reconnect for the failover. */ 3163 return -EALREADY; 3164 } 3165 3166 if (nvme_ctrlr->disabled) { 3167 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n"); 3168 3169 /* We rely on the enablement for the failover. 
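* -EALREADY is mapped to success by the callers, e.g. bdev_nvme_failover_ctrlr().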
*/ 3170 return -EALREADY; 3171 } 3172 3173 nvme_ctrlr->resetting = true; 3174 nvme_ctrlr->in_failover = true; 3175 3176 assert(nvme_ctrlr->reset_start_tsc == 0); 3177 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 3178 3179 return 0; 3180 } 3181 3182 static int 3183 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 3184 { 3185 int rc; 3186 3187 pthread_mutex_lock(&nvme_ctrlr->mutex); 3188 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 3189 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3190 3191 if (rc == 0) { 3192 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 3193 } else if (rc == -EALREADY) { 3194 rc = 0; 3195 } 3196 3197 return rc; 3198 } 3199 3200 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3201 uint64_t num_blocks); 3202 3203 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3204 uint64_t num_blocks); 3205 3206 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 3207 uint64_t src_offset_blocks, 3208 uint64_t num_blocks); 3209 3210 static void 3211 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3212 bool success) 3213 { 3214 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3215 int ret; 3216 3217 if (!success) { 3218 ret = -EINVAL; 3219 goto exit; 3220 } 3221 3222 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 3223 ret = -ENXIO; 3224 goto exit; 3225 } 3226 3227 ret = bdev_nvme_readv(bio, 3228 bdev_io->u.bdev.iovs, 3229 bdev_io->u.bdev.iovcnt, 3230 bdev_io->u.bdev.md_buf, 3231 bdev_io->u.bdev.num_blocks, 3232 bdev_io->u.bdev.offset_blocks, 3233 bdev_io->u.bdev.dif_check_flags, 3234 bdev_io->u.bdev.memory_domain, 3235 bdev_io->u.bdev.memory_domain_ctx, 3236 bdev_io->u.bdev.accel_sequence); 3237 3238 exit: 3239 if (spdk_unlikely(ret != 0)) { 3240 bdev_nvme_io_complete(bio, ret); 3241 } 3242 } 3243 3244 static inline void 3245 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 3246 { 3247 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3248 struct spdk_bdev *bdev = bdev_io->bdev; 3249 struct nvme_bdev_io *nbdev_io_to_abort; 3250 int rc = 0; 3251 3252 switch (bdev_io->type) { 3253 case SPDK_BDEV_IO_TYPE_READ: 3254 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 3255 3256 rc = bdev_nvme_readv(nbdev_io, 3257 bdev_io->u.bdev.iovs, 3258 bdev_io->u.bdev.iovcnt, 3259 bdev_io->u.bdev.md_buf, 3260 bdev_io->u.bdev.num_blocks, 3261 bdev_io->u.bdev.offset_blocks, 3262 bdev_io->u.bdev.dif_check_flags, 3263 bdev_io->u.bdev.memory_domain, 3264 bdev_io->u.bdev.memory_domain_ctx, 3265 bdev_io->u.bdev.accel_sequence); 3266 } else { 3267 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3268 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3269 rc = 0; 3270 } 3271 break; 3272 case SPDK_BDEV_IO_TYPE_WRITE: 3273 rc = bdev_nvme_writev(nbdev_io, 3274 bdev_io->u.bdev.iovs, 3275 bdev_io->u.bdev.iovcnt, 3276 bdev_io->u.bdev.md_buf, 3277 bdev_io->u.bdev.num_blocks, 3278 bdev_io->u.bdev.offset_blocks, 3279 bdev_io->u.bdev.dif_check_flags, 3280 bdev_io->u.bdev.memory_domain, 3281 bdev_io->u.bdev.memory_domain_ctx, 3282 bdev_io->u.bdev.accel_sequence, 3283 bdev_io->u.bdev.nvme_cdw12, 3284 bdev_io->u.bdev.nvme_cdw13); 3285 break; 3286 case SPDK_BDEV_IO_TYPE_COMPARE: 3287 rc = bdev_nvme_comparev(nbdev_io, 3288 bdev_io->u.bdev.iovs, 3289 bdev_io->u.bdev.iovcnt, 3290 bdev_io->u.bdev.md_buf, 3291 bdev_io->u.bdev.num_blocks, 3292 
bdev_io->u.bdev.offset_blocks, 3293 bdev_io->u.bdev.dif_check_flags); 3294 break; 3295 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3296 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3297 bdev_io->u.bdev.iovs, 3298 bdev_io->u.bdev.iovcnt, 3299 bdev_io->u.bdev.fused_iovs, 3300 bdev_io->u.bdev.fused_iovcnt, 3301 bdev_io->u.bdev.md_buf, 3302 bdev_io->u.bdev.num_blocks, 3303 bdev_io->u.bdev.offset_blocks, 3304 bdev_io->u.bdev.dif_check_flags); 3305 break; 3306 case SPDK_BDEV_IO_TYPE_UNMAP: 3307 rc = bdev_nvme_unmap(nbdev_io, 3308 bdev_io->u.bdev.offset_blocks, 3309 bdev_io->u.bdev.num_blocks); 3310 break; 3311 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3312 rc = bdev_nvme_write_zeroes(nbdev_io, 3313 bdev_io->u.bdev.offset_blocks, 3314 bdev_io->u.bdev.num_blocks); 3315 break; 3316 case SPDK_BDEV_IO_TYPE_RESET: 3317 nbdev_io->io_path = NULL; 3318 bdev_nvme_reset_io(bdev->ctxt, nbdev_io); 3319 return; 3320 3321 case SPDK_BDEV_IO_TYPE_FLUSH: 3322 bdev_nvme_io_complete(nbdev_io, 0); 3323 return; 3324 3325 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3326 rc = bdev_nvme_zone_appendv(nbdev_io, 3327 bdev_io->u.bdev.iovs, 3328 bdev_io->u.bdev.iovcnt, 3329 bdev_io->u.bdev.md_buf, 3330 bdev_io->u.bdev.num_blocks, 3331 bdev_io->u.bdev.offset_blocks, 3332 bdev_io->u.bdev.dif_check_flags); 3333 break; 3334 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3335 rc = bdev_nvme_get_zone_info(nbdev_io, 3336 bdev_io->u.zone_mgmt.zone_id, 3337 bdev_io->u.zone_mgmt.num_zones, 3338 bdev_io->u.zone_mgmt.buf); 3339 break; 3340 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3341 rc = bdev_nvme_zone_management(nbdev_io, 3342 bdev_io->u.zone_mgmt.zone_id, 3343 bdev_io->u.zone_mgmt.zone_action); 3344 break; 3345 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3346 nbdev_io->io_path = NULL; 3347 bdev_nvme_admin_passthru(nbdev_ch, 3348 nbdev_io, 3349 &bdev_io->u.nvme_passthru.cmd, 3350 bdev_io->u.nvme_passthru.buf, 3351 bdev_io->u.nvme_passthru.nbytes); 3352 return; 3353 3354 case SPDK_BDEV_IO_TYPE_NVME_IO: 3355 rc = bdev_nvme_io_passthru(nbdev_io, 3356 &bdev_io->u.nvme_passthru.cmd, 3357 bdev_io->u.nvme_passthru.buf, 3358 bdev_io->u.nvme_passthru.nbytes); 3359 break; 3360 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3361 rc = bdev_nvme_io_passthru_md(nbdev_io, 3362 &bdev_io->u.nvme_passthru.cmd, 3363 bdev_io->u.nvme_passthru.buf, 3364 bdev_io->u.nvme_passthru.nbytes, 3365 bdev_io->u.nvme_passthru.md_buf, 3366 bdev_io->u.nvme_passthru.md_len); 3367 break; 3368 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3369 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3370 &bdev_io->u.nvme_passthru.cmd, 3371 bdev_io->u.nvme_passthru.iovs, 3372 bdev_io->u.nvme_passthru.iovcnt, 3373 bdev_io->u.nvme_passthru.nbytes, 3374 bdev_io->u.nvme_passthru.md_buf, 3375 bdev_io->u.nvme_passthru.md_len); 3376 break; 3377 case SPDK_BDEV_IO_TYPE_ABORT: 3378 nbdev_io->io_path = NULL; 3379 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3380 bdev_nvme_abort(nbdev_ch, 3381 nbdev_io, 3382 nbdev_io_to_abort); 3383 return; 3384 3385 case SPDK_BDEV_IO_TYPE_COPY: 3386 rc = bdev_nvme_copy(nbdev_io, 3387 bdev_io->u.bdev.offset_blocks, 3388 bdev_io->u.bdev.copy.src_offset_blocks, 3389 bdev_io->u.bdev.num_blocks); 3390 break; 3391 default: 3392 rc = -EINVAL; 3393 break; 3394 } 3395 3396 if (spdk_unlikely(rc != 0)) { 3397 bdev_nvme_io_complete(nbdev_io, rc); 3398 } 3399 } 3400 3401 static void 3402 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3403 { 3404 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3405 struct nvme_bdev_io *nbdev_io = (struct 
nvme_bdev_io *)bdev_io->driver_ctx; 3406 3407 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3408 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3409 } else { 3410 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3411 * We need to update submit_tsc here. 3412 */ 3413 nbdev_io->submit_tsc = spdk_get_ticks(); 3414 } 3415 3416 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3417 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3418 if (spdk_unlikely(!nbdev_io->io_path)) { 3419 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3420 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3421 return; 3422 } 3423 3424 /* Admin commands do not use the optimal I/O path. 3425 * Simply fall through even if it is not found. 3426 */ 3427 } 3428 3429 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3430 } 3431 3432 static bool 3433 bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi) 3434 { 3435 switch (csi) { 3436 case SPDK_NVME_CSI_NVM: 3437 return true; 3438 case SPDK_NVME_CSI_ZNS: 3439 return true; 3440 default: 3441 return false; 3442 } 3443 } 3444 3445 static bool 3446 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3447 { 3448 struct nvme_bdev *nbdev = ctx; 3449 struct nvme_ns *nvme_ns; 3450 struct spdk_nvme_ns *ns; 3451 struct spdk_nvme_ctrlr *ctrlr; 3452 const struct spdk_nvme_ctrlr_data *cdata; 3453 3454 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3455 assert(nvme_ns != NULL); 3456 ns = nvme_ns->ns; 3457 if (ns == NULL) { 3458 return false; 3459 } 3460 3461 if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) { 3462 switch (io_type) { 3463 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3464 case SPDK_BDEV_IO_TYPE_NVME_IO: 3465 return true; 3466 3467 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3468 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3469 3470 default: 3471 return false; 3472 } 3473 } 3474 3475 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3476 3477 switch (io_type) { 3478 case SPDK_BDEV_IO_TYPE_READ: 3479 case SPDK_BDEV_IO_TYPE_WRITE: 3480 case SPDK_BDEV_IO_TYPE_RESET: 3481 case SPDK_BDEV_IO_TYPE_FLUSH: 3482 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3483 case SPDK_BDEV_IO_TYPE_NVME_IO: 3484 case SPDK_BDEV_IO_TYPE_ABORT: 3485 return true; 3486 3487 case SPDK_BDEV_IO_TYPE_COMPARE: 3488 return spdk_nvme_ns_supports_compare(ns); 3489 3490 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3491 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3492 3493 case SPDK_BDEV_IO_TYPE_UNMAP: 3494 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3495 return cdata->oncs.dsm; 3496 3497 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3498 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3499 return cdata->oncs.write_zeroes; 3500 3501 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3502 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3503 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3504 return true; 3505 } 3506 return false; 3507 3508 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3509 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3510 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3511 3512 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3513 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3514 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3515 3516 case SPDK_BDEV_IO_TYPE_COPY: 3517 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3518 return cdata->oncs.copy; 3519 3520 default: 3521 return false; 3522 } 3523 } 3524 3525 static int 3526 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3527 { 3528 struct nvme_qpair *nvme_qpair; 3529 struct spdk_io_channel *pg_ch; 3530 int rc; 3531 3532 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3533 if (!nvme_qpair) { 3534 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n"); 3535 return -1; 3536 } 3537 3538 TAILQ_INIT(&nvme_qpair->io_path_list); 3539 3540 nvme_qpair->ctrlr = nvme_ctrlr; 3541 nvme_qpair->ctrlr_ch = ctrlr_ch; 3542 3543 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3544 if (!pg_ch) { 3545 free(nvme_qpair); 3546 return -1; 3547 } 3548 3549 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3550 3551 #ifdef SPDK_CONFIG_VTUNE 3552 nvme_qpair->group->collect_spin_stat = true; 3553 #else 3554 nvme_qpair->group->collect_spin_stat = false; 3555 #endif 3556 3557 if (!nvme_ctrlr->disabled) { 3558 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3559 * be created when it's enabled. 3560 */ 3561 rc = bdev_nvme_create_qpair(nvme_qpair); 3562 if (rc != 0) { 3563 /* nvme_ctrlr can't create IO qpair if connection is down. 3564 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3565 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3566 * submitted IO will be queued until IO qpair is successfully created. 3567 * 3568 * Hence, if both are satisfied, ignore the failure. 
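* If either of them is zero there is no retry path, so fail the channel creation immediately below.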
3569 */ 3570 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3571 spdk_put_io_channel(pg_ch); 3572 free(nvme_qpair); 3573 return rc; 3574 } 3575 } 3576 } 3577 3578 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3579 3580 ctrlr_ch->qpair = nvme_qpair; 3581 3582 pthread_mutex_lock(&nvme_qpair->ctrlr->mutex); 3583 nvme_qpair->ctrlr->ref++; 3584 pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex); 3585 3586 return 0; 3587 } 3588 3589 static int 3590 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3591 { 3592 struct nvme_ctrlr *nvme_ctrlr = io_device; 3593 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3594 3595 TAILQ_INIT(&ctrlr_ch->pending_resets); 3596 3597 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3598 } 3599 3600 static void 3601 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3602 { 3603 struct nvme_io_path *io_path, *next; 3604 3605 assert(nvme_qpair->group != NULL); 3606 3607 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3608 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3609 nvme_io_path_free(io_path); 3610 } 3611 3612 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3613 3614 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3615 3616 nvme_ctrlr_release(nvme_qpair->ctrlr); 3617 3618 free(nvme_qpair); 3619 } 3620 3621 static void 3622 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3623 { 3624 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3625 struct nvme_qpair *nvme_qpair; 3626 3627 nvme_qpair = ctrlr_ch->qpair; 3628 assert(nvme_qpair != NULL); 3629 3630 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3631 3632 if (nvme_qpair->qpair != NULL) { 3633 if (ctrlr_ch->reset_iter == NULL) { 3634 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3635 } else { 3636 /* Skip current ctrlr_channel in a full reset sequence because 3637 * it is being deleted now. The qpair is already being disconnected. 3638 * We do not have to restart disconnecting it. 3639 */ 3640 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3641 } 3642 3643 /* We cannot release a reference to the poll group now. 3644 * The qpair may be disconnected asynchronously later. 3645 * We need to poll it until it is actually disconnected. 3646 * Just detach the qpair from the deleting ctrlr_channel. 
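* The poll group keeps polling the detached qpair; bdev_nvme_disconnected_qpair_cb()
* is expected to delete the nvme_qpair once the disconnect actually completes.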
3647 */ 3648 nvme_qpair->ctrlr_ch = NULL; 3649 } else { 3650 assert(ctrlr_ch->reset_iter == NULL); 3651 3652 nvme_qpair_delete(nvme_qpair); 3653 } 3654 } 3655 3656 static inline struct spdk_io_channel * 3657 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3658 { 3659 if (spdk_unlikely(!group->accel_channel)) { 3660 group->accel_channel = spdk_accel_get_io_channel(); 3661 if (!group->accel_channel) { 3662 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3663 group); 3664 return NULL; 3665 } 3666 } 3667 3668 return group->accel_channel; 3669 } 3670 3671 static void 3672 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3673 { 3674 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3675 } 3676 3677 static void 3678 bdev_nvme_abort_sequence(void *seq) 3679 { 3680 spdk_accel_sequence_abort(seq); 3681 } 3682 3683 static void 3684 bdev_nvme_reverse_sequence(void *seq) 3685 { 3686 spdk_accel_sequence_reverse(seq); 3687 } 3688 3689 static int 3690 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3691 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3692 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3693 { 3694 struct spdk_io_channel *ch; 3695 struct nvme_poll_group *group = ctx; 3696 3697 ch = bdev_nvme_get_accel_channel(group); 3698 if (spdk_unlikely(ch == NULL)) { 3699 return -ENOMEM; 3700 } 3701 3702 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3703 domain, domain_ctx, seed, cb_fn, cb_arg); 3704 } 3705 3706 static int 3707 bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt, 3708 struct spdk_memory_domain *dst_domain, void *dst_domain_ctx, 3709 struct iovec *src_iovs, uint32_t src_iovcnt, 3710 struct spdk_memory_domain *src_domain, void *src_domain_ctx, 3711 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3712 { 3713 struct spdk_io_channel *ch; 3714 struct nvme_poll_group *group = ctx; 3715 3716 ch = bdev_nvme_get_accel_channel(group); 3717 if (spdk_unlikely(ch == NULL)) { 3718 return -ENOMEM; 3719 } 3720 3721 return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch, 3722 dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx, 3723 src_iovs, src_iovcnt, src_domain, src_domain_ctx, 3724 cb_fn, cb_arg); 3725 } 3726 3727 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3728 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3729 .append_crc32c = bdev_nvme_append_crc32c, 3730 .append_copy = bdev_nvme_append_copy, 3731 .finish_sequence = bdev_nvme_finish_sequence, 3732 .reverse_sequence = bdev_nvme_reverse_sequence, 3733 .abort_sequence = bdev_nvme_abort_sequence, 3734 }; 3735 3736 static int 3737 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3738 { 3739 struct nvme_poll_group *group = ctx_buf; 3740 3741 TAILQ_INIT(&group->qpair_list); 3742 3743 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3744 if (group->group == NULL) { 3745 return -1; 3746 } 3747 3748 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 3749 3750 if (group->poller == NULL) { 3751 spdk_nvme_poll_group_destroy(group->group); 3752 return -1; 3753 } 3754 3755 return 0; 3756 } 3757 3758 static void 3759 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3760 { 3761 struct nvme_poll_group *group = ctx_buf; 3762 3763 assert(TAILQ_EMPTY(&group->qpair_list)); 3764 3765 if 
(group->accel_channel) { 3766 spdk_put_io_channel(group->accel_channel); 3767 } 3768 3769 spdk_poller_unregister(&group->poller); 3770 if (spdk_nvme_poll_group_destroy(group->group)) { 3771 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3772 assert(false); 3773 } 3774 } 3775 3776 static struct spdk_io_channel * 3777 bdev_nvme_get_io_channel(void *ctx) 3778 { 3779 struct nvme_bdev *nvme_bdev = ctx; 3780 3781 return spdk_get_io_channel(nvme_bdev); 3782 } 3783 3784 static void * 3785 bdev_nvme_get_module_ctx(void *ctx) 3786 { 3787 struct nvme_bdev *nvme_bdev = ctx; 3788 struct nvme_ns *nvme_ns; 3789 3790 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3791 return NULL; 3792 } 3793 3794 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3795 if (!nvme_ns) { 3796 return NULL; 3797 } 3798 3799 return nvme_ns->ns; 3800 } 3801 3802 static const char * 3803 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3804 { 3805 switch (ana_state) { 3806 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3807 return "optimized"; 3808 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3809 return "non_optimized"; 3810 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3811 return "inaccessible"; 3812 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3813 return "persistent_loss"; 3814 case SPDK_NVME_ANA_CHANGE_STATE: 3815 return "change"; 3816 default: 3817 return NULL; 3818 } 3819 } 3820 3821 static int 3822 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3823 { 3824 struct spdk_memory_domain **_domains = NULL; 3825 struct nvme_bdev *nbdev = ctx; 3826 struct nvme_ns *nvme_ns; 3827 int i = 0, _array_size = array_size; 3828 int rc = 0; 3829 3830 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3831 if (domains && array_size >= i) { 3832 _domains = &domains[i]; 3833 } else { 3834 _domains = NULL; 3835 } 3836 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3837 if (rc > 0) { 3838 i += rc; 3839 if (_array_size >= rc) { 3840 _array_size -= rc; 3841 } else { 3842 _array_size = 0; 3843 } 3844 } else if (rc < 0) { 3845 return rc; 3846 } 3847 } 3848 3849 return i; 3850 } 3851 3852 static const char * 3853 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3854 { 3855 if (nvme_ctrlr->destruct) { 3856 return "deleting"; 3857 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3858 return "failed"; 3859 } else if (nvme_ctrlr->resetting) { 3860 return "resetting"; 3861 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3862 return "reconnect_is_delayed"; 3863 } else if (nvme_ctrlr->disabled) { 3864 return "disabled"; 3865 } else { 3866 return "enabled"; 3867 } 3868 } 3869 3870 void 3871 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3872 { 3873 struct spdk_nvme_transport_id *trid; 3874 const struct spdk_nvme_ctrlr_opts *opts; 3875 const struct spdk_nvme_ctrlr_data *cdata; 3876 struct nvme_path_id *path_id; 3877 int32_t numa_id; 3878 3879 spdk_json_write_object_begin(w); 3880 3881 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3882 3883 #ifdef SPDK_CONFIG_NVME_CUSE 3884 size_t cuse_name_size = 128; 3885 char cuse_name[cuse_name_size]; 3886 3887 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3888 if (rc == 0) { 3889 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3890 } 3891 #endif 3892 trid = &nvme_ctrlr->active_path_id->trid; 3893 spdk_json_write_named_object_begin(w, "trid"); 3894 nvme_bdev_dump_trid_json(trid, 
w); 3895 spdk_json_write_object_end(w); 3896 3897 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3898 if (path_id != NULL) { 3899 spdk_json_write_named_array_begin(w, "alternate_trids"); 3900 do { 3901 trid = &path_id->trid; 3902 spdk_json_write_object_begin(w); 3903 nvme_bdev_dump_trid_json(trid, w); 3904 spdk_json_write_object_end(w); 3905 3906 path_id = TAILQ_NEXT(path_id, link); 3907 } while (path_id != NULL); 3908 spdk_json_write_array_end(w); 3909 } 3910 3911 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3912 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3913 3914 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3915 spdk_json_write_named_object_begin(w, "host"); 3916 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3917 spdk_json_write_named_string(w, "addr", opts->src_addr); 3918 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3919 spdk_json_write_object_end(w); 3920 3921 numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr); 3922 if (numa_id != SPDK_ENV_NUMA_ID_ANY) { 3923 spdk_json_write_named_uint32(w, "numa_id", numa_id); 3924 } 3925 spdk_json_write_object_end(w); 3926 } 3927 3928 static void 3929 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3930 struct nvme_ns *nvme_ns) 3931 { 3932 struct spdk_nvme_ns *ns; 3933 struct spdk_nvme_ctrlr *ctrlr; 3934 const struct spdk_nvme_ctrlr_data *cdata; 3935 const struct spdk_nvme_transport_id *trid; 3936 union spdk_nvme_vs_register vs; 3937 const struct spdk_nvme_ns_data *nsdata; 3938 char buf[128]; 3939 3940 ns = nvme_ns->ns; 3941 if (ns == NULL) { 3942 return; 3943 } 3944 3945 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3946 3947 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3948 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3949 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 3950 3951 spdk_json_write_object_begin(w); 3952 3953 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 3954 spdk_json_write_named_string(w, "pci_address", trid->traddr); 3955 } 3956 3957 spdk_json_write_named_object_begin(w, "trid"); 3958 3959 nvme_bdev_dump_trid_json(trid, w); 3960 3961 spdk_json_write_object_end(w); 3962 3963 #ifdef SPDK_CONFIG_NVME_CUSE 3964 size_t cuse_name_size = 128; 3965 char cuse_name[cuse_name_size]; 3966 3967 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 3968 cuse_name, &cuse_name_size); 3969 if (rc == 0) { 3970 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3971 } 3972 #endif 3973 3974 spdk_json_write_named_object_begin(w, "ctrlr_data"); 3975 3976 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3977 3978 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 3979 3980 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 3981 spdk_str_trim(buf); 3982 spdk_json_write_named_string(w, "model_number", buf); 3983 3984 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 3985 spdk_str_trim(buf); 3986 spdk_json_write_named_string(w, "serial_number", buf); 3987 3988 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 3989 spdk_str_trim(buf); 3990 spdk_json_write_named_string(w, "firmware_revision", buf); 3991 3992 if (cdata->subnqn[0] != '\0') { 3993 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 3994 } 3995 3996 spdk_json_write_named_object_begin(w, "oacs"); 3997 3998 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 3999 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 4000 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 4001 spdk_json_write_named_uint32(w, "ns_manage", 
cdata->oacs.ns_manage); 4002 4003 spdk_json_write_object_end(w); 4004 4005 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 4006 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 4007 4008 spdk_json_write_object_end(w); 4009 4010 spdk_json_write_named_object_begin(w, "vs"); 4011 4012 spdk_json_write_name(w, "nvme_version"); 4013 if (vs.bits.ter) { 4014 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 4015 } else { 4016 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 4017 } 4018 4019 spdk_json_write_object_end(w); 4020 4021 nsdata = spdk_nvme_ns_get_data(ns); 4022 4023 spdk_json_write_named_object_begin(w, "ns_data"); 4024 4025 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 4026 4027 if (cdata->cmic.ana_reporting) { 4028 spdk_json_write_named_string(w, "ana_state", 4029 _nvme_ana_state_str(nvme_ns->ana_state)); 4030 } 4031 4032 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 4033 4034 spdk_json_write_object_end(w); 4035 4036 if (cdata->oacs.security) { 4037 spdk_json_write_named_object_begin(w, "security"); 4038 4039 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 4040 4041 spdk_json_write_object_end(w); 4042 } 4043 4044 spdk_json_write_object_end(w); 4045 } 4046 4047 static const char * 4048 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 4049 { 4050 switch (nbdev->mp_policy) { 4051 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 4052 return "active_passive"; 4053 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 4054 return "active_active"; 4055 default: 4056 assert(false); 4057 return "invalid"; 4058 } 4059 } 4060 4061 static const char * 4062 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 4063 { 4064 switch (nbdev->mp_selector) { 4065 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 4066 return "round_robin"; 4067 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 4068 return "queue_depth"; 4069 default: 4070 assert(false); 4071 return "invalid"; 4072 } 4073 } 4074 4075 static int 4076 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 4077 { 4078 struct nvme_bdev *nvme_bdev = ctx; 4079 struct nvme_ns *nvme_ns; 4080 4081 pthread_mutex_lock(&nvme_bdev->mutex); 4082 spdk_json_write_named_array_begin(w, "nvme"); 4083 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 4084 nvme_namespace_info_json(w, nvme_ns); 4085 } 4086 spdk_json_write_array_end(w); 4087 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev)); 4088 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 4089 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 4090 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4091 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 4092 } 4093 } 4094 pthread_mutex_unlock(&nvme_bdev->mutex); 4095 4096 return 0; 4097 } 4098 4099 static void 4100 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 4101 { 4102 /* No config per bdev needed */ 4103 } 4104 4105 static uint64_t 4106 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 4107 { 4108 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 4109 struct nvme_io_path *io_path; 4110 struct nvme_poll_group *group; 4111 uint64_t spin_time = 0; 4112 4113 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4114 group = io_path->qpair->group; 4115 4116 if (!group || !group->collect_spin_stat) { 4117 continue; 4118 } 4119 4120 if (group->end_ticks 
!= 0) { 4121 group->spin_ticks += (group->end_ticks - group->start_ticks); 4122 group->end_ticks = 0; 4123 } 4124 4125 spin_time += group->spin_ticks; 4126 group->start_ticks = 0; 4127 group->spin_ticks = 0; 4128 } 4129 4130 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 4131 } 4132 4133 static void 4134 bdev_nvme_reset_device_stat(void *ctx) 4135 { 4136 struct nvme_bdev *nbdev = ctx; 4137 4138 if (nbdev->err_stat != NULL) { 4139 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 4140 } 4141 } 4142 4143 /* JSON string should be lowercases and underscore delimited string. */ 4144 static void 4145 bdev_nvme_format_nvme_status(char *dst, const char *src) 4146 { 4147 char tmp[256]; 4148 4149 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 4150 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 4151 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 4152 spdk_strlwr(dst); 4153 } 4154 4155 static void 4156 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 4157 { 4158 struct nvme_bdev *nbdev = ctx; 4159 struct spdk_nvme_status status = {}; 4160 uint16_t sct, sc; 4161 char status_json[256]; 4162 const char *status_str; 4163 4164 if (nbdev->err_stat == NULL) { 4165 return; 4166 } 4167 4168 spdk_json_write_named_object_begin(w, "nvme_error"); 4169 4170 spdk_json_write_named_object_begin(w, "status_type"); 4171 for (sct = 0; sct < 8; sct++) { 4172 if (nbdev->err_stat->status_type[sct] == 0) { 4173 continue; 4174 } 4175 status.sct = sct; 4176 4177 status_str = spdk_nvme_cpl_get_status_type_string(&status); 4178 assert(status_str != NULL); 4179 bdev_nvme_format_nvme_status(status_json, status_str); 4180 4181 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 4182 } 4183 spdk_json_write_object_end(w); 4184 4185 spdk_json_write_named_object_begin(w, "status_code"); 4186 for (sct = 0; sct < 4; sct++) { 4187 status.sct = sct; 4188 for (sc = 0; sc < 256; sc++) { 4189 if (nbdev->err_stat->status[sct][sc] == 0) { 4190 continue; 4191 } 4192 status.sc = sc; 4193 4194 status_str = spdk_nvme_cpl_get_status_string(&status); 4195 assert(status_str != NULL); 4196 bdev_nvme_format_nvme_status(status_json, status_str); 4197 4198 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 4199 } 4200 } 4201 spdk_json_write_object_end(w); 4202 4203 spdk_json_write_object_end(w); 4204 } 4205 4206 static bool 4207 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 4208 { 4209 struct nvme_bdev *nbdev = ctx; 4210 struct spdk_nvme_ctrlr *ctrlr; 4211 4212 if (!g_opts.allow_accel_sequence) { 4213 return false; 4214 } 4215 4216 switch (type) { 4217 case SPDK_BDEV_IO_TYPE_WRITE: 4218 case SPDK_BDEV_IO_TYPE_READ: 4219 break; 4220 default: 4221 return false; 4222 } 4223 4224 ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk); 4225 assert(ctrlr != NULL); 4226 4227 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 4228 } 4229 4230 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 4231 .destruct = bdev_nvme_destruct, 4232 .submit_request = bdev_nvme_submit_request, 4233 .io_type_supported = bdev_nvme_io_type_supported, 4234 .get_io_channel = bdev_nvme_get_io_channel, 4235 .dump_info_json = bdev_nvme_dump_info_json, 4236 .write_config_json = bdev_nvme_write_config_json, 4237 .get_spin_time = bdev_nvme_get_spin_time, 4238 .get_module_ctx = bdev_nvme_get_module_ctx, 4239 .get_memory_domains = bdev_nvme_get_memory_domains, 4240 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 4241 
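	/* A descriptive note (assumption drawn from the visible code): the two device-stat
	 * callbacks below only report data when g_opts.nvme_error_stat was set at bdev
	 * allocation time; otherwise nbdev->err_stat stays NULL and they are effectively no-ops. */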
.reset_device_stat = bdev_nvme_reset_device_stat, 4242 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 4243 }; 4244 4245 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 4246 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 4247 4248 static int 4249 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4250 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 4251 { 4252 struct spdk_nvme_ana_group_descriptor *copied_desc; 4253 uint8_t *orig_desc; 4254 uint32_t i, desc_size, copy_len; 4255 int rc = 0; 4256 4257 if (nvme_ctrlr->ana_log_page == NULL) { 4258 return -EINVAL; 4259 } 4260 4261 copied_desc = nvme_ctrlr->copied_ana_desc; 4262 4263 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 4264 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 4265 4266 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 4267 memcpy(copied_desc, orig_desc, copy_len); 4268 4269 rc = cb_fn(copied_desc, cb_arg); 4270 if (rc != 0) { 4271 break; 4272 } 4273 4274 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 4275 copied_desc->num_of_nsid * sizeof(uint32_t); 4276 orig_desc += desc_size; 4277 copy_len -= desc_size; 4278 } 4279 4280 return rc; 4281 } 4282 4283 static int 4284 nvme_ns_ana_transition_timedout(void *ctx) 4285 { 4286 struct nvme_ns *nvme_ns = ctx; 4287 4288 spdk_poller_unregister(&nvme_ns->anatt_timer); 4289 nvme_ns->ana_transition_timedout = true; 4290 4291 return SPDK_POLLER_BUSY; 4292 } 4293 4294 static void 4295 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4296 const struct spdk_nvme_ana_group_descriptor *desc) 4297 { 4298 const struct spdk_nvme_ctrlr_data *cdata; 4299 4300 nvme_ns->ana_group_id = desc->ana_group_id; 4301 nvme_ns->ana_state = desc->ana_state; 4302 nvme_ns->ana_state_updating = false; 4303 4304 switch (nvme_ns->ana_state) { 4305 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4306 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4307 nvme_ns->ana_transition_timedout = false; 4308 spdk_poller_unregister(&nvme_ns->anatt_timer); 4309 break; 4310 4311 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4312 case SPDK_NVME_ANA_CHANGE_STATE: 4313 if (nvme_ns->anatt_timer != NULL) { 4314 break; 4315 } 4316 4317 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4318 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4319 nvme_ns, 4320 cdata->anatt * SPDK_SEC_TO_USEC); 4321 break; 4322 default: 4323 break; 4324 } 4325 } 4326 4327 static int 4328 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4329 { 4330 struct nvme_ns *nvme_ns = cb_arg; 4331 uint32_t i; 4332 4333 assert(nvme_ns->ns != NULL); 4334 4335 for (i = 0; i < desc->num_of_nsid; i++) { 4336 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4337 continue; 4338 } 4339 4340 _nvme_ns_set_ana_state(nvme_ns, desc); 4341 return 1; 4342 } 4343 4344 return 0; 4345 } 4346 4347 static int 4348 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4349 { 4350 int rc = 0; 4351 struct spdk_uuid new_uuid, namespace_uuid; 4352 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4353 /* This namespace UUID was generated using uuid_generate() method. 
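	 * It is used below as the fixed namespace input to spdk_uuid_generate_sha1(), which
	 * derives a stable UUID from the controller serial number concatenated with the namespace ID.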
*/ 4354 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4355 int size; 4356 4357 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4358 4359 spdk_uuid_set_null(&new_uuid); 4360 spdk_uuid_set_null(&namespace_uuid); 4361 4362 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4363 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4364 return -EINVAL; 4365 } 4366 4367 spdk_uuid_parse(&namespace_uuid, namespace_str); 4368 4369 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4370 if (rc == 0) { 4371 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4372 } 4373 4374 return rc; 4375 } 4376 4377 static int 4378 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4379 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4380 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx) 4381 { 4382 const struct spdk_uuid *uuid; 4383 const uint8_t *nguid; 4384 const struct spdk_nvme_ctrlr_data *cdata; 4385 const struct spdk_nvme_ns_data *nsdata; 4386 const struct spdk_nvme_ctrlr_opts *opts; 4387 enum spdk_nvme_csi csi; 4388 uint32_t atomic_bs, phys_bs, bs; 4389 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4390 int rc; 4391 4392 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4393 csi = spdk_nvme_ns_get_csi(ns); 4394 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4395 4396 switch (csi) { 4397 case SPDK_NVME_CSI_NVM: 4398 disk->product_name = "NVMe disk"; 4399 break; 4400 case SPDK_NVME_CSI_ZNS: 4401 disk->product_name = "NVMe ZNS disk"; 4402 disk->zoned = true; 4403 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4404 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4405 spdk_nvme_ns_get_extended_sector_size(ns); 4406 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4407 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4408 break; 4409 default: 4410 if (bdev_opts->allow_unrecognized_csi) { 4411 disk->product_name = "NVMe Passthrough disk"; 4412 break; 4413 } 4414 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4415 return -ENOTSUP; 4416 } 4417 4418 nguid = spdk_nvme_ns_get_nguid(ns); 4419 if (!nguid) { 4420 uuid = spdk_nvme_ns_get_uuid(ns); 4421 if (uuid) { 4422 disk->uuid = *uuid; 4423 } else if (g_opts.generate_uuids) { 4424 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4425 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4426 if (rc < 0) { 4427 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4428 return rc; 4429 } 4430 } 4431 } else { 4432 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4433 } 4434 4435 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4436 if (!disk->name) { 4437 return -ENOMEM; 4438 } 4439 4440 disk->write_cache = 0; 4441 if (cdata->vwc.present) { 4442 /* Enable if the Volatile Write Cache exists */ 4443 disk->write_cache = 1; 4444 } 4445 if (cdata->oncs.write_zeroes) { 4446 disk->max_write_zeroes = UINT16_MAX + 1; 4447 } 4448 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4449 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4450 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4451 disk->ctratt.raw = cdata->ctratt.raw; 4452 /* NVMe driver will split one request into multiple requests 4453 * based on MDTS and stripe boundary, the bdev layer will use 4454 * max_segment_size and max_num_segments to split one big IO 4455 * into multiple requests, then small request can't run out 4456 * of NVMe internal 
requests data structure. 4457 */ 4458 if (opts && opts->io_queue_requests) { 4459 disk->max_num_segments = opts->io_queue_requests / 2; 4460 } 4461 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4462 /* The nvme driver will try to split I/O that have too many 4463 * SGEs, but it doesn't work if that last SGE doesn't end on 4464 * an aggregate total that is block aligned. The bdev layer has 4465 * a more robust splitting framework, so use that instead for 4466 * this case. (See issue #3269.) 4467 */ 4468 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4469 4470 if (disk->max_num_segments == 0) { 4471 disk->max_num_segments = max_sges; 4472 } else { 4473 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4474 } 4475 } 4476 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4477 4478 nsdata = spdk_nvme_ns_get_data(ns); 4479 bs = spdk_nvme_ns_get_sector_size(ns); 4480 atomic_bs = bs; 4481 phys_bs = bs; 4482 if (nsdata->nabo == 0) { 4483 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4484 atomic_bs = bs * (1 + nsdata->nawupf); 4485 } else { 4486 atomic_bs = bs * (1 + cdata->awupf); 4487 } 4488 } 4489 if (nsdata->nsfeat.optperf) { 4490 phys_bs = bs * (1 + nsdata->npwg); 4491 } 4492 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4493 4494 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4495 if (disk->md_len != 0) { 4496 disk->md_interleave = nsdata->flbas.extended; 4497 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4498 if (disk->dif_type != SPDK_DIF_DISABLE) { 4499 disk->dif_is_head_of_md = nsdata->dps.md_start; 4500 disk->dif_check_flags = bdev_opts->prchk_flags; 4501 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4502 } 4503 } 4504 4505 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4506 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4507 disk->acwu = 0; 4508 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4509 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4510 } else { 4511 disk->acwu = cdata->acwu + 1; /* 0-based */ 4512 } 4513 4514 if (cdata->oncs.copy) { 4515 /* For now bdev interface allows only single segment copy */ 4516 disk->max_copy = nsdata->mssrl; 4517 } 4518 4519 disk->ctxt = ctx; 4520 disk->fn_table = &nvmelib_fn_table; 4521 disk->module = &nvme_if; 4522 4523 disk->numa.id_valid = 1; 4524 disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 4525 4526 return 0; 4527 } 4528 4529 static struct nvme_bdev * 4530 nvme_bdev_alloc(void) 4531 { 4532 struct nvme_bdev *bdev; 4533 int rc; 4534 4535 bdev = calloc(1, sizeof(*bdev)); 4536 if (!bdev) { 4537 SPDK_ERRLOG("bdev calloc() failed\n"); 4538 return NULL; 4539 } 4540 4541 if (g_opts.nvme_error_stat) { 4542 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4543 if (!bdev->err_stat) { 4544 SPDK_ERRLOG("err_stat calloc() failed\n"); 4545 free(bdev); 4546 return NULL; 4547 } 4548 } 4549 4550 rc = pthread_mutex_init(&bdev->mutex, NULL); 4551 if (rc != 0) { 4552 free(bdev->err_stat); 4553 free(bdev); 4554 return NULL; 4555 } 4556 4557 bdev->ref = 1; 4558 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4559 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4560 bdev->rr_min_io = UINT32_MAX; 4561 TAILQ_INIT(&bdev->nvme_ns_list); 4562 4563 return bdev; 4564 } 4565 4566 static int 4567 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4568 { 4569 struct nvme_bdev *bdev; 4570 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4571 int rc; 4572 4573 bdev = 
nvme_bdev_alloc(); 4574 if (bdev == NULL) { 4575 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4576 return -ENOMEM; 4577 } 4578 4579 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4580 4581 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4582 nvme_ns->ns, &nvme_ctrlr->opts, bdev); 4583 if (rc != 0) { 4584 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4585 nvme_bdev_free(bdev); 4586 return rc; 4587 } 4588 4589 spdk_io_device_register(bdev, 4590 bdev_nvme_create_bdev_channel_cb, 4591 bdev_nvme_destroy_bdev_channel_cb, 4592 sizeof(struct nvme_bdev_channel), 4593 bdev->disk.name); 4594 4595 nvme_ns->bdev = bdev; 4596 bdev->nsid = nvme_ns->id; 4597 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4598 4599 bdev->nbdev_ctrlr = nbdev_ctrlr; 4600 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4601 4602 rc = spdk_bdev_register(&bdev->disk); 4603 if (rc != 0) { 4604 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4605 spdk_io_device_unregister(bdev, NULL); 4606 nvme_ns->bdev = NULL; 4607 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4608 nvme_bdev_free(bdev); 4609 return rc; 4610 } 4611 4612 return 0; 4613 } 4614 4615 static bool 4616 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4617 { 4618 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4619 const struct spdk_uuid *uuid1, *uuid2; 4620 4621 nsdata1 = spdk_nvme_ns_get_data(ns1); 4622 nsdata2 = spdk_nvme_ns_get_data(ns2); 4623 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4624 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4625 4626 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4627 nsdata1->eui64 == nsdata2->eui64 && 4628 ((uuid1 == NULL && uuid2 == NULL) || 4629 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4630 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4631 } 4632 4633 static bool 4634 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4635 struct spdk_nvme_ctrlr_opts *opts) 4636 { 4637 struct nvme_probe_skip_entry *entry; 4638 4639 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4640 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4641 return false; 4642 } 4643 } 4644 4645 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4646 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4647 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4648 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4649 opts->disable_read_ana_log_page = true; 4650 4651 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4652 4653 return true; 4654 } 4655 4656 static void 4657 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4658 { 4659 struct nvme_ctrlr *nvme_ctrlr = ctx; 4660 4661 if (spdk_nvme_cpl_is_error(cpl)) { 4662 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. sc is %u, sct is %u.\n", 4663 cpl->status.sc, cpl->status.sct); 4664 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4665 } else if (cpl->cdw0 & 0x1) { 4666 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n"); 4667 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4668 } 4669 } 4670 4671 static void 4672 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4673 struct spdk_nvme_qpair *qpair, uint16_t cid) 4674 { 4675 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4676 union spdk_nvme_csts_register csts; 4677 int rc; 4678 4679 assert(nvme_ctrlr->ctrlr == ctrlr); 4680 4681 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. 
ctrlr=%p qpair=%p cid=%u\n", 4682 ctrlr, qpair, cid); 4683 4684 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4685 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4686 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4687 * completion recursively. 4688 */ 4689 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4690 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4691 if (csts.bits.cfs) { 4692 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n"); 4693 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4694 return; 4695 } 4696 } 4697 4698 switch (g_opts.action_on_timeout) { 4699 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4700 if (qpair) { 4701 /* Don't send abort to ctrlr when ctrlr is not available. */ 4702 pthread_mutex_lock(&nvme_ctrlr->mutex); 4703 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4704 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4705 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n"); 4706 return; 4707 } 4708 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4709 4710 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4711 nvme_abort_cpl, nvme_ctrlr); 4712 if (rc == 0) { 4713 return; 4714 } 4715 4716 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. Resetting, rc is %d.\n", rc); 4717 } 4718 4719 /* FALLTHROUGH */ 4720 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4721 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4722 break; 4723 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4724 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n"); 4725 break; 4726 default: 4727 NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n"); 4728 break; 4729 } 4730 } 4731 4732 static struct nvme_ns * 4733 nvme_ns_alloc(void) 4734 { 4735 struct nvme_ns *nvme_ns; 4736 4737 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4738 if (nvme_ns == NULL) { 4739 return NULL; 4740 } 4741 4742 if (g_opts.io_path_stat) { 4743 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4744 if (nvme_ns->stat == NULL) { 4745 free(nvme_ns); 4746 return NULL; 4747 } 4748 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4749 } 4750 4751 return nvme_ns; 4752 } 4753 4754 static void 4755 nvme_ns_free(struct nvme_ns *nvme_ns) 4756 { 4757 free(nvme_ns->stat); 4758 free(nvme_ns); 4759 } 4760 4761 static void 4762 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4763 { 4764 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4765 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4766 4767 if (rc == 0) { 4768 nvme_ns->probe_ctx = NULL; 4769 pthread_mutex_lock(&nvme_ctrlr->mutex); 4770 nvme_ctrlr->ref++; 4771 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4772 } else { 4773 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4774 nvme_ns_free(nvme_ns); 4775 } 4776 4777 if (ctx) { 4778 ctx->populates_in_progress--; 4779 if (ctx->populates_in_progress == 0) { 4780 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4781 } 4782 } 4783 } 4784 4785 static void 4786 bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i, 4787 struct nvme_bdev *nbdev, 4788 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4789 { 4790 struct nvme_ns *nvme_ns = ctx; 4791 int rc; 4792 4793 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4794 if (rc != 0) { 4795 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4796 } 4797 4798 nvme_bdev_for_each_channel_continue(i, rc); 4799 } 4800 4801 static void 4802 
bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i, 4803 struct nvme_bdev *nbdev, 4804 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4805 { 4806 struct nvme_ns *nvme_ns = ctx; 4807 struct nvme_io_path *io_path; 4808 4809 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4810 if (io_path != NULL) { 4811 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4812 } 4813 4814 nvme_bdev_for_each_channel_continue(i, 0); 4815 } 4816 4817 static void 4818 bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status) 4819 { 4820 struct nvme_ns *nvme_ns = ctx; 4821 4822 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4823 } 4824 4825 static void 4826 bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4827 { 4828 struct nvme_ns *nvme_ns = ctx; 4829 4830 if (status == 0) { 4831 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4832 } else { 4833 /* Delete the added io_paths and fail populating the namespace. */ 4834 nvme_bdev_for_each_channel(nbdev, 4835 bdev_nvme_delete_io_path, 4836 nvme_ns, 4837 bdev_nvme_add_io_path_failed); 4838 } 4839 } 4840 4841 static int 4842 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4843 { 4844 struct nvme_ns *tmp_ns; 4845 const struct spdk_nvme_ns_data *nsdata; 4846 4847 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4848 if (!nsdata->nmic.can_share) { 4849 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4850 return -EINVAL; 4851 } 4852 4853 pthread_mutex_lock(&bdev->mutex); 4854 4855 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4856 assert(tmp_ns != NULL); 4857 4858 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4859 pthread_mutex_unlock(&bdev->mutex); 4860 SPDK_ERRLOG("Namespaces are not identical.\n"); 4861 return -EINVAL; 4862 } 4863 4864 bdev->ref++; 4865 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4866 nvme_ns->bdev = bdev; 4867 4868 pthread_mutex_unlock(&bdev->mutex); 4869 4870 /* Add nvme_io_path to nvme_bdev_channels dynamically. 
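	 * Every existing channel of this bdev gets an nvme_io_path for the newly added namespace;
	 * on failure, bdev_nvme_add_io_path_done() deletes the paths that were already added and
	 * fails the populate.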
*/ 4871 nvme_bdev_for_each_channel(bdev, 4872 bdev_nvme_add_io_path, 4873 nvme_ns, 4874 bdev_nvme_add_io_path_done); 4875 4876 return 0; 4877 } 4878 4879 static void 4880 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4881 { 4882 struct spdk_nvme_ns *ns; 4883 struct nvme_bdev *bdev; 4884 int rc = 0; 4885 4886 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4887 if (!ns) { 4888 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id); 4889 rc = -EINVAL; 4890 goto done; 4891 } 4892 4893 nvme_ns->ns = ns; 4894 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4895 4896 if (nvme_ctrlr->ana_log_page != NULL) { 4897 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4898 } 4899 4900 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4901 if (bdev == NULL) { 4902 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4903 } else { 4904 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4905 if (rc == 0) { 4906 return; 4907 } 4908 } 4909 done: 4910 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4911 } 4912 4913 static void 4914 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4915 { 4916 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4917 4918 assert(nvme_ctrlr != NULL); 4919 4920 pthread_mutex_lock(&nvme_ctrlr->mutex); 4921 4922 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4923 4924 if (nvme_ns->bdev != NULL) { 4925 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4926 return; 4927 } 4928 4929 nvme_ns_free(nvme_ns); 4930 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4931 4932 nvme_ctrlr_release(nvme_ctrlr); 4933 } 4934 4935 static void 4936 bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4937 { 4938 struct nvme_ns *nvme_ns = ctx; 4939 4940 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4941 } 4942 4943 static void 4944 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4945 { 4946 struct nvme_bdev *bdev; 4947 4948 spdk_poller_unregister(&nvme_ns->anatt_timer); 4949 4950 bdev = nvme_ns->bdev; 4951 if (bdev != NULL) { 4952 pthread_mutex_lock(&bdev->mutex); 4953 4954 assert(bdev->ref > 0); 4955 bdev->ref--; 4956 if (bdev->ref == 0) { 4957 pthread_mutex_unlock(&bdev->mutex); 4958 4959 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 4960 } else { 4961 /* spdk_bdev_unregister() is not called until the last nvme_ns is 4962 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 4963 * and clear nvme_ns->bdev here. 4964 */ 4965 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 4966 nvme_ns->bdev = NULL; 4967 4968 pthread_mutex_unlock(&bdev->mutex); 4969 4970 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 4971 * we call depopulate_namespace_done() to avoid use-after-free. 4972 */ 4973 nvme_bdev_for_each_channel(bdev, 4974 bdev_nvme_delete_io_path, 4975 nvme_ns, 4976 bdev_nvme_delete_io_path_done); 4977 return; 4978 } 4979 } 4980 4981 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4982 } 4983 4984 static void 4985 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 4986 struct nvme_async_probe_ctx *ctx) 4987 { 4988 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 4989 struct nvme_ns *nvme_ns, *next; 4990 struct spdk_nvme_ns *ns; 4991 struct nvme_bdev *bdev; 4992 uint32_t nsid; 4993 int rc; 4994 uint64_t num_sectors; 4995 4996 if (ctx) { 4997 /* Initialize this count to 1 to handle the populate functions 4998 * calling nvme_ctrlr_populate_namespace_done() immediately. 
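		 * The matching decrement after both namespace loops below ensures that
		 * nvme_ctrlr_populate_namespaces_done() is not invoked until every populate
		 * callback has completed.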
4999 */ 5000 ctx->populates_in_progress = 1; 5001 } 5002 5003 /* First loop over our existing namespaces and see if they have been 5004 * removed. */ 5005 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5006 while (nvme_ns != NULL) { 5007 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5008 5009 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 5010 /* NS is still there or added again. Its attributes may have changed. */ 5011 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 5012 if (nvme_ns->ns != ns) { 5013 assert(nvme_ns->ns == NULL); 5014 nvme_ns->ns = ns; 5015 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id); 5016 } 5017 5018 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 5019 bdev = nvme_ns->bdev; 5020 assert(bdev != NULL); 5021 if (bdev->disk.blockcnt != num_sectors) { 5022 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 5023 "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 5024 nvme_ns->id, 5025 bdev->disk.name, 5026 bdev->disk.blockcnt, 5027 num_sectors); 5028 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 5029 if (rc != 0) { 5030 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5031 "Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 5032 bdev->disk.name, rc); 5033 } 5034 } 5035 } else { 5036 /* Namespace was removed */ 5037 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5038 } 5039 5040 nvme_ns = next; 5041 } 5042 5043 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 5044 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5045 while (nsid != 0) { 5046 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5047 5048 if (nvme_ns == NULL) { 5049 /* Found a new one */ 5050 nvme_ns = nvme_ns_alloc(); 5051 if (nvme_ns == NULL) { 5052 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n"); 5053 /* This just fails to attach the namespace. It may work on a future attempt. */ 5054 continue; 5055 } 5056 5057 nvme_ns->id = nsid; 5058 nvme_ns->ctrlr = nvme_ctrlr; 5059 5060 nvme_ns->bdev = NULL; 5061 5062 if (ctx) { 5063 ctx->populates_in_progress++; 5064 } 5065 nvme_ns->probe_ctx = ctx; 5066 5067 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 5068 5069 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 5070 } 5071 5072 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 5073 } 5074 5075 if (ctx) { 5076 /* Decrement this count now that the loop is over to account 5077 * for the one we started with. If the count is then 0, we 5078 * know any populate_namespace functions completed immediately, 5079 * so we'll kick the callback here. 
5080 */ 5081 ctx->populates_in_progress--; 5082 if (ctx->populates_in_progress == 0) { 5083 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 5084 } 5085 } 5086 5087 } 5088 5089 static void 5090 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 5091 { 5092 struct nvme_ns *nvme_ns, *tmp; 5093 5094 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 5095 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5096 } 5097 } 5098 5099 static uint32_t 5100 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 5101 { 5102 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5103 const struct spdk_nvme_ctrlr_data *cdata; 5104 uint32_t nsid, ns_count = 0; 5105 5106 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5107 5108 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5109 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 5110 ns_count++; 5111 } 5112 5113 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5114 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 5115 sizeof(uint32_t); 5116 } 5117 5118 static int 5119 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 5120 void *cb_arg) 5121 { 5122 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 5123 struct nvme_ns *nvme_ns; 5124 uint32_t i, nsid; 5125 5126 for (i = 0; i < desc->num_of_nsid; i++) { 5127 nsid = desc->nsid[i]; 5128 if (nsid == 0) { 5129 continue; 5130 } 5131 5132 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5133 5134 if (nvme_ns == NULL) { 5135 /* Target told us that an inactive namespace had an ANA change */ 5136 continue; 5137 } 5138 5139 _nvme_ns_set_ana_state(nvme_ns, desc); 5140 } 5141 5142 return 0; 5143 } 5144 5145 static void 5146 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5147 { 5148 struct nvme_ns *nvme_ns; 5149 5150 spdk_free(nvme_ctrlr->ana_log_page); 5151 nvme_ctrlr->ana_log_page = NULL; 5152 5153 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5154 nvme_ns != NULL; 5155 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 5156 nvme_ns->ana_state_updating = false; 5157 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 5158 } 5159 } 5160 5161 static void 5162 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 5163 { 5164 struct nvme_ctrlr *nvme_ctrlr = ctx; 5165 5166 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 5167 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 5168 nvme_ctrlr); 5169 } else { 5170 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 5171 } 5172 5173 pthread_mutex_lock(&nvme_ctrlr->mutex); 5174 5175 assert(nvme_ctrlr->ana_log_page_updating == true); 5176 nvme_ctrlr->ana_log_page_updating = false; 5177 5178 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 5179 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5180 5181 nvme_ctrlr_unregister(nvme_ctrlr); 5182 } else { 5183 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5184 5185 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 5186 } 5187 } 5188 5189 static int 5190 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5191 { 5192 uint32_t ana_log_page_size; 5193 int rc; 5194 5195 if (nvme_ctrlr->ana_log_page == NULL) { 5196 return -EINVAL; 5197 } 5198 5199 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5200 5201 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5202 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5203 "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5204 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5205 return 
-EINVAL; 5206 } 5207 5208 pthread_mutex_lock(&nvme_ctrlr->mutex); 5209 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 5210 nvme_ctrlr->ana_log_page_updating) { 5211 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5212 return -EBUSY; 5213 } 5214 5215 nvme_ctrlr->ana_log_page_updating = true; 5216 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5217 5218 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 5219 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5220 SPDK_NVME_GLOBAL_NS_TAG, 5221 nvme_ctrlr->ana_log_page, 5222 ana_log_page_size, 0, 5223 nvme_ctrlr_read_ana_log_page_done, 5224 nvme_ctrlr); 5225 if (rc != 0) { 5226 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 5227 } 5228 5229 return rc; 5230 } 5231 5232 static void 5233 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5234 { 5235 } 5236 5237 struct bdev_nvme_set_preferred_path_ctx { 5238 struct spdk_bdev_desc *desc; 5239 struct nvme_ns *nvme_ns; 5240 bdev_nvme_set_preferred_path_cb cb_fn; 5241 void *cb_arg; 5242 }; 5243 5244 static void 5245 bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5246 { 5247 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5248 5249 assert(ctx != NULL); 5250 assert(ctx->desc != NULL); 5251 assert(ctx->cb_fn != NULL); 5252 5253 spdk_bdev_close(ctx->desc); 5254 5255 ctx->cb_fn(ctx->cb_arg, status); 5256 5257 free(ctx); 5258 } 5259 5260 static void 5261 _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i, 5262 struct nvme_bdev *nbdev, 5263 struct nvme_bdev_channel *nbdev_ch, void *_ctx) 5264 { 5265 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5266 struct nvme_io_path *io_path, *prev; 5267 5268 prev = NULL; 5269 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5270 if (io_path->nvme_ns == ctx->nvme_ns) { 5271 break; 5272 } 5273 prev = io_path; 5274 } 5275 5276 if (io_path != NULL) { 5277 if (prev != NULL) { 5278 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 5279 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 5280 } 5281 5282 /* We can set io_path to nbdev_ch->current_io_path directly here. 5283 * However, it needs to be conditional. To simplify the code, 5284 * just clear nbdev_ch->current_io_path and let find_io_path() 5285 * fill it. 5286 * 5287 * Automatic failback may be disabled. Hence even if the io_path is 5288 * already at the head, clear nbdev_ch->current_io_path. 5289 */ 5290 bdev_nvme_clear_current_io_path(nbdev_ch); 5291 } 5292 5293 nvme_bdev_for_each_channel_continue(i, 0); 5294 } 5295 5296 static struct nvme_ns * 5297 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5298 { 5299 struct nvme_ns *nvme_ns, *prev; 5300 const struct spdk_nvme_ctrlr_data *cdata; 5301 5302 prev = NULL; 5303 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5304 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5305 5306 if (cdata->cntlid == cntlid) { 5307 break; 5308 } 5309 prev = nvme_ns; 5310 } 5311 5312 if (nvme_ns != NULL && prev != NULL) { 5313 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5314 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5315 } 5316 5317 return nvme_ns; 5318 } 5319 5320 /* This function supports only multipath mode. There is only a single I/O path 5321 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5322 * head of the I/O path list for each NVMe bdev channel. 5323 * 5324 * NVMe bdev channel may be acquired after completing this function. 
move the 5325 * matched namespace to the head of the namespace list for the NVMe bdev too. 5326 */ 5327 void 5328 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5329 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5330 { 5331 struct bdev_nvme_set_preferred_path_ctx *ctx; 5332 struct spdk_bdev *bdev; 5333 struct nvme_bdev *nbdev; 5334 int rc = 0; 5335 5336 assert(cb_fn != NULL); 5337 5338 ctx = calloc(1, sizeof(*ctx)); 5339 if (ctx == NULL) { 5340 SPDK_ERRLOG("Failed to alloc context.\n"); 5341 rc = -ENOMEM; 5342 goto err_alloc; 5343 } 5344 5345 ctx->cb_fn = cb_fn; 5346 ctx->cb_arg = cb_arg; 5347 5348 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5349 if (rc != 0) { 5350 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5351 goto err_open; 5352 } 5353 5354 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5355 5356 if (bdev->module != &nvme_if) { 5357 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5358 rc = -ENODEV; 5359 goto err_bdev; 5360 } 5361 5362 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5363 5364 pthread_mutex_lock(&nbdev->mutex); 5365 5366 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5367 if (ctx->nvme_ns == NULL) { 5368 pthread_mutex_unlock(&nbdev->mutex); 5369 5370 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5371 rc = -ENODEV; 5372 goto err_bdev; 5373 } 5374 5375 pthread_mutex_unlock(&nbdev->mutex); 5376 5377 nvme_bdev_for_each_channel(nbdev, 5378 _bdev_nvme_set_preferred_path, 5379 ctx, 5380 bdev_nvme_set_preferred_path_done); 5381 return; 5382 5383 err_bdev: 5384 spdk_bdev_close(ctx->desc); 5385 err_open: 5386 free(ctx); 5387 err_alloc: 5388 cb_fn(cb_arg, rc); 5389 } 5390 5391 struct bdev_nvme_set_multipath_policy_ctx { 5392 struct spdk_bdev_desc *desc; 5393 spdk_bdev_nvme_set_multipath_policy_cb cb_fn; 5394 void *cb_arg; 5395 }; 5396 5397 static void 5398 bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5399 { 5400 struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx; 5401 5402 assert(ctx != NULL); 5403 assert(ctx->desc != NULL); 5404 assert(ctx->cb_fn != NULL); 5405 5406 spdk_bdev_close(ctx->desc); 5407 5408 ctx->cb_fn(ctx->cb_arg, status); 5409 5410 free(ctx); 5411 } 5412 5413 static void 5414 _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i, 5415 struct nvme_bdev *nbdev, 5416 struct nvme_bdev_channel *nbdev_ch, void *ctx) 5417 { 5418 nbdev_ch->mp_policy = nbdev->mp_policy; 5419 nbdev_ch->mp_selector = nbdev->mp_selector; 5420 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5421 bdev_nvme_clear_current_io_path(nbdev_ch); 5422 5423 nvme_bdev_for_each_channel_continue(i, 0); 5424 } 5425 5426 void 5427 spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy, 5428 enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5429 spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5430 { 5431 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5432 struct spdk_bdev *bdev; 5433 struct nvme_bdev *nbdev; 5434 int rc; 5435 5436 assert(cb_fn != NULL); 5437 5438 switch (policy) { 5439 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5440 break; 5441 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5442 switch (selector) { 5443 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5444 if (rr_min_io == UINT32_MAX) { 5445 rr_min_io = 1; 5446 } else if (rr_min_io == 0) { 5447 rc = -EINVAL; 5448 goto exit; 5449 } 5450 break; 5451 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5452 break; 5453 default: 
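			/* Reject unknown selectors for the active_active policy. */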
5454 rc = -EINVAL; 5455 goto exit; 5456 } 5457 break; 5458 default: 5459 rc = -EINVAL; 5460 goto exit; 5461 } 5462 5463 ctx = calloc(1, sizeof(*ctx)); 5464 if (ctx == NULL) { 5465 SPDK_ERRLOG("Failed to alloc context.\n"); 5466 rc = -ENOMEM; 5467 goto exit; 5468 } 5469 5470 ctx->cb_fn = cb_fn; 5471 ctx->cb_arg = cb_arg; 5472 5473 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5474 if (rc != 0) { 5475 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5476 rc = -ENODEV; 5477 goto err_open; 5478 } 5479 5480 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5481 if (bdev->module != &nvme_if) { 5482 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5483 rc = -ENODEV; 5484 goto err_module; 5485 } 5486 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5487 5488 pthread_mutex_lock(&nbdev->mutex); 5489 nbdev->mp_policy = policy; 5490 nbdev->mp_selector = selector; 5491 nbdev->rr_min_io = rr_min_io; 5492 pthread_mutex_unlock(&nbdev->mutex); 5493 5494 nvme_bdev_for_each_channel(nbdev, 5495 _bdev_nvme_set_multipath_policy, 5496 ctx, 5497 bdev_nvme_set_multipath_policy_done); 5498 return; 5499 5500 err_module: 5501 spdk_bdev_close(ctx->desc); 5502 err_open: 5503 free(ctx); 5504 exit: 5505 cb_fn(cb_arg, rc); 5506 } 5507 5508 static void 5509 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5510 { 5511 struct nvme_ctrlr *nvme_ctrlr = arg; 5512 union spdk_nvme_async_event_completion event; 5513 5514 if (spdk_nvme_cpl_is_error(cpl)) { 5515 SPDK_WARNLOG("AER request execute failed\n"); 5516 return; 5517 } 5518 5519 event.raw = cpl->cdw0; 5520 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5521 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5522 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5523 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5524 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5525 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5526 } 5527 } 5528 5529 static void 5530 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5531 { 5532 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5533 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5534 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5535 free(ctx); 5536 } 5537 5538 static void 5539 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5540 { 5541 if (ctx->cb_fn) { 5542 ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5543 } 5544 5545 ctx->namespaces_populated = true; 5546 if (ctx->probe_done) { 5547 /* The probe was already completed, so we need to free the context 5548 * here. This can happen for cases like OCSSD, where we need to 5549 * send additional commands to the SSD after attach. 
5550 */ 5551 free_nvme_async_probe_ctx(ctx); 5552 } 5553 } 5554 5555 static int 5556 bdev_nvme_remove_poller(void *ctx) 5557 { 5558 struct spdk_nvme_transport_id trid_pcie; 5559 5560 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5561 spdk_poller_unregister(&g_hotplug_poller); 5562 return SPDK_POLLER_IDLE; 5563 } 5564 5565 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5566 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5567 5568 if (spdk_nvme_scan_attached(&trid_pcie)) { 5569 SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n"); 5570 } 5571 5572 return SPDK_POLLER_BUSY; 5573 } 5574 5575 static void 5576 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5577 struct nvme_async_probe_ctx *ctx) 5578 { 5579 struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid; 5580 5581 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 5582 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n", 5583 trid->traddr, trid->trsvcid); 5584 } else { 5585 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n"); 5586 } 5587 5588 spdk_io_device_register(nvme_ctrlr, 5589 bdev_nvme_create_ctrlr_channel_cb, 5590 bdev_nvme_destroy_ctrlr_channel_cb, 5591 sizeof(struct nvme_ctrlr_channel), 5592 nvme_ctrlr->nbdev_ctrlr->name); 5593 5594 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5595 5596 if (g_hotplug_poller == NULL) { 5597 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5598 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5599 } 5600 } 5601 5602 static void 5603 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5604 { 5605 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5606 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5607 5608 nvme_ctrlr->probe_ctx = NULL; 5609 5610 if (spdk_nvme_cpl_is_error(cpl)) { 5611 nvme_ctrlr_delete(nvme_ctrlr); 5612 5613 if (ctx != NULL) { 5614 ctx->reported_bdevs = 0; 5615 populate_namespaces_cb(ctx, -1); 5616 } 5617 return; 5618 } 5619 5620 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5621 } 5622 5623 static int 5624 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5625 struct nvme_async_probe_ctx *ctx) 5626 { 5627 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5628 const struct spdk_nvme_ctrlr_data *cdata; 5629 uint32_t ana_log_page_size; 5630 5631 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5632 5633 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5634 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5635 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5636 sizeof(uint32_t); 5637 5638 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5639 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 5640 if (nvme_ctrlr->ana_log_page == NULL) { 5641 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n"); 5642 return -ENXIO; 5643 } 5644 5645 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5646 * Hence copy each descriptor to a temporary area when parsing it. 5647 * 5648 * Allocate a buffer whose size is as large as ANA log page buffer because 5649 * we do not know the size of a descriptor until actually reading it. 
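	 * The recorded max_ana_log_page_size is later used by nvme_ctrlr_read_ana_log_page()
	 * to validate the size of each subsequent read.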
5650 */ 5651 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5652 if (nvme_ctrlr->copied_ana_desc == NULL) { 5653 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n"); 5654 return -ENOMEM; 5655 } 5656 5657 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5658 5659 nvme_ctrlr->probe_ctx = ctx; 5660 5661 /* Then, set the read size only to include the current active namespaces. */ 5662 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5663 5664 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5665 NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5666 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5667 return -EINVAL; 5668 } 5669 5670 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5671 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5672 SPDK_NVME_GLOBAL_NS_TAG, 5673 nvme_ctrlr->ana_log_page, 5674 ana_log_page_size, 0, 5675 nvme_ctrlr_init_ana_log_page_done, 5676 nvme_ctrlr); 5677 } 5678 5679 /* hostnqn and subnqn were already verified before attaching a controller. 5680 * Hence check only the multipath capability and cntlid here. 5681 */ 5682 static bool 5683 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5684 { 5685 struct nvme_ctrlr *tmp; 5686 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5687 5688 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5689 5690 if (!cdata->cmic.multi_ctrlr) { 5691 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5692 return false; 5693 } 5694 5695 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5696 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5697 5698 if (!tmp_cdata->cmic.multi_ctrlr) { 5699 NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid); 5700 return false; 5701 } 5702 if (cdata->cntlid == tmp_cdata->cntlid) { 5703 NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5704 return false; 5705 } 5706 } 5707 5708 return true; 5709 } 5710 5711 5712 static int 5713 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5714 { 5715 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5716 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5717 struct nvme_ctrlr *nctrlr; 5718 int rc = 0; 5719 5720 pthread_mutex_lock(&g_bdev_nvme_mutex); 5721 5722 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5723 if (nbdev_ctrlr != NULL) { 5724 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5725 rc = -EINVAL; 5726 goto exit; 5727 } 5728 TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5729 if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) { 5730 /* All controllers with the same name must be configured the same 5731 * way, either for multipath or failover. If the configuration doesn't 5732 * match - report error. 
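				 * (Concretely, the multipath flag in the per-controller opts must be
				 * identical across all controllers grouped under this name.)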
5733 */ 5734 rc = -EINVAL; 5735 goto exit; 5736 } 5737 } 5738 } else { 5739 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5740 if (nbdev_ctrlr == NULL) { 5741 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n"); 5742 rc = -ENOMEM; 5743 goto exit; 5744 } 5745 nbdev_ctrlr->name = strdup(name); 5746 if (nbdev_ctrlr->name == NULL) { 5747 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n"); 5748 free(nbdev_ctrlr); 5749 goto exit; 5750 } 5751 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5752 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5753 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5754 } 5755 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5756 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5757 exit: 5758 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5759 return rc; 5760 } 5761 5762 static int 5763 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5764 const char *name, 5765 const struct spdk_nvme_transport_id *trid, 5766 struct nvme_async_probe_ctx *ctx) 5767 { 5768 struct nvme_ctrlr *nvme_ctrlr; 5769 struct nvme_path_id *path_id; 5770 const struct spdk_nvme_ctrlr_data *cdata; 5771 int rc; 5772 5773 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5774 if (nvme_ctrlr == NULL) { 5775 SPDK_ERRLOG("Failed to allocate device struct\n"); 5776 return -ENOMEM; 5777 } 5778 5779 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5780 if (rc != 0) { 5781 free(nvme_ctrlr); 5782 return rc; 5783 } 5784 5785 TAILQ_INIT(&nvme_ctrlr->trids); 5786 RB_INIT(&nvme_ctrlr->namespaces); 5787 5788 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5789 if (ctx != NULL) { 5790 if (ctx->drv_opts.tls_psk != NULL) { 5791 nvme_ctrlr->psk = spdk_keyring_get_key( 5792 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5793 if (nvme_ctrlr->psk == NULL) { 5794 /* Could only happen if the key was removed in the meantime */ 5795 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5796 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5797 rc = -ENOKEY; 5798 goto err; 5799 } 5800 } 5801 5802 if (ctx->drv_opts.dhchap_key != NULL) { 5803 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5804 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5805 if (nvme_ctrlr->dhchap_key == NULL) { 5806 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5807 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5808 rc = -ENOKEY; 5809 goto err; 5810 } 5811 } 5812 5813 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5814 nvme_ctrlr->dhchap_ctrlr_key = 5815 spdk_keyring_get_key( 5816 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5817 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5818 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5819 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5820 rc = -ENOKEY; 5821 goto err; 5822 } 5823 } 5824 } 5825 5826 path_id = calloc(1, sizeof(*path_id)); 5827 if (path_id == NULL) { 5828 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5829 rc = -ENOMEM; 5830 goto err; 5831 } 5832 5833 path_id->trid = *trid; 5834 if (ctx != NULL) { 5835 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5836 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5837 } 5838 nvme_ctrlr->active_path_id = path_id; 5839 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5840 5841 nvme_ctrlr->thread = spdk_get_thread(); 5842 nvme_ctrlr->ctrlr = ctrlr; 5843 nvme_ctrlr->ref = 1; 5844 5845 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5846 SPDK_ERRLOG("OCSSDs are not 
supported"); 5847 rc = -ENOTSUP; 5848 goto err; 5849 } 5850 5851 if (ctx != NULL) { 5852 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5853 } else { 5854 spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5855 } 5856 5857 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5858 g_opts.nvme_adminq_poll_period_us); 5859 5860 if (g_opts.timeout_us > 0) { 5861 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5862 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5863 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 5864 g_opts.timeout_us : g_opts.timeout_admin_us; 5865 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5866 adm_timeout_us, timeout_cb, nvme_ctrlr); 5867 } 5868 5869 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5870 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5871 5872 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5873 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5874 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5875 } 5876 5877 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5878 if (rc != 0) { 5879 goto err; 5880 } 5881 5882 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5883 5884 if (cdata->cmic.ana_reporting) { 5885 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5886 if (rc == 0) { 5887 return 0; 5888 } 5889 } else { 5890 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5891 return 0; 5892 } 5893 5894 err: 5895 nvme_ctrlr_delete(nvme_ctrlr); 5896 return rc; 5897 } 5898 5899 void 5900 spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts) 5901 { 5902 opts->prchk_flags = 0; 5903 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5904 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5905 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5906 opts->multipath = true; 5907 } 5908 5909 static void 5910 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5911 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5912 { 5913 char *name; 5914 5915 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 5916 if (!name) { 5917 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 5918 return; 5919 } 5920 5921 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 5922 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 5923 } else { 5924 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 5925 } 5926 5927 free(name); 5928 } 5929 5930 static void 5931 _nvme_ctrlr_destruct(void *ctx) 5932 { 5933 struct nvme_ctrlr *nvme_ctrlr = ctx; 5934 5935 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 5936 nvme_ctrlr_release(nvme_ctrlr); 5937 } 5938 5939 static int 5940 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 5941 { 5942 struct nvme_probe_skip_entry *entry; 5943 5944 /* The controller's destruction was already started */ 5945 if (nvme_ctrlr->destruct) { 5946 return -EALREADY; 5947 } 5948 5949 if (!hotplug && 5950 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 5951 entry = calloc(1, sizeof(*entry)); 5952 if (!entry) { 5953 return -ENOMEM; 5954 } 5955 entry->trid = nvme_ctrlr->active_path_id->trid; 5956 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 5957 } 5958 5959 nvme_ctrlr->destruct = true; 5960 return 0; 5961 } 5962 5963 static int 5964 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, 
bool hotplug) 5965 { 5966 int rc; 5967 5968 pthread_mutex_lock(&nvme_ctrlr->mutex); 5969 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 5970 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5971 5972 if (rc == 0) { 5973 _nvme_ctrlr_destruct(nvme_ctrlr); 5974 } else if (rc == -EALREADY) { 5975 rc = 0; 5976 } 5977 5978 return rc; 5979 } 5980 5981 static void 5982 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 5983 { 5984 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 5985 5986 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 5987 } 5988 5989 static int 5990 bdev_nvme_hotplug_probe(void *arg) 5991 { 5992 if (g_hotplug_probe_ctx == NULL) { 5993 spdk_poller_unregister(&g_hotplug_probe_poller); 5994 return SPDK_POLLER_IDLE; 5995 } 5996 5997 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 5998 g_hotplug_probe_ctx = NULL; 5999 spdk_poller_unregister(&g_hotplug_probe_poller); 6000 } 6001 6002 return SPDK_POLLER_BUSY; 6003 } 6004 6005 static int 6006 bdev_nvme_hotplug(void *arg) 6007 { 6008 struct spdk_nvme_transport_id trid_pcie; 6009 6010 if (g_hotplug_probe_ctx) { 6011 return SPDK_POLLER_BUSY; 6012 } 6013 6014 memset(&trid_pcie, 0, sizeof(trid_pcie)); 6015 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 6016 6017 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 6018 hotplug_probe_cb, attach_cb, NULL); 6019 6020 if (g_hotplug_probe_ctx) { 6021 assert(g_hotplug_probe_poller == NULL); 6022 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 6023 } 6024 6025 return SPDK_POLLER_BUSY; 6026 } 6027 6028 void 6029 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 6030 { 6031 *opts = g_opts; 6032 } 6033 6034 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6035 uint32_t reconnect_delay_sec, 6036 uint32_t fast_io_fail_timeout_sec); 6037 6038 static int 6039 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 6040 { 6041 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 6042 /* Can't set timeout_admin_us without also setting timeout_us */ 6043 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 6044 return -EINVAL; 6045 } 6046 6047 if (opts->bdev_retry_count < -1) { 6048 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 6049 return -EINVAL; 6050 } 6051 6052 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 6053 opts->reconnect_delay_sec, 6054 opts->fast_io_fail_timeout_sec)) { 6055 return -EINVAL; 6056 } 6057 6058 return 0; 6059 } 6060 6061 int 6062 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 6063 { 6064 int ret; 6065 6066 ret = bdev_nvme_validate_opts(opts); 6067 if (ret) { 6068 SPDK_WARNLOG("Failed to set nvme opts.\n"); 6069 return ret; 6070 } 6071 6072 if (g_bdev_nvme_init_thread != NULL) { 6073 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6074 return -EPERM; 6075 } 6076 } 6077 6078 if (opts->rdma_srq_size != 0 || 6079 opts->rdma_max_cq_size != 0 || 6080 opts->rdma_cm_event_timeout_ms != 0) { 6081 struct spdk_nvme_transport_opts drv_opts; 6082 6083 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 6084 if (opts->rdma_srq_size != 0) { 6085 drv_opts.rdma_srq_size = opts->rdma_srq_size; 6086 } 6087 if (opts->rdma_max_cq_size != 0) { 6088 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 6089 } 6090 if (opts->rdma_cm_event_timeout_ms != 0) { 6091 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 6092 } 6093 6094 ret = 
spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 6095 if (ret) { 6096 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 6097 return ret; 6098 } 6099 } 6100 6101 g_opts = *opts; 6102 6103 return 0; 6104 } 6105 6106 struct set_nvme_hotplug_ctx { 6107 uint64_t period_us; 6108 bool enabled; 6109 spdk_msg_fn fn; 6110 void *fn_ctx; 6111 }; 6112 6113 static void 6114 set_nvme_hotplug_period_cb(void *_ctx) 6115 { 6116 struct set_nvme_hotplug_ctx *ctx = _ctx; 6117 6118 spdk_poller_unregister(&g_hotplug_poller); 6119 if (ctx->enabled) { 6120 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 6121 } else { 6122 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 6123 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 6124 } 6125 6126 g_nvme_hotplug_poll_period_us = ctx->period_us; 6127 g_nvme_hotplug_enabled = ctx->enabled; 6128 if (ctx->fn) { 6129 ctx->fn(ctx->fn_ctx); 6130 } 6131 6132 free(ctx); 6133 } 6134 6135 int 6136 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 6137 { 6138 struct set_nvme_hotplug_ctx *ctx; 6139 6140 if (enabled == true && !spdk_process_is_primary()) { 6141 return -EPERM; 6142 } 6143 6144 ctx = calloc(1, sizeof(*ctx)); 6145 if (ctx == NULL) { 6146 return -ENOMEM; 6147 } 6148 6149 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 6150 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 6151 ctx->enabled = enabled; 6152 ctx->fn = cb; 6153 ctx->fn_ctx = cb_ctx; 6154 6155 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 6156 return 0; 6157 } 6158 6159 static void 6160 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 6161 struct nvme_async_probe_ctx *ctx) 6162 { 6163 struct nvme_ns *nvme_ns; 6164 struct nvme_bdev *nvme_bdev; 6165 size_t j; 6166 6167 assert(nvme_ctrlr != NULL); 6168 6169 if (ctx->names == NULL) { 6170 ctx->reported_bdevs = 0; 6171 populate_namespaces_cb(ctx, 0); 6172 return; 6173 } 6174 6175 /* 6176 * Report the new bdevs that were created in this call. 6177 * There can be more than one bdev per NVMe controller. 6178 */ 6179 j = 0; 6180 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 6181 while (nvme_ns != NULL) { 6182 nvme_bdev = nvme_ns->bdev; 6183 if (j < ctx->max_bdevs) { 6184 ctx->names[j] = nvme_bdev->disk.name; 6185 j++; 6186 } else { 6187 NVME_CTRLR_ERRLOG(nvme_ctrlr, 6188 "Maximum number of namespaces supported per NVMe controller is %du. " 6189 "Unable to return all names of created bdevs\n", 6190 ctx->max_bdevs); 6191 ctx->reported_bdevs = 0; 6192 populate_namespaces_cb(ctx, -ERANGE); 6193 return; 6194 } 6195 6196 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 6197 } 6198 6199 ctx->reported_bdevs = j; 6200 populate_namespaces_cb(ctx, 0); 6201 } 6202 6203 static int 6204 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6205 struct spdk_nvme_ctrlr *new_ctrlr, 6206 struct spdk_nvme_transport_id *trid) 6207 { 6208 struct nvme_path_id *tmp_trid; 6209 6210 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6211 NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n"); 6212 return -ENOTSUP; 6213 } 6214 6215 /* Currently we only support failover to the same transport type. 
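 * For example, adding an RDMA path to a controller whose active path uses TCP is rejected below with -EINVAL.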
*/ 6216 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 6217 NVME_CTRLR_WARNLOG(nvme_ctrlr, 6218 "Failover from trtype: %s to a different trtype: %s is not supported currently\n", 6219 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 6220 spdk_nvme_transport_id_trtype_str(trid->trtype)); 6221 return -EINVAL; 6222 } 6223 6224 6225 /* Currently we only support failover to the same NQN. */ 6226 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 6227 NVME_CTRLR_WARNLOG(nvme_ctrlr, 6228 "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 6229 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 6230 return -EINVAL; 6231 } 6232 6233 /* Skip all the other checks if we've already registered this path. */ 6234 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 6235 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 6236 NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n", 6237 trid->traddr, trid->subnqn); 6238 return -EALREADY; 6239 } 6240 } 6241 6242 return 0; 6243 } 6244 6245 static int 6246 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 6247 struct spdk_nvme_ctrlr *new_ctrlr) 6248 { 6249 struct nvme_ns *nvme_ns; 6250 struct spdk_nvme_ns *new_ns; 6251 6252 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 6253 while (nvme_ns != NULL) { 6254 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 6255 assert(new_ns != NULL); 6256 6257 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 6258 return -EINVAL; 6259 } 6260 6261 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 6262 } 6263 6264 return 0; 6265 } 6266 6267 static int 6268 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6269 struct spdk_nvme_transport_id *trid) 6270 { 6271 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 6272 6273 new_trid = calloc(1, sizeof(*new_trid)); 6274 if (new_trid == NULL) { 6275 return -ENOMEM; 6276 } 6277 new_trid->trid = *trid; 6278 6279 active_id = nvme_ctrlr->active_path_id; 6280 assert(active_id != NULL); 6281 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 6282 6283 /* Skip the active trid not to replace it until it is failed. */ 6284 tmp_trid = TAILQ_NEXT(active_id, link); 6285 if (tmp_trid == NULL) { 6286 goto add_tail; 6287 } 6288 6289 /* It means the trid is faled if its last failed time is non-zero. 6290 * Insert the new alternate trid before any failed trid. 6291 */ 6292 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 6293 if (tmp_trid->last_failed_tsc != 0) { 6294 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 6295 return 0; 6296 } 6297 } 6298 6299 add_tail: 6300 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 6301 return 0; 6302 } 6303 6304 /* This is the case that a secondary path is added to an existing 6305 * nvme_ctrlr for failover. After checking if it can access the same 6306 * namespaces as the primary path, it is disconnected until failover occurs. 
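 * Note that the temporary spdk_nvme_ctrlr handle passed in is always detached before returning; only its transport ID is recorded in the trid list.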
6307 */ 6308 static int 6309 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6310 struct spdk_nvme_ctrlr *new_ctrlr, 6311 struct spdk_nvme_transport_id *trid) 6312 { 6313 int rc; 6314 6315 assert(nvme_ctrlr != NULL); 6316 6317 pthread_mutex_lock(&nvme_ctrlr->mutex); 6318 6319 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 6320 if (rc != 0) { 6321 goto exit; 6322 } 6323 6324 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 6325 if (rc != 0) { 6326 goto exit; 6327 } 6328 6329 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 6330 6331 exit: 6332 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6333 6334 spdk_nvme_detach(new_ctrlr); 6335 6336 return rc; 6337 } 6338 6339 static void 6340 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6341 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6342 { 6343 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6344 struct nvme_async_probe_ctx *ctx; 6345 int rc; 6346 6347 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6348 ctx->ctrlr_attached = true; 6349 6350 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6351 if (rc != 0) { 6352 ctx->reported_bdevs = 0; 6353 populate_namespaces_cb(ctx, rc); 6354 } 6355 } 6356 6357 6358 static void 6359 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6360 struct spdk_nvme_ctrlr *ctrlr, 6361 const struct spdk_nvme_ctrlr_opts *opts) 6362 { 6363 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6364 struct nvme_ctrlr *nvme_ctrlr; 6365 struct nvme_async_probe_ctx *ctx; 6366 int rc; 6367 6368 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6369 ctx->ctrlr_attached = true; 6370 6371 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6372 if (nvme_ctrlr) { 6373 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6374 } else { 6375 rc = -ENODEV; 6376 } 6377 6378 ctx->reported_bdevs = 0; 6379 populate_namespaces_cb(ctx, rc); 6380 } 6381 6382 static int 6383 bdev_nvme_async_poll(void *arg) 6384 { 6385 struct nvme_async_probe_ctx *ctx = arg; 6386 int rc; 6387 6388 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6389 if (spdk_unlikely(rc != -EAGAIN)) { 6390 ctx->probe_done = true; 6391 spdk_poller_unregister(&ctx->poller); 6392 if (!ctx->ctrlr_attached) { 6393 /* The probe is done, but no controller was attached. 6394 * That means we had a failure, so report -EIO back to 6395 * the caller (usually the RPC). populate_namespaces_cb() 6396 * will take care of freeing the nvme_async_probe_ctx. 6397 */ 6398 ctx->reported_bdevs = 0; 6399 populate_namespaces_cb(ctx, -EIO); 6400 } else if (ctx->namespaces_populated) { 6401 /* The namespaces for the attached controller were all 6402 * populated and the response was already sent to the 6403 * caller (usually the RPC). So free the context here. 
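 * Otherwise the context stays alive and is released by the namespace population completion path once it sees that the probe has finished.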
6404 */ 6405 free_nvme_async_probe_ctx(ctx); 6406 } 6407 } 6408 6409 return SPDK_POLLER_BUSY; 6410 } 6411 6412 static bool 6413 bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6414 uint32_t reconnect_delay_sec, 6415 uint32_t fast_io_fail_timeout_sec) 6416 { 6417 if (ctrlr_loss_timeout_sec < -1) { 6418 SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n"); 6419 return false; 6420 } else if (ctrlr_loss_timeout_sec == -1) { 6421 if (reconnect_delay_sec == 0) { 6422 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6423 return false; 6424 } else if (fast_io_fail_timeout_sec != 0 && 6425 fast_io_fail_timeout_sec < reconnect_delay_sec) { 6426 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6427 return false; 6428 } 6429 } else if (ctrlr_loss_timeout_sec != 0) { 6430 if (reconnect_delay_sec == 0) { 6431 SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n"); 6432 return false; 6433 } else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6434 SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6435 return false; 6436 } else if (fast_io_fail_timeout_sec != 0) { 6437 if (fast_io_fail_timeout_sec < reconnect_delay_sec) { 6438 SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n"); 6439 return false; 6440 } else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) { 6441 SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n"); 6442 return false; 6443 } 6444 } 6445 } else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) { 6446 SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n"); 6447 return false; 6448 } 6449 6450 return true; 6451 } 6452 6453 int 6454 spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid, 6455 const char *base_name, 6456 const char **names, 6457 uint32_t count, 6458 spdk_bdev_nvme_create_cb cb_fn, 6459 void *cb_ctx, 6460 struct spdk_nvme_ctrlr_opts *drv_opts, 6461 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts) 6462 { 6463 struct nvme_probe_skip_entry *entry, *tmp; 6464 struct nvme_async_probe_ctx *ctx; 6465 spdk_nvme_attach_cb attach_cb; 6466 struct nvme_ctrlr *nvme_ctrlr; 6467 int len; 6468 6469 /* TODO expand this check to include both the host and target TRIDs. 6470 * Only if both are the same should we fail.
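 * Today a controller matching on target TRID plus hostnqn alone is rejected with -EEXIST, even if the host-side TRID differs.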
6471 */ 6472 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6473 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6474 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6475 return -EEXIST; 6476 } 6477 6478 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6479 6480 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6481 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6482 return -EINVAL; 6483 } 6484 6485 if (bdev_opts != NULL && 6486 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6487 bdev_opts->reconnect_delay_sec, 6488 bdev_opts->fast_io_fail_timeout_sec)) { 6489 return -EINVAL; 6490 } 6491 6492 ctx = calloc(1, sizeof(*ctx)); 6493 if (!ctx) { 6494 return -ENOMEM; 6495 } 6496 ctx->base_name = base_name; 6497 ctx->names = names; 6498 ctx->max_bdevs = count; 6499 ctx->cb_fn = cb_fn; 6500 ctx->cb_ctx = cb_ctx; 6501 ctx->trid = *trid; 6502 6503 if (bdev_opts) { 6504 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6505 } else { 6506 spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6507 } 6508 6509 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6510 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6511 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6512 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6513 free(entry); 6514 break; 6515 } 6516 } 6517 } 6518 6519 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6520 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6521 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6522 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6523 ctx->drv_opts.disable_read_ana_log_page = true; 6524 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6525 6526 if (ctx->bdev_opts.psk != NULL) { 6527 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6528 if (ctx->drv_opts.tls_psk == NULL) { 6529 SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk); 6530 free_nvme_async_probe_ctx(ctx); 6531 return -ENOKEY; 6532 } 6533 } 6534 6535 if (ctx->bdev_opts.dhchap_key != NULL) { 6536 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6537 if (ctx->drv_opts.dhchap_key == NULL) { 6538 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6539 ctx->bdev_opts.dhchap_key); 6540 free_nvme_async_probe_ctx(ctx); 6541 return -ENOKEY; 6542 } 6543 6544 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6545 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6546 } 6547 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6548 ctx->drv_opts.dhchap_ctrlr_key = 6549 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6550 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6551 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6552 ctx->bdev_opts.dhchap_ctrlr_key); 6553 free_nvme_async_probe_ctx(ctx); 6554 return -ENOKEY; 6555 } 6556 } 6557 6558 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) { 6559 attach_cb = connect_attach_cb; 6560 } else { 6561 attach_cb = connect_set_failover_cb; 6562 } 6563 6564 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6565 if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) { 6566 /* All controllers with the same name must be configured the same 6567 * way, either for multipath or failover. If the configuration doesn't 6568 * match - report error. 
6569 */ 6570 free_nvme_async_probe_ctx(ctx); 6571 return -EINVAL; 6572 } 6573 6574 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6575 if (ctx->probe_ctx == NULL) { 6576 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6577 free_nvme_async_probe_ctx(ctx); 6578 return -ENODEV; 6579 } 6580 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6581 6582 return 0; 6583 } 6584 6585 struct bdev_nvme_delete_ctx { 6586 char *name; 6587 struct nvme_path_id path_id; 6588 bdev_nvme_delete_done_fn delete_done; 6589 void *delete_done_ctx; 6590 uint64_t timeout_ticks; 6591 struct spdk_poller *poller; 6592 }; 6593 6594 static void 6595 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6596 { 6597 if (ctx != NULL) { 6598 free(ctx->name); 6599 free(ctx); 6600 } 6601 } 6602 6603 static bool 6604 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6605 { 6606 if (path_id->trid.trtype != 0) { 6607 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6608 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6609 return false; 6610 } 6611 } else { 6612 if (path_id->trid.trtype != p->trid.trtype) { 6613 return false; 6614 } 6615 } 6616 } 6617 6618 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6619 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6620 return false; 6621 } 6622 } 6623 6624 if (path_id->trid.adrfam != 0) { 6625 if (path_id->trid.adrfam != p->trid.adrfam) { 6626 return false; 6627 } 6628 } 6629 6630 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6631 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6632 return false; 6633 } 6634 } 6635 6636 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6637 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6638 return false; 6639 } 6640 } 6641 6642 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6643 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6644 return false; 6645 } 6646 } 6647 6648 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6649 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6650 return false; 6651 } 6652 } 6653 6654 return true; 6655 } 6656 6657 static bool 6658 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6659 { 6660 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6661 struct nvme_ctrlr *ctrlr; 6662 struct nvme_path_id *p; 6663 6664 pthread_mutex_lock(&g_bdev_nvme_mutex); 6665 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6666 if (!nbdev_ctrlr) { 6667 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6668 return false; 6669 } 6670 6671 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6672 pthread_mutex_lock(&ctrlr->mutex); 6673 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6674 if (nvme_path_id_compare(p, path_id)) { 6675 pthread_mutex_unlock(&ctrlr->mutex); 6676 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6677 return true; 6678 } 6679 } 6680 pthread_mutex_unlock(&ctrlr->mutex); 6681 } 6682 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6683 6684 return false; 6685 } 6686 6687 static int 6688 bdev_nvme_delete_complete_poll(void *arg) 6689 { 6690 struct bdev_nvme_delete_ctx *ctx = arg; 6691 int rc = 0; 6692 6693 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6694 if (ctx->timeout_ticks > spdk_get_ticks()) { 6695 return SPDK_POLLER_BUSY; 6696 } 6697 6698 
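/* The path outlived the grace period armed in bdev_nvme_delete() (10 seconds from submission), so report the timeout to the caller. */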
SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6699 rc = -ETIMEDOUT; 6700 } 6701 6702 spdk_poller_unregister(&ctx->poller); 6703 6704 ctx->delete_done(ctx->delete_done_ctx, rc); 6705 free_bdev_nvme_delete_ctx(ctx); 6706 6707 return SPDK_POLLER_BUSY; 6708 } 6709 6710 static int 6711 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6712 { 6713 struct nvme_path_id *p, *t; 6714 spdk_msg_fn msg_fn; 6715 int rc = -ENXIO; 6716 6717 pthread_mutex_lock(&nvme_ctrlr->mutex); 6718 6719 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6720 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6721 break; 6722 } 6723 6724 if (!nvme_path_id_compare(p, path_id)) { 6725 continue; 6726 } 6727 6728 /* We are not using the specified path. */ 6729 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6730 free(p); 6731 rc = 0; 6732 } 6733 6734 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6735 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6736 return rc; 6737 } 6738 6739 /* If we made it here, then this path is a match! Now we need to remove it. */ 6740 6741 /* This is the active path in use right now. The active path is always the first in the list. */ 6742 assert(p == nvme_ctrlr->active_path_id); 6743 6744 if (!TAILQ_NEXT(p, link)) { 6745 /* The current path is the only path. */ 6746 msg_fn = _nvme_ctrlr_destruct; 6747 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6748 } else { 6749 /* There is an alternative path. */ 6750 msg_fn = _bdev_nvme_reset_ctrlr; 6751 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6752 } 6753 6754 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6755 6756 if (rc == 0) { 6757 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6758 } else if (rc == -EALREADY) { 6759 rc = 0; 6760 } 6761 6762 return rc; 6763 } 6764 6765 int 6766 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6767 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6768 { 6769 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6770 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6771 struct bdev_nvme_delete_ctx *ctx = NULL; 6772 int rc = -ENXIO, _rc; 6773 6774 if (name == NULL || path_id == NULL) { 6775 rc = -EINVAL; 6776 goto exit; 6777 } 6778 6779 pthread_mutex_lock(&g_bdev_nvme_mutex); 6780 6781 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6782 if (nbdev_ctrlr == NULL) { 6783 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6784 6785 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6786 rc = -ENODEV; 6787 goto exit; 6788 } 6789 6790 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6791 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6792 if (_rc < 0 && _rc != -ENXIO) { 6793 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6794 rc = _rc; 6795 goto exit; 6796 } else if (_rc == 0) { 6797 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6798 * was deleted successfully. To remember the successful deletion, 6799 * overwrite rc only if _rc is zero. 
6800 */ 6801 rc = 0; 6802 } 6803 } 6804 6805 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6806 6807 if (rc != 0 || delete_done == NULL) { 6808 goto exit; 6809 } 6810 6811 ctx = calloc(1, sizeof(*ctx)); 6812 if (ctx == NULL) { 6813 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 6814 rc = -ENOMEM; 6815 goto exit; 6816 } 6817 6818 ctx->name = strdup(name); 6819 if (ctx->name == NULL) { 6820 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 6821 rc = -ENOMEM; 6822 goto exit; 6823 } 6824 6825 ctx->delete_done = delete_done; 6826 ctx->delete_done_ctx = delete_done_ctx; 6827 ctx->path_id = *path_id; 6828 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 6829 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 6830 if (ctx->poller == NULL) { 6831 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 6832 rc = -ENOMEM; 6833 goto exit; 6834 } 6835 6836 exit: 6837 if (rc != 0) { 6838 free_bdev_nvme_delete_ctx(ctx); 6839 } 6840 6841 return rc; 6842 } 6843 6844 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 6845 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6846 6847 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 6848 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 6849 6850 struct discovery_entry_ctx { 6851 char name[128]; 6852 struct spdk_nvme_transport_id trid; 6853 struct spdk_nvme_ctrlr_opts drv_opts; 6854 struct spdk_nvmf_discovery_log_page_entry entry; 6855 TAILQ_ENTRY(discovery_entry_ctx) tailq; 6856 struct discovery_ctx *ctx; 6857 }; 6858 6859 struct discovery_ctx { 6860 char *name; 6861 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 6862 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 6863 void *cb_ctx; 6864 struct spdk_nvme_probe_ctx *probe_ctx; 6865 struct spdk_nvme_detach_ctx *detach_ctx; 6866 struct spdk_nvme_ctrlr *ctrlr; 6867 struct spdk_nvme_transport_id trid; 6868 struct discovery_entry_ctx *entry_ctx_in_use; 6869 struct spdk_poller *poller; 6870 struct spdk_nvme_ctrlr_opts drv_opts; 6871 struct spdk_bdev_nvme_ctrlr_opts bdev_opts; 6872 struct spdk_nvmf_discovery_log_page *log_page; 6873 TAILQ_ENTRY(discovery_ctx) tailq; 6874 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 6875 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 6876 int rc; 6877 bool wait_for_attach; 6878 uint64_t timeout_ticks; 6879 /* Denotes that the discovery service is being started. We're waiting 6880 * for the initial connection to the discovery controller to be 6881 * established and attach discovered NVM ctrlrs. 6882 */ 6883 bool initializing; 6884 /* Denotes if a discovery is currently in progress for this context. 6885 * That includes connecting to newly discovered subsystems. Used to 6886 * ensure we do not start a new discovery until an existing one is 6887 * complete. 6888 */ 6889 bool in_progress; 6890 6891 /* Denotes if another discovery is needed after the one in progress 6892 * completes. Set when we receive an AER completion while a discovery 6893 * is already in progress. 6894 */ 6895 bool pending; 6896 6897 /* Signal to the discovery context poller that it should stop the 6898 * discovery service, including detaching from the current discovery 6899 * controller. 6900 */ 6901 bool stop; 6902 6903 struct spdk_thread *calling_thread; 6904 uint32_t index; 6905 uint32_t attach_in_progress; 6906 char *hostnqn; 6907 6908 /* Denotes if the discovery service was started by the mdns discovery. 
6909 */ 6910 bool from_mdns_discovery_service; 6911 }; 6912 6913 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 6914 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 6915 6916 static void get_discovery_log_page(struct discovery_ctx *ctx); 6917 6918 static void 6919 free_discovery_ctx(struct discovery_ctx *ctx) 6920 { 6921 free(ctx->log_page); 6922 free(ctx->hostnqn); 6923 free(ctx->name); 6924 free(ctx); 6925 } 6926 6927 static void 6928 discovery_complete(struct discovery_ctx *ctx) 6929 { 6930 ctx->initializing = false; 6931 ctx->in_progress = false; 6932 if (ctx->pending) { 6933 ctx->pending = false; 6934 get_discovery_log_page(ctx); 6935 } 6936 } 6937 6938 static void 6939 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 6940 struct spdk_nvmf_discovery_log_page_entry *entry) 6941 { 6942 char *space; 6943 6944 trid->trtype = entry->trtype; 6945 trid->adrfam = entry->adrfam; 6946 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 6947 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 6948 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 6949 * before call to this function trid->subnqn is zeroed out, we need 6950 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 6951 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 6952 */ 6953 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 6954 6955 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 6956 * But the log page entries typically pad them with spaces, not zeroes. 6957 * So add a NULL terminator to each of these fields at the appropriate 6958 * location. 6959 */ 6960 space = strchr(trid->traddr, ' '); 6961 if (space) { 6962 *space = 0; 6963 } 6964 space = strchr(trid->trsvcid, ' '); 6965 if (space) { 6966 *space = 0; 6967 } 6968 space = strchr(trid->subnqn, ' '); 6969 if (space) { 6970 *space = 0; 6971 } 6972 } 6973 6974 static void 6975 _stop_discovery(void *_ctx) 6976 { 6977 struct discovery_ctx *ctx = _ctx; 6978 6979 if (ctx->attach_in_progress > 0) { 6980 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 6981 return; 6982 } 6983 6984 ctx->stop = true; 6985 6986 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 6987 struct discovery_entry_ctx *entry_ctx; 6988 struct nvme_path_id path = {}; 6989 6990 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 6991 path.trid = entry_ctx->trid; 6992 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 6993 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 6994 free(entry_ctx); 6995 } 6996 6997 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 6998 struct discovery_entry_ctx *entry_ctx; 6999 7000 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7001 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7002 free(entry_ctx); 7003 } 7004 7005 free(ctx->entry_ctx_in_use); 7006 ctx->entry_ctx_in_use = NULL; 7007 } 7008 7009 static void 7010 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7011 { 7012 ctx->stop_cb_fn = cb_fn; 7013 ctx->cb_ctx = cb_ctx; 7014 7015 if (ctx->attach_in_progress > 0) { 7016 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 7017 ctx->attach_in_progress); 7018 } 7019 7020 _stop_discovery(ctx); 7021 } 7022 7023 static void 7024 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 7025 { 7026 struct discovery_ctx *d_ctx; 7027 struct nvme_path_id *path_id; 7028 struct spdk_nvme_transport_id 
trid = {}; 7029 struct discovery_entry_ctx *entry_ctx, *tmp; 7030 7031 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 7032 7033 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7034 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 7035 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 7036 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 7037 continue; 7038 } 7039 7040 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 7041 free(entry_ctx); 7042 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 7043 trid.subnqn, trid.traddr, trid.trsvcid); 7044 7045 /* Fail discovery ctrlr to force reattach attempt */ 7046 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 7047 } 7048 } 7049 } 7050 7051 static void 7052 discovery_remove_controllers(struct discovery_ctx *ctx) 7053 { 7054 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 7055 struct discovery_entry_ctx *entry_ctx, *tmp; 7056 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7057 struct spdk_nvme_transport_id old_trid = {}; 7058 uint64_t numrec, i; 7059 bool found; 7060 7061 numrec = from_le64(&log_page->numrec); 7062 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 7063 found = false; 7064 old_entry = &entry_ctx->entry; 7065 build_trid_from_log_page_entry(&old_trid, old_entry); 7066 for (i = 0; i < numrec; i++) { 7067 new_entry = &log_page->entries[i]; 7068 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 7069 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 7070 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7071 found = true; 7072 break; 7073 } 7074 } 7075 if (!found) { 7076 struct nvme_path_id path = {}; 7077 7078 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 7079 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7080 7081 path.trid = entry_ctx->trid; 7082 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7083 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7084 free(entry_ctx); 7085 } 7086 } 7087 free(log_page); 7088 ctx->log_page = NULL; 7089 discovery_complete(ctx); 7090 } 7091 7092 static void 7093 complete_discovery_start(struct discovery_ctx *ctx, int status) 7094 { 7095 ctx->timeout_ticks = 0; 7096 ctx->rc = status; 7097 if (ctx->start_cb_fn) { 7098 ctx->start_cb_fn(ctx->cb_ctx, status); 7099 ctx->start_cb_fn = NULL; 7100 ctx->cb_ctx = NULL; 7101 } 7102 } 7103 7104 static void 7105 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 7106 { 7107 struct discovery_entry_ctx *entry_ctx = cb_ctx; 7108 struct discovery_ctx *ctx = entry_ctx->ctx; 7109 7110 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 7111 ctx->attach_in_progress--; 7112 if (ctx->attach_in_progress == 0) { 7113 complete_discovery_start(ctx, ctx->rc); 7114 if (ctx->initializing && ctx->rc != 0) { 7115 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 7116 stop_discovery(ctx, NULL, ctx->cb_ctx); 7117 } else { 7118 discovery_remove_controllers(ctx); 7119 } 7120 } 7121 } 7122 7123 static struct discovery_entry_ctx * 7124 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 7125 { 7126 struct discovery_entry_ctx *new_ctx; 7127 7128 new_ctx = calloc(1, sizeof(*new_ctx)); 7129 if (new_ctx == NULL) { 7130 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7131 return NULL; 7132 } 7133 7134 new_ctx->ctx = ctx; 7135 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 7136 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
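/* Carry over the hostnqn configured for this discovery service so that controllers attached from this entry use the same host identity. */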
7137 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7138 return new_ctx; 7139 } 7140 7141 static void 7142 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 7143 struct spdk_nvmf_discovery_log_page *log_page) 7144 { 7145 struct discovery_ctx *ctx = cb_arg; 7146 struct discovery_entry_ctx *entry_ctx, *tmp; 7147 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7148 uint64_t numrec, i; 7149 bool found; 7150 7151 if (rc || spdk_nvme_cpl_is_error(cpl)) { 7152 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7153 return; 7154 } 7155 7156 ctx->log_page = log_page; 7157 assert(ctx->attach_in_progress == 0); 7158 numrec = from_le64(&log_page->numrec); 7159 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 7160 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7161 free(entry_ctx); 7162 } 7163 for (i = 0; i < numrec; i++) { 7164 found = false; 7165 new_entry = &log_page->entries[i]; 7166 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 7167 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 7168 struct discovery_entry_ctx *new_ctx; 7169 struct spdk_nvme_transport_id trid = {}; 7170 7171 build_trid_from_log_page_entry(&trid, new_entry); 7172 new_ctx = create_discovery_entry_ctx(ctx, &trid); 7173 if (new_ctx == NULL) { 7174 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7175 break; 7176 } 7177 7178 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 7179 continue; 7180 } 7181 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 7182 old_entry = &entry_ctx->entry; 7183 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 7184 found = true; 7185 break; 7186 } 7187 } 7188 if (!found) { 7189 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 7190 struct discovery_ctx *d_ctx; 7191 7192 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7193 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 7194 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 7195 sizeof(new_entry->subnqn))) { 7196 break; 7197 } 7198 } 7199 if (subnqn_ctx) { 7200 break; 7201 } 7202 } 7203 7204 new_ctx = calloc(1, sizeof(*new_ctx)); 7205 if (new_ctx == NULL) { 7206 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7207 break; 7208 } 7209 7210 new_ctx->ctx = ctx; 7211 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 7212 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 7213 if (subnqn_ctx) { 7214 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 7215 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 7216 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7217 new_ctx->name); 7218 } else { 7219 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 7220 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 7221 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7222 new_ctx->name); 7223 } 7224 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 7225 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7226 rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 7227 discovery_attach_controller_done, new_ctx, 7228 &new_ctx->drv_opts, &ctx->bdev_opts); 7229 if (rc == 0) { 7230 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 7231 ctx->attach_in_progress++; 7232 } else { 7233 DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 7234 } 7235 } 7236 } 7237 7238 if (ctx->attach_in_progress == 0) { 7239 discovery_remove_controllers(ctx); 7240 } 7241 } 7242 7243 static void 7244 get_discovery_log_page(struct discovery_ctx *ctx) 7245 { 7246 int rc; 7247 7248 assert(ctx->in_progress == false); 7249 ctx->in_progress = true; 7250 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 7251 if (rc != 0) { 7252 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7253 } 7254 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 7255 } 7256 7257 static void 7258 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 7259 { 7260 struct discovery_ctx *ctx = arg; 7261 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 7262 7263 if (spdk_nvme_cpl_is_error(cpl)) { 7264 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 7265 return; 7266 } 7267 7268 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 7269 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 7270 return; 7271 } 7272 7273 DISCOVERY_INFOLOG(ctx, "got aer\n"); 7274 if (ctx->in_progress) { 7275 ctx->pending = true; 7276 return; 7277 } 7278 7279 get_discovery_log_page(ctx); 7280 } 7281 7282 static void 7283 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 7284 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 7285 { 7286 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 7287 struct discovery_ctx *ctx; 7288 7289 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 7290 7291 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 7292 ctx->probe_ctx = NULL; 7293 ctx->ctrlr = ctrlr; 7294 7295 if (ctx->rc != 0) { 7296 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 7297 ctx->rc); 7298 return; 7299 } 7300 7301 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 7302 } 7303 7304 static int 7305 discovery_poller(void *arg) 7306 { 7307 struct discovery_ctx *ctx = arg; 7308 struct spdk_nvme_transport_id *trid; 7309 int rc; 7310 7311 if (ctx->detach_ctx) { 7312 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7313 if (rc != -EAGAIN) { 7314 ctx->detach_ctx = NULL; 7315 ctx->ctrlr = NULL; 7316 } 7317 } else if (ctx->stop) { 7318 if (ctx->ctrlr != NULL) { 7319 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7320 if (rc == 0) { 7321 return SPDK_POLLER_BUSY; 7322 } 7323 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7324 } 7325 spdk_poller_unregister(&ctx->poller); 7326 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7327 assert(ctx->start_cb_fn == NULL); 7328 if (ctx->stop_cb_fn != NULL) { 7329 ctx->stop_cb_fn(ctx->cb_ctx); 7330 } 7331 free_discovery_ctx(ctx); 7332 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7333 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7334 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7335 assert(ctx->initializing); 7336 spdk_poller_unregister(&ctx->poller); 7337 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7338 complete_discovery_start(ctx, -ETIMEDOUT); 7339 stop_discovery(ctx, NULL, NULL); 7340 free_discovery_ctx(ctx); 7341 return SPDK_POLLER_BUSY; 7342 } 7343 7344 assert(ctx->entry_ctx_in_use == NULL); 7345 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7346 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7347 trid = &ctx->entry_ctx_in_use->trid; 7348 7349 /* All controllers must be configured explicitely either for multipath or failover. 
7350 * While discovery use multipath mode, we need to set this in bdev options as well. 7351 */ 7352 ctx->bdev_opts.multipath = true; 7353 7354 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7355 if (ctx->probe_ctx) { 7356 spdk_poller_unregister(&ctx->poller); 7357 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7358 } else { 7359 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7360 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7361 ctx->entry_ctx_in_use = NULL; 7362 } 7363 } else if (ctx->probe_ctx) { 7364 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7365 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7366 complete_discovery_start(ctx, -ETIMEDOUT); 7367 return SPDK_POLLER_BUSY; 7368 } 7369 7370 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7371 if (rc != -EAGAIN) { 7372 if (ctx->rc != 0) { 7373 assert(ctx->initializing); 7374 stop_discovery(ctx, NULL, ctx->cb_ctx); 7375 } else { 7376 assert(rc == 0); 7377 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7378 ctx->rc = rc; 7379 get_discovery_log_page(ctx); 7380 } 7381 } 7382 } else { 7383 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7384 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7385 complete_discovery_start(ctx, -ETIMEDOUT); 7386 /* We need to wait until all NVM ctrlrs are attached before we stop the 7387 * discovery service to make sure we don't detach a ctrlr that is still 7388 * being attached. 7389 */ 7390 if (ctx->attach_in_progress == 0) { 7391 stop_discovery(ctx, NULL, ctx->cb_ctx); 7392 return SPDK_POLLER_BUSY; 7393 } 7394 } 7395 7396 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7397 if (rc < 0) { 7398 spdk_poller_unregister(&ctx->poller); 7399 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7400 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7401 ctx->entry_ctx_in_use = NULL; 7402 7403 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7404 if (rc != 0) { 7405 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7406 ctx->ctrlr = NULL; 7407 } 7408 } 7409 } 7410 7411 return SPDK_POLLER_BUSY; 7412 } 7413 7414 static void 7415 start_discovery_poller(void *arg) 7416 { 7417 struct discovery_ctx *ctx = arg; 7418 7419 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7420 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7421 } 7422 7423 int 7424 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7425 const char *base_name, 7426 struct spdk_nvme_ctrlr_opts *drv_opts, 7427 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 7428 uint64_t attach_timeout, 7429 bool from_mdns, 7430 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7431 { 7432 struct discovery_ctx *ctx; 7433 struct discovery_entry_ctx *discovery_entry_ctx; 7434 7435 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7436 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7437 if (strcmp(ctx->name, base_name) == 0) { 7438 return -EEXIST; 7439 } 7440 7441 if (ctx->entry_ctx_in_use != NULL) { 7442 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7443 return -EEXIST; 7444 } 7445 } 7446 7447 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7448 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7449 return -EEXIST; 7450 } 7451 } 7452 } 7453 7454 ctx = calloc(1, 
sizeof(*ctx)); 7455 if (ctx == NULL) { 7456 return -ENOMEM; 7457 } 7458 7459 ctx->name = strdup(base_name); 7460 if (ctx->name == NULL) { 7461 free_discovery_ctx(ctx); 7462 return -ENOMEM; 7463 } 7464 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 7465 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7466 ctx->from_mdns_discovery_service = from_mdns; 7467 ctx->bdev_opts.from_discovery_service = true; 7468 ctx->calling_thread = spdk_get_thread(); 7469 ctx->start_cb_fn = cb_fn; 7470 ctx->cb_ctx = cb_ctx; 7471 ctx->initializing = true; 7472 if (ctx->start_cb_fn) { 7473 /* We can use this when dumping json to denote if this RPC parameter 7474 * was specified or not. 7475 */ 7476 ctx->wait_for_attach = true; 7477 } 7478 if (attach_timeout != 0) { 7479 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7480 spdk_get_ticks_hz() / 1000ull; 7481 } 7482 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7483 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7484 memcpy(&ctx->trid, trid, sizeof(*trid)); 7485 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7486 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7487 if (ctx->hostnqn == NULL) { 7488 free_discovery_ctx(ctx); 7489 return -ENOMEM; 7490 } 7491 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7492 if (discovery_entry_ctx == NULL) { 7493 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7494 free_discovery_ctx(ctx); 7495 return -ENOMEM; 7496 } 7497 7498 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7499 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7500 return 0; 7501 } 7502 7503 int 7504 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7505 { 7506 struct discovery_ctx *ctx; 7507 7508 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7509 if (strcmp(name, ctx->name) == 0) { 7510 if (ctx->stop) { 7511 return -EALREADY; 7512 } 7513 /* If we're still starting the discovery service and ->rc is non-zero, we're 7514 * going to stop it as soon as we can 7515 */ 7516 if (ctx->initializing && ctx->rc != 0) { 7517 return -EALREADY; 7518 } 7519 stop_discovery(ctx, cb_fn, cb_ctx); 7520 return 0; 7521 } 7522 } 7523 7524 return -ENOENT; 7525 } 7526 7527 static int 7528 bdev_nvme_library_init(void) 7529 { 7530 g_bdev_nvme_init_thread = spdk_get_thread(); 7531 7532 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7533 bdev_nvme_destroy_poll_group_cb, 7534 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7535 7536 return 0; 7537 } 7538 7539 static void 7540 bdev_nvme_fini_destruct_ctrlrs(void) 7541 { 7542 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7543 struct nvme_ctrlr *nvme_ctrlr; 7544 7545 pthread_mutex_lock(&g_bdev_nvme_mutex); 7546 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7547 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7548 pthread_mutex_lock(&nvme_ctrlr->mutex); 7549 if (nvme_ctrlr->destruct) { 7550 /* This controller's destruction was already started 7551 * before the application started shutting down 7552 */ 7553 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7554 continue; 7555 } 7556 nvme_ctrlr->destruct = true; 7557 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7558 7559 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7560 nvme_ctrlr); 7561 } 7562 } 7563 7564 g_bdev_nvme_module_finish = true; 7565 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7566 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7567 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7568 
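/* No NVMe bdev controllers are left, so the module finish sequence can complete right away. */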
spdk_bdev_module_fini_done(); 7569 return; 7570 } 7571 7572 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7573 } 7574 7575 static void 7576 check_discovery_fini(void *arg) 7577 { 7578 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7579 bdev_nvme_fini_destruct_ctrlrs(); 7580 } 7581 } 7582 7583 static void 7584 bdev_nvme_library_fini(void) 7585 { 7586 struct nvme_probe_skip_entry *entry, *entry_tmp; 7587 struct discovery_ctx *ctx; 7588 7589 spdk_poller_unregister(&g_hotplug_poller); 7590 free(g_hotplug_probe_ctx); 7591 g_hotplug_probe_ctx = NULL; 7592 7593 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7594 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7595 free(entry); 7596 } 7597 7598 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7599 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7600 bdev_nvme_fini_destruct_ctrlrs(); 7601 } else { 7602 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7603 stop_discovery(ctx, check_discovery_fini, NULL); 7604 } 7605 } 7606 } 7607 7608 static void 7609 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7610 { 7611 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7612 struct spdk_bdev *bdev = bdev_io->bdev; 7613 struct spdk_dif_ctx dif_ctx; 7614 struct spdk_dif_error err_blk = {}; 7615 int rc; 7616 struct spdk_dif_ctx_init_ext_opts dif_opts; 7617 7618 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7619 dif_opts.dif_pi_format = bdev->dif_pi_format; 7620 rc = spdk_dif_ctx_init(&dif_ctx, 7621 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7622 bdev->dif_is_head_of_md, bdev->dif_type, 7623 bdev_io->u.bdev.dif_check_flags, 7624 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7625 if (rc != 0) { 7626 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7627 return; 7628 } 7629 7630 if (bdev->md_interleave) { 7631 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7632 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7633 } else { 7634 struct iovec md_iov = { 7635 .iov_base = bdev_io->u.bdev.md_buf, 7636 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7637 }; 7638 7639 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7640 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7641 } 7642 7643 if (rc != 0) { 7644 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7645 err_blk.err_type, err_blk.err_offset); 7646 } else { 7647 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7648 } 7649 } 7650 7651 static void 7652 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7653 { 7654 struct nvme_bdev_io *bio = ref; 7655 7656 if (spdk_nvme_cpl_is_success(cpl)) { 7657 /* Run PI verification for read data buffer. */ 7658 bdev_nvme_verify_pi_error(bio); 7659 } 7660 7661 /* Return original completion status */ 7662 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7663 } 7664 7665 static void 7666 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7667 { 7668 struct nvme_bdev_io *bio = ref; 7669 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7670 int ret; 7671 7672 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7673 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7674 cpl->status.sct, cpl->status.sc); 7675 7676 /* Save completion status to use after verifying PI error. */ 7677 bio->cpl = *cpl; 7678 7679 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7680 /* Read without PI checking to verify PI error. 
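 * bdev_nvme_no_pi_readv_done() re-runs the PI verification on the returned data and completes the I/O with the original status saved in bio->cpl.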
*/ 7681 ret = bdev_nvme_no_pi_readv(bio, 7682 bdev_io->u.bdev.iovs, 7683 bdev_io->u.bdev.iovcnt, 7684 bdev_io->u.bdev.md_buf, 7685 bdev_io->u.bdev.num_blocks, 7686 bdev_io->u.bdev.offset_blocks); 7687 if (ret == 0) { 7688 return; 7689 } 7690 } 7691 } 7692 7693 bdev_nvme_io_complete_nvme_status(bio, cpl); 7694 } 7695 7696 static void 7697 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7698 { 7699 struct nvme_bdev_io *bio = ref; 7700 7701 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7702 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7703 cpl->status.sct, cpl->status.sc); 7704 /* Run PI verification for write data buffer if PI error is detected. */ 7705 bdev_nvme_verify_pi_error(bio); 7706 } 7707 7708 bdev_nvme_io_complete_nvme_status(bio, cpl); 7709 } 7710 7711 static void 7712 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7713 { 7714 struct nvme_bdev_io *bio = ref; 7715 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7716 7717 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7718 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7719 */ 7720 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7721 7722 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7723 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7724 cpl->status.sct, cpl->status.sc); 7725 /* Run PI verification for zone append data buffer if PI error is detected. */ 7726 bdev_nvme_verify_pi_error(bio); 7727 } 7728 7729 bdev_nvme_io_complete_nvme_status(bio, cpl); 7730 } 7731 7732 static void 7733 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7734 { 7735 struct nvme_bdev_io *bio = ref; 7736 7737 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7738 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7739 cpl->status.sct, cpl->status.sc); 7740 /* Run PI verification for compare data buffer if PI error is detected. */ 7741 bdev_nvme_verify_pi_error(bio); 7742 } 7743 7744 bdev_nvme_io_complete_nvme_status(bio, cpl); 7745 } 7746 7747 static void 7748 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7749 { 7750 struct nvme_bdev_io *bio = ref; 7751 7752 /* Compare operation completion */ 7753 if (!bio->first_fused_completed) { 7754 /* Save compare result for write callback */ 7755 bio->cpl = *cpl; 7756 bio->first_fused_completed = true; 7757 return; 7758 } 7759 7760 /* Write operation completion */ 7761 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7762 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7763 * complete the IO with the compare operation's status. 
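 * The paired write of a fused compare-and-write is normally aborted by the controller when
 * the compare fails, so a successful write completion here is unexpected.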
7764 */ 7765 if (!spdk_nvme_cpl_is_error(cpl)) { 7766 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7767 } 7768 7769 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7770 } else { 7771 bdev_nvme_io_complete_nvme_status(bio, cpl); 7772 } 7773 } 7774 7775 static void 7776 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7777 { 7778 struct nvme_bdev_io *bio = ref; 7779 7780 bdev_nvme_io_complete_nvme_status(bio, cpl); 7781 } 7782 7783 static int 7784 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7785 { 7786 switch (desc->zt) { 7787 case SPDK_NVME_ZONE_TYPE_SEQWR: 7788 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7789 break; 7790 default: 7791 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7792 return -EIO; 7793 } 7794 7795 switch (desc->zs) { 7796 case SPDK_NVME_ZONE_STATE_EMPTY: 7797 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7798 break; 7799 case SPDK_NVME_ZONE_STATE_IOPEN: 7800 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7801 break; 7802 case SPDK_NVME_ZONE_STATE_EOPEN: 7803 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7804 break; 7805 case SPDK_NVME_ZONE_STATE_CLOSED: 7806 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 7807 break; 7808 case SPDK_NVME_ZONE_STATE_RONLY: 7809 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 7810 break; 7811 case SPDK_NVME_ZONE_STATE_FULL: 7812 info->state = SPDK_BDEV_ZONE_STATE_FULL; 7813 break; 7814 case SPDK_NVME_ZONE_STATE_OFFLINE: 7815 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 7816 break; 7817 default: 7818 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 7819 return -EIO; 7820 } 7821 7822 info->zone_id = desc->zslba; 7823 info->write_pointer = desc->wp; 7824 info->capacity = desc->zcap; 7825 7826 return 0; 7827 } 7828 7829 static void 7830 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 7831 { 7832 struct nvme_bdev_io *bio = ref; 7833 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7834 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 7835 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 7836 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 7837 uint64_t max_zones_per_buf, i; 7838 uint32_t zone_report_bufsize; 7839 struct spdk_nvme_ns *ns; 7840 struct spdk_nvme_qpair *qpair; 7841 int ret; 7842 7843 if (spdk_nvme_cpl_is_error(cpl)) { 7844 goto out_complete_io_nvme_cpl; 7845 } 7846 7847 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 7848 ret = -ENXIO; 7849 goto out_complete_io_ret; 7850 } 7851 7852 ns = bio->io_path->nvme_ns->ns; 7853 qpair = bio->io_path->qpair->qpair; 7854 7855 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 7856 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 7857 sizeof(bio->zone_report_buf->descs[0]); 7858 7859 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 7860 ret = -EINVAL; 7861 goto out_complete_io_ret; 7862 } 7863 7864 if (!bio->zone_report_buf->nr_zones) { 7865 ret = -EINVAL; 7866 goto out_complete_io_ret; 7867 } 7868 7869 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 7870 ret = fill_zone_from_report(&info[bio->handled_zones], 7871 &bio->zone_report_buf->descs[i]); 7872 if (ret) { 7873 goto out_complete_io_ret; 7874 } 7875 bio->handled_zones++; 7876 } 7877 7878 if (bio->handled_zones < zones_to_copy) { 7879 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 7880 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 7881 
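		/* Not all requested zones have been reported yet; clear the report buffer and fetch the next batch starting at slba. */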
7882 memset(bio->zone_report_buf, 0, zone_report_bufsize); 7883 ret = spdk_nvme_zns_report_zones(ns, qpair, 7884 bio->zone_report_buf, zone_report_bufsize, 7885 slba, SPDK_NVME_ZRA_LIST_ALL, true, 7886 bdev_nvme_get_zone_info_done, bio); 7887 if (!ret) { 7888 return; 7889 } else { 7890 goto out_complete_io_ret; 7891 } 7892 } 7893 7894 out_complete_io_nvme_cpl: 7895 free(bio->zone_report_buf); 7896 bio->zone_report_buf = NULL; 7897 bdev_nvme_io_complete_nvme_status(bio, cpl); 7898 return; 7899 7900 out_complete_io_ret: 7901 free(bio->zone_report_buf); 7902 bio->zone_report_buf = NULL; 7903 bdev_nvme_io_complete(bio, ret); 7904 } 7905 7906 static void 7907 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 7908 { 7909 struct nvme_bdev_io *bio = ref; 7910 7911 bdev_nvme_io_complete_nvme_status(bio, cpl); 7912 } 7913 7914 static void 7915 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 7916 { 7917 struct nvme_bdev_io *bio = ctx; 7918 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7919 const struct spdk_nvme_cpl *cpl = &bio->cpl; 7920 7921 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 7922 7923 __bdev_nvme_io_complete(bdev_io, 0, cpl); 7924 } 7925 7926 static void 7927 bdev_nvme_abort_complete(void *ctx) 7928 { 7929 struct nvme_bdev_io *bio = ctx; 7930 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7931 7932 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 7933 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 7934 } else { 7935 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 7936 } 7937 } 7938 7939 static void 7940 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 7941 { 7942 struct nvme_bdev_io *bio = ref; 7943 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7944 7945 bio->cpl = *cpl; 7946 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 7947 } 7948 7949 static void 7950 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 7951 { 7952 struct nvme_bdev_io *bio = ref; 7953 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7954 7955 bio->cpl = *cpl; 7956 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7957 bdev_nvme_admin_passthru_complete_nvme_status, bio); 7958 } 7959 7960 static void 7961 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 7962 { 7963 struct nvme_bdev_io *bio = ref; 7964 struct iovec *iov; 7965 7966 bio->iov_offset = sgl_offset; 7967 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 7968 iov = &bio->iovs[bio->iovpos]; 7969 if (bio->iov_offset < iov->iov_len) { 7970 break; 7971 } 7972 7973 bio->iov_offset -= iov->iov_len; 7974 } 7975 } 7976 7977 static int 7978 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 7979 { 7980 struct nvme_bdev_io *bio = ref; 7981 struct iovec *iov; 7982 7983 assert(bio->iovpos < bio->iovcnt); 7984 7985 iov = &bio->iovs[bio->iovpos]; 7986 7987 *address = iov->iov_base; 7988 *length = iov->iov_len; 7989 7990 if (bio->iov_offset) { 7991 assert(bio->iov_offset <= iov->iov_len); 7992 *address += bio->iov_offset; 7993 *length -= bio->iov_offset; 7994 } 7995 7996 bio->iov_offset += *length; 7997 if (bio->iov_offset == iov->iov_len) { 7998 bio->iovpos++; 7999 bio->iov_offset = 0; 8000 } 8001 8002 return 0; 8003 } 8004 8005 static void 8006 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 8007 { 8008 struct nvme_bdev_io *bio = ref; 8009 struct iovec *iov; 8010 8011 bio->fused_iov_offset = sgl_offset; 
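	/* Walk the fused iovec array until the requested SGL offset falls inside the current element. */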
8012 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 8013 iov = &bio->fused_iovs[bio->fused_iovpos]; 8014 if (bio->fused_iov_offset < iov->iov_len) { 8015 break; 8016 } 8017 8018 bio->fused_iov_offset -= iov->iov_len; 8019 } 8020 } 8021 8022 static int 8023 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 8024 { 8025 struct nvme_bdev_io *bio = ref; 8026 struct iovec *iov; 8027 8028 assert(bio->fused_iovpos < bio->fused_iovcnt); 8029 8030 iov = &bio->fused_iovs[bio->fused_iovpos]; 8031 8032 *address = iov->iov_base; 8033 *length = iov->iov_len; 8034 8035 if (bio->fused_iov_offset) { 8036 assert(bio->fused_iov_offset <= iov->iov_len); 8037 *address += bio->fused_iov_offset; 8038 *length -= bio->fused_iov_offset; 8039 } 8040 8041 bio->fused_iov_offset += *length; 8042 if (bio->fused_iov_offset == iov->iov_len) { 8043 bio->fused_iovpos++; 8044 bio->fused_iov_offset = 0; 8045 } 8046 8047 return 0; 8048 } 8049 8050 static int 8051 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8052 void *md, uint64_t lba_count, uint64_t lba) 8053 { 8054 int rc; 8055 8056 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 8057 lba_count, lba); 8058 8059 bio->iovs = iov; 8060 bio->iovcnt = iovcnt; 8061 bio->iovpos = 0; 8062 bio->iov_offset = 0; 8063 8064 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 8065 bio->io_path->qpair->qpair, 8066 lba, lba_count, 8067 bdev_nvme_no_pi_readv_done, bio, 0, 8068 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8069 md, 0, 0); 8070 8071 if (rc != 0 && rc != -ENOMEM) { 8072 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 8073 } 8074 return rc; 8075 } 8076 8077 static int 8078 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8079 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8080 struct spdk_memory_domain *domain, void *domain_ctx, 8081 struct spdk_accel_sequence *seq) 8082 { 8083 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8084 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8085 int rc; 8086 8087 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8088 lba_count, lba); 8089 8090 bio->iovs = iov; 8091 bio->iovcnt = iovcnt; 8092 bio->iovpos = 0; 8093 bio->iov_offset = 0; 8094 8095 if (domain != NULL || seq != NULL) { 8096 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8097 bio->ext_opts.memory_domain = domain; 8098 bio->ext_opts.memory_domain_ctx = domain_ctx; 8099 bio->ext_opts.io_flags = flags; 8100 bio->ext_opts.metadata = md; 8101 bio->ext_opts.accel_sequence = seq; 8102 8103 if (iovcnt == 1) { 8104 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 8105 bio, &bio->ext_opts); 8106 } else { 8107 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 8108 bdev_nvme_readv_done, bio, 8109 bdev_nvme_queued_reset_sgl, 8110 bdev_nvme_queued_next_sge, 8111 &bio->ext_opts); 8112 } 8113 } else if (iovcnt == 1) { 8114 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 8115 md, lba, lba_count, bdev_nvme_readv_done, 8116 bio, flags, 0, 0); 8117 } else { 8118 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 8119 bdev_nvme_readv_done, bio, flags, 8120 bdev_nvme_queued_reset_sgl, 8121 bdev_nvme_queued_next_sge, md, 0, 0); 8122 } 8123 8124 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8125 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 8126 } 8127 
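	/* -ENOMEM is not logged as a failure; it is returned so the request can be retried once resources free up. */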
return rc; 8128 } 8129 8130 static int 8131 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8132 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8133 struct spdk_memory_domain *domain, void *domain_ctx, 8134 struct spdk_accel_sequence *seq, 8135 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 8136 { 8137 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8138 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8139 int rc; 8140 8141 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8142 lba_count, lba); 8143 8144 bio->iovs = iov; 8145 bio->iovcnt = iovcnt; 8146 bio->iovpos = 0; 8147 bio->iov_offset = 0; 8148 8149 if (domain != NULL || seq != NULL) { 8150 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8151 bio->ext_opts.memory_domain = domain; 8152 bio->ext_opts.memory_domain_ctx = domain_ctx; 8153 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 8154 bio->ext_opts.cdw13 = cdw13.raw; 8155 bio->ext_opts.metadata = md; 8156 bio->ext_opts.accel_sequence = seq; 8157 8158 if (iovcnt == 1) { 8159 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 8160 bio, &bio->ext_opts); 8161 } else { 8162 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 8163 bdev_nvme_writev_done, bio, 8164 bdev_nvme_queued_reset_sgl, 8165 bdev_nvme_queued_next_sge, 8166 &bio->ext_opts); 8167 } 8168 } else if (iovcnt == 1) { 8169 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 8170 md, lba, lba_count, bdev_nvme_writev_done, 8171 bio, flags, 0, 0); 8172 } else { 8173 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8174 bdev_nvme_writev_done, bio, flags, 8175 bdev_nvme_queued_reset_sgl, 8176 bdev_nvme_queued_next_sge, md, 0, 0); 8177 } 8178 8179 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8180 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 8181 } 8182 return rc; 8183 } 8184 8185 static int 8186 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8187 void *md, uint64_t lba_count, uint64_t zslba, 8188 uint32_t flags) 8189 { 8190 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8191 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8192 int rc; 8193 8194 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 8195 lba_count, zslba); 8196 8197 bio->iovs = iov; 8198 bio->iovcnt = iovcnt; 8199 bio->iovpos = 0; 8200 bio->iov_offset = 0; 8201 8202 if (iovcnt == 1) { 8203 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 8204 lba_count, 8205 bdev_nvme_zone_appendv_done, bio, 8206 flags, 8207 0, 0); 8208 } else { 8209 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 8210 bdev_nvme_zone_appendv_done, bio, flags, 8211 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8212 md, 0, 0); 8213 } 8214 8215 if (rc != 0 && rc != -ENOMEM) { 8216 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 8217 } 8218 return rc; 8219 } 8220 8221 static int 8222 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8223 void *md, uint64_t lba_count, uint64_t lba, 8224 uint32_t flags) 8225 { 8226 int rc; 8227 8228 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8229 lba_count, lba); 8230 8231 bio->iovs = iov; 8232 bio->iovcnt = iovcnt; 8233 bio->iovpos = 0; 8234 bio->iov_offset = 0; 8235 8236 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 8237 bio->io_path->qpair->qpair, 8238 lba, lba_count, 8239 bdev_nvme_comparev_done, bio, flags, 8240 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8241 md, 0, 0); 8242 8243 if (rc != 0 && rc != -ENOMEM) { 8244 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 8245 } 8246 return rc; 8247 } 8248 8249 static int 8250 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 8251 struct iovec *write_iov, int write_iovcnt, 8252 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 8253 { 8254 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8255 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8256 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8257 int rc; 8258 8259 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8260 lba_count, lba); 8261 8262 bio->iovs = cmp_iov; 8263 bio->iovcnt = cmp_iovcnt; 8264 bio->iovpos = 0; 8265 bio->iov_offset = 0; 8266 bio->fused_iovs = write_iov; 8267 bio->fused_iovcnt = write_iovcnt; 8268 bio->fused_iovpos = 0; 8269 bio->fused_iov_offset = 0; 8270 8271 if (bdev_io->num_retries == 0) { 8272 bio->first_fused_submitted = false; 8273 bio->first_fused_completed = false; 8274 } 8275 8276 if (!bio->first_fused_submitted) { 8277 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8278 memset(&bio->cpl, 0, sizeof(bio->cpl)); 8279 8280 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 8281 bdev_nvme_comparev_and_writev_done, bio, flags, 8282 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 8283 if (rc == 0) { 8284 bio->first_fused_submitted = true; 8285 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8286 } else { 8287 if (rc != -ENOMEM) { 8288 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 8289 } 8290 return rc; 8291 } 8292 } 8293 8294 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 8295 8296 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8297 bdev_nvme_comparev_and_writev_done, bio, flags, 8298 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 8299 if (rc != 0 && rc != -ENOMEM) { 8300 SPDK_ERRLOG("write failed: rc = %d\n", rc); 8301 rc = 0; 8302 } 8303 8304 return rc; 8305 } 8306 8307 static int 8308 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8309 { 8310 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 8311 struct spdk_nvme_dsm_range *range; 8312 uint64_t offset, remaining; 8313 uint64_t num_ranges_u64; 8314 uint16_t num_ranges; 8315 int rc; 8316 8317 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8318 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8319 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8320 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8321 return -EINVAL; 8322 } 8323 num_ranges = (uint16_t)num_ranges_u64; 8324 8325 offset = offset_blocks; 8326 remaining = num_blocks; 8327 range = &dsm_ranges[0]; 8328 8329 /* Fill max-size ranges until the remaining blocks fit into one range */ 8330 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8331 range->attributes.raw = 0; 8332 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8333 range->starting_lba = offset; 8334 8335 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8336 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8337 range++; 8338 } 8339 8340 /* Final range describes the remaining 
blocks */ 8341 range->attributes.raw = 0; 8342 range->length = remaining; 8343 range->starting_lba = offset; 8344 8345 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8346 bio->io_path->qpair->qpair, 8347 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8348 dsm_ranges, num_ranges, 8349 bdev_nvme_queued_done, bio); 8350 8351 return rc; 8352 } 8353 8354 static int 8355 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8356 { 8357 if (num_blocks > UINT16_MAX + 1) { 8358 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8359 return -EINVAL; 8360 } 8361 8362 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8363 bio->io_path->qpair->qpair, 8364 offset_blocks, num_blocks, 8365 bdev_nvme_queued_done, bio, 8366 0); 8367 } 8368 8369 static int 8370 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8371 struct spdk_bdev_zone_info *info) 8372 { 8373 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8374 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8375 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8376 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8377 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8378 8379 if (zone_id % zone_size != 0) { 8380 return -EINVAL; 8381 } 8382 8383 if (num_zones > total_zones || !num_zones) { 8384 return -EINVAL; 8385 } 8386 8387 assert(!bio->zone_report_buf); 8388 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8389 if (!bio->zone_report_buf) { 8390 return -ENOMEM; 8391 } 8392 8393 bio->handled_zones = 0; 8394 8395 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8396 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8397 bdev_nvme_get_zone_info_done, bio); 8398 } 8399 8400 static int 8401 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8402 enum spdk_bdev_zone_action action) 8403 { 8404 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8405 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8406 8407 switch (action) { 8408 case SPDK_BDEV_ZONE_CLOSE: 8409 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8410 bdev_nvme_zone_management_done, bio); 8411 case SPDK_BDEV_ZONE_FINISH: 8412 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8413 bdev_nvme_zone_management_done, bio); 8414 case SPDK_BDEV_ZONE_OPEN: 8415 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8416 bdev_nvme_zone_management_done, bio); 8417 case SPDK_BDEV_ZONE_RESET: 8418 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8419 bdev_nvme_zone_management_done, bio); 8420 case SPDK_BDEV_ZONE_OFFLINE: 8421 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8422 bdev_nvme_zone_management_done, bio); 8423 default: 8424 return -EINVAL; 8425 } 8426 } 8427 8428 static void 8429 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8430 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8431 { 8432 struct nvme_io_path *io_path; 8433 struct nvme_ctrlr *nvme_ctrlr; 8434 uint32_t max_xfer_size; 8435 int rc = -ENXIO; 8436 8437 /* Choose the first ctrlr which is not failed. */ 8438 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8439 nvme_ctrlr = io_path->qpair->ctrlr; 8440 8441 /* We should skip any unavailable nvme_ctrlr rather than checking 8442 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
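	 * Skipping lets the loop try the next controller path instead of failing the admin command outright.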
8443 */ 8444 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8445 continue; 8446 } 8447 8448 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8449 8450 if (nbytes > max_xfer_size) { 8451 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8452 rc = -EINVAL; 8453 goto err; 8454 } 8455 8456 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8457 bdev_nvme_admin_passthru_done, bio); 8458 if (rc == 0) { 8459 return; 8460 } 8461 } 8462 8463 err: 8464 bdev_nvme_admin_complete(bio, rc); 8465 } 8466 8467 static int 8468 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8469 void *buf, size_t nbytes) 8470 { 8471 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8472 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8473 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8474 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8475 8476 if (nbytes > max_xfer_size) { 8477 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8478 return -EINVAL; 8479 } 8480 8481 /* 8482 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8483 * so fill it out automatically. 8484 */ 8485 cmd->nsid = spdk_nvme_ns_get_id(ns); 8486 8487 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8488 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8489 } 8490 8491 static int 8492 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8493 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8494 { 8495 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8496 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8497 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8498 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8499 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8500 8501 if (nbytes > max_xfer_size) { 8502 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8503 return -EINVAL; 8504 } 8505 8506 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8507 SPDK_ERRLOG("invalid meta data buffer size\n"); 8508 return -EINVAL; 8509 } 8510 8511 /* 8512 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8513 * so fill it out automatically. 
8514 */ 8515 cmd->nsid = spdk_nvme_ns_get_id(ns); 8516 8517 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8518 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8519 } 8520 8521 static int 8522 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8523 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8524 size_t nbytes, void *md_buf, size_t md_len) 8525 { 8526 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8527 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8528 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8529 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8530 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8531 8532 bio->iovs = iov; 8533 bio->iovcnt = iovcnt; 8534 bio->iovpos = 0; 8535 bio->iov_offset = 0; 8536 8537 if (nbytes > max_xfer_size) { 8538 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8539 return -EINVAL; 8540 } 8541 8542 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8543 SPDK_ERRLOG("invalid meta data buffer size\n"); 8544 return -EINVAL; 8545 } 8546 8547 /* 8548 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8549 * require a nsid, so fill it out automatically. 8550 */ 8551 cmd->nsid = spdk_nvme_ns_get_id(ns); 8552 8553 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8554 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8555 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8556 } 8557 8558 static void 8559 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8560 struct nvme_bdev_io *bio_to_abort) 8561 { 8562 struct nvme_io_path *io_path; 8563 int rc = 0; 8564 8565 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8566 if (rc == 0) { 8567 bdev_nvme_admin_complete(bio, 0); 8568 return; 8569 } 8570 8571 io_path = bio_to_abort->io_path; 8572 if (io_path != NULL) { 8573 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8574 io_path->qpair->qpair, 8575 bio_to_abort, 8576 bdev_nvme_abort_done, bio); 8577 } else { 8578 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8579 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8580 NULL, 8581 bio_to_abort, 8582 bdev_nvme_abort_done, bio); 8583 8584 if (rc != -ENOENT) { 8585 break; 8586 } 8587 } 8588 } 8589 8590 if (rc != 0) { 8591 /* If no command was found or there was any error, complete the abort 8592 * request with failure. 
8593 */ 8594 bdev_nvme_admin_complete(bio, rc); 8595 } 8596 } 8597 8598 static int 8599 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8600 uint64_t num_blocks) 8601 { 8602 struct spdk_nvme_scc_source_range range = { 8603 .slba = src_offset_blocks, 8604 .nlb = num_blocks - 1 8605 }; 8606 8607 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8608 bio->io_path->qpair->qpair, 8609 &range, 1, dst_offset_blocks, 8610 bdev_nvme_queued_done, bio); 8611 } 8612 8613 static void 8614 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8615 { 8616 const char *action; 8617 uint32_t i; 8618 8619 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8620 action = "reset"; 8621 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8622 action = "abort"; 8623 } else { 8624 action = "none"; 8625 } 8626 8627 spdk_json_write_object_begin(w); 8628 8629 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8630 8631 spdk_json_write_named_object_begin(w, "params"); 8632 spdk_json_write_named_string(w, "action_on_timeout", action); 8633 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8634 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8635 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8636 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8637 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8638 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8639 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8640 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8641 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8642 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8643 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8644 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8645 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8646 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8647 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8648 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8649 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8650 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8651 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8652 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8653 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8654 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8655 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8656 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8657 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8658 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8659 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8660 for (i = 0; i < 32; ++i) { 8661 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8662 
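			/* Digest i is enabled in the dhchap_digests mask; emit its name. */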
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8663 } 8664 } 8665 spdk_json_write_array_end(w); 8666 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8667 for (i = 0; i < 32; ++i) { 8668 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8669 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8670 } 8671 } 8672 8673 spdk_json_write_array_end(w); 8674 spdk_json_write_object_end(w); 8675 8676 spdk_json_write_object_end(w); 8677 } 8678 8679 static void 8680 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8681 { 8682 struct spdk_nvme_transport_id trid; 8683 8684 spdk_json_write_object_begin(w); 8685 8686 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8687 8688 spdk_json_write_named_object_begin(w, "params"); 8689 spdk_json_write_named_string(w, "name", ctx->name); 8690 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8691 8692 trid = ctx->trid; 8693 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8694 nvme_bdev_dump_trid_json(&trid, w); 8695 8696 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8697 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8698 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8699 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8700 ctx->bdev_opts.fast_io_fail_timeout_sec); 8701 spdk_json_write_object_end(w); 8702 8703 spdk_json_write_object_end(w); 8704 } 8705 8706 #ifdef SPDK_CONFIG_NVME_CUSE 8707 static void 8708 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8709 struct nvme_ctrlr *nvme_ctrlr) 8710 { 8711 size_t cuse_name_size = 128; 8712 char cuse_name[cuse_name_size]; 8713 8714 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8715 cuse_name, &cuse_name_size) != 0) { 8716 return; 8717 } 8718 8719 spdk_json_write_object_begin(w); 8720 8721 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8722 8723 spdk_json_write_named_object_begin(w, "params"); 8724 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8725 spdk_json_write_object_end(w); 8726 8727 spdk_json_write_object_end(w); 8728 } 8729 #endif 8730 8731 static void 8732 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8733 struct nvme_ctrlr *nvme_ctrlr, 8734 struct nvme_path_id *path_id) 8735 { 8736 struct spdk_nvme_transport_id *trid; 8737 const struct spdk_nvme_ctrlr_opts *opts; 8738 8739 if (nvme_ctrlr->opts.from_discovery_service) { 8740 /* Do not emit an RPC for this - it will be implicitly 8741 * covered by a separate bdev_nvme_start_discovery or 8742 * bdev_nvme_start_mdns_discovery RPC. 
8743 */ 8744 return; 8745 } 8746 8747 trid = &path_id->trid; 8748 8749 spdk_json_write_object_begin(w); 8750 8751 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8752 8753 spdk_json_write_named_object_begin(w, "params"); 8754 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8755 nvme_bdev_dump_trid_json(trid, w); 8756 spdk_json_write_named_bool(w, "prchk_reftag", 8757 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8758 spdk_json_write_named_bool(w, "prchk_guard", 8759 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8760 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8761 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8762 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8763 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8764 if (nvme_ctrlr->psk != NULL) { 8765 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8766 } 8767 if (nvme_ctrlr->dhchap_key != NULL) { 8768 spdk_json_write_named_string(w, "dhchap_key", 8769 spdk_key_get_name(nvme_ctrlr->dhchap_key)); 8770 } 8771 if (nvme_ctrlr->dhchap_ctrlr_key != NULL) { 8772 spdk_json_write_named_string(w, "dhchap_ctrlr_key", 8773 spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key)); 8774 } 8775 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8776 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8777 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8778 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8779 if (opts->src_addr[0] != '\0') { 8780 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8781 } 8782 if (opts->src_svcid[0] != '\0') { 8783 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8784 } 8785 8786 if (nvme_ctrlr->opts.multipath) { 8787 spdk_json_write_named_string(w, "multipath", "multipath"); 8788 } 8789 spdk_json_write_object_end(w); 8790 8791 spdk_json_write_object_end(w); 8792 } 8793 8794 static void 8795 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8796 { 8797 spdk_json_write_object_begin(w); 8798 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8799 8800 spdk_json_write_named_object_begin(w, "params"); 8801 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8802 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8803 spdk_json_write_object_end(w); 8804 8805 spdk_json_write_object_end(w); 8806 } 8807 8808 static int 8809 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 8810 { 8811 struct nvme_bdev_ctrlr *nbdev_ctrlr; 8812 struct nvme_ctrlr *nvme_ctrlr; 8813 struct discovery_ctx *ctx; 8814 struct nvme_path_id *path_id; 8815 8816 bdev_nvme_opts_config_json(w); 8817 8818 pthread_mutex_lock(&g_bdev_nvme_mutex); 8819 8820 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 8821 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 8822 path_id = nvme_ctrlr->active_path_id; 8823 assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 8824 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 8825 8826 path_id = TAILQ_NEXT(path_id, link); 8827 while (path_id != NULL) { 8828 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 8829 path_id = TAILQ_NEXT(path_id, link); 8830 } 8831 8832 #ifdef SPDK_CONFIG_NVME_CUSE 8833 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 8834 #endif 8835 } 8836 } 8837 8838 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 8839 if (!ctx->from_mdns_discovery_service) { 
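			/* Discovery contexts created via mDNS are dumped separately by bdev_nvme_mdns_discovery_config_json() below. */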
8840 bdev_nvme_discovery_config_json(w, ctx); 8841 } 8842 } 8843 8844 bdev_nvme_mdns_discovery_config_json(w); 8845 8846 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 8847 * before enabling hotplug poller. 8848 */ 8849 bdev_nvme_hotplug_config_json(w); 8850 8851 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8852 return 0; 8853 } 8854 8855 struct spdk_nvme_ctrlr * 8856 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 8857 { 8858 struct nvme_bdev *nbdev; 8859 struct nvme_ns *nvme_ns; 8860 8861 if (!bdev || bdev->module != &nvme_if) { 8862 return NULL; 8863 } 8864 8865 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 8866 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 8867 assert(nvme_ns != NULL); 8868 8869 return nvme_ns->ctrlr->ctrlr; 8870 } 8871 8872 static bool 8873 nvme_io_path_is_current(struct nvme_io_path *io_path) 8874 { 8875 const struct nvme_bdev_channel *nbdev_ch; 8876 bool current; 8877 8878 if (!nvme_io_path_is_available(io_path)) { 8879 return false; 8880 } 8881 8882 nbdev_ch = io_path->nbdev_ch; 8883 if (nbdev_ch == NULL) { 8884 current = false; 8885 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 8886 struct nvme_io_path *optimized_io_path = NULL; 8887 8888 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 8889 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 8890 break; 8891 } 8892 } 8893 8894 /* A non-optimized path is only current if there are no optimized paths. */ 8895 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 8896 (optimized_io_path == NULL); 8897 } else { 8898 if (nbdev_ch->current_io_path) { 8899 current = (io_path == nbdev_ch->current_io_path); 8900 } else { 8901 struct nvme_io_path *first_path; 8902 8903 /* We arrived here as there are no optimized paths for active-passive 8904 * mode. Check if this io_path is the first one available on the list. 8905 */ 8906 current = false; 8907 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 8908 if (nvme_io_path_is_available(first_path)) { 8909 current = (io_path == first_path); 8910 break; 8911 } 8912 } 8913 } 8914 } 8915 8916 return current; 8917 } 8918 8919 static struct nvme_ctrlr * 8920 bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev) 8921 { 8922 struct nvme_ctrlr *next; 8923 8924 /* Must be called under g_bdev_nvme_mutex */ 8925 next = prev != NULL ? 
TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 8926 while (next != NULL) { 8927 /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */ 8928 pthread_mutex_lock(&next->mutex); 8929 if (next->ref > 0) { 8930 next->ref++; 8931 pthread_mutex_unlock(&next->mutex); 8932 return next; 8933 } 8934 8935 pthread_mutex_unlock(&next->mutex); 8936 next = TAILQ_NEXT(next, tailq); 8937 } 8938 8939 return NULL; 8940 } 8941 8942 struct bdev_nvme_set_keys_ctx { 8943 struct nvme_ctrlr *nctrlr; 8944 struct spdk_key *dhchap_key; 8945 struct spdk_key *dhchap_ctrlr_key; 8946 struct spdk_thread *thread; 8947 bdev_nvme_set_keys_cb cb_fn; 8948 void *cb_ctx; 8949 int status; 8950 }; 8951 8952 static void 8953 bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx) 8954 { 8955 if (ctx == NULL) { 8956 return; 8957 } 8958 8959 spdk_keyring_put_key(ctx->dhchap_key); 8960 spdk_keyring_put_key(ctx->dhchap_ctrlr_key); 8961 free(ctx); 8962 } 8963 8964 static void 8965 _bdev_nvme_set_keys_done(void *_ctx) 8966 { 8967 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 8968 8969 ctx->cb_fn(ctx->cb_ctx, ctx->status); 8970 8971 if (ctx->nctrlr != NULL) { 8972 nvme_ctrlr_release(ctx->nctrlr); 8973 } 8974 bdev_nvme_free_set_keys_ctx(ctx); 8975 } 8976 8977 static void 8978 bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status) 8979 { 8980 ctx->status = status; 8981 spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx); 8982 } 8983 8984 static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx); 8985 8986 static void 8987 bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx) 8988 { 8989 struct nvme_ctrlr *next; 8990 8991 pthread_mutex_lock(&g_bdev_nvme_mutex); 8992 next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr); 8993 pthread_mutex_unlock(&g_bdev_nvme_mutex); 8994 8995 nvme_ctrlr_release(ctx->nctrlr); 8996 ctx->nctrlr = next; 8997 8998 if (next == NULL) { 8999 bdev_nvme_set_keys_done(ctx, 0); 9000 } else { 9001 bdev_nvme_authenticate_ctrlr(ctx); 9002 } 9003 } 9004 9005 static void 9006 bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status) 9007 { 9008 struct bdev_nvme_set_keys_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 9009 9010 if (status != 0) { 9011 bdev_nvme_set_keys_done(ctx, status); 9012 return; 9013 } 9014 bdev_nvme_authenticate_ctrlr_continue(ctx); 9015 } 9016 9017 static void 9018 bdev_nvme_authenticate_qpair_done(void *ctx, int status) 9019 { 9020 spdk_for_each_channel_continue(ctx, status); 9021 } 9022 9023 static void 9024 bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i) 9025 { 9026 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9027 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 9028 struct nvme_qpair *qpair = ctrlr_ch->qpair; 9029 int rc; 9030 9031 if (!nvme_qpair_is_connected(qpair)) { 9032 spdk_for_each_channel_continue(i, 0); 9033 return; 9034 } 9035 9036 rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i); 9037 if (rc != 0) { 9038 spdk_for_each_channel_continue(i, rc); 9039 } 9040 } 9041 9042 static void 9043 bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status) 9044 { 9045 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9046 9047 if (status != 0) { 9048 bdev_nvme_set_keys_done(ctx, status); 9049 return; 9050 } 9051 9052 spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx, 9053 bdev_nvme_authenticate_qpairs_done); 9054 } 9055 9056 static void 9057 bdev_nvme_authenticate_ctrlr(struct 
bdev_nvme_set_keys_ctx *ctx) 9058 { 9059 struct spdk_nvme_ctrlr_key_opts opts = {}; 9060 struct nvme_ctrlr *nctrlr = ctx->nctrlr; 9061 int rc; 9062 9063 opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key); 9064 opts.dhchap_key = ctx->dhchap_key; 9065 opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key; 9066 rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts); 9067 if (rc != 0) { 9068 bdev_nvme_set_keys_done(ctx, rc); 9069 return; 9070 } 9071 9072 if (ctx->dhchap_key != NULL) { 9073 rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr, 9074 bdev_nvme_authenticate_ctrlr_done, ctx); 9075 if (rc != 0) { 9076 bdev_nvme_set_keys_done(ctx, rc); 9077 } 9078 } else { 9079 bdev_nvme_authenticate_ctrlr_continue(ctx); 9080 } 9081 } 9082 9083 int 9084 bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key, 9085 bdev_nvme_set_keys_cb cb_fn, void *cb_ctx) 9086 { 9087 struct bdev_nvme_set_keys_ctx *ctx; 9088 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9089 struct nvme_ctrlr *nctrlr; 9090 9091 ctx = calloc(1, sizeof(*ctx)); 9092 if (ctx == NULL) { 9093 return -ENOMEM; 9094 } 9095 9096 if (dhchap_key != NULL) { 9097 ctx->dhchap_key = spdk_keyring_get_key(dhchap_key); 9098 if (ctx->dhchap_key == NULL) { 9099 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name); 9100 bdev_nvme_free_set_keys_ctx(ctx); 9101 return -ENOKEY; 9102 } 9103 } 9104 if (dhchap_ctrlr_key != NULL) { 9105 ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key); 9106 if (ctx->dhchap_ctrlr_key == NULL) { 9107 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name); 9108 bdev_nvme_free_set_keys_ctx(ctx); 9109 return -ENOKEY; 9110 } 9111 } 9112 9113 pthread_mutex_lock(&g_bdev_nvme_mutex); 9114 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 9115 if (nbdev_ctrlr == NULL) { 9116 SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name); 9117 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9118 bdev_nvme_free_set_keys_ctx(ctx); 9119 return -ENODEV; 9120 } 9121 nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL); 9122 if (nctrlr == NULL) { 9123 SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name); 9124 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9125 bdev_nvme_free_set_keys_ctx(ctx); 9126 return -ENODEV; 9127 } 9128 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9129 9130 ctx->nctrlr = nctrlr; 9131 ctx->cb_fn = cb_fn; 9132 ctx->cb_ctx = cb_ctx; 9133 ctx->thread = spdk_get_thread(); 9134 9135 bdev_nvme_authenticate_ctrlr(ctx); 9136 9137 return 0; 9138 } 9139 9140 void 9141 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 9142 { 9143 struct nvme_ns *nvme_ns = io_path->nvme_ns; 9144 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 9145 const struct spdk_nvme_ctrlr_data *cdata; 9146 const struct spdk_nvme_transport_id *trid; 9147 const char *adrfam_str; 9148 9149 spdk_json_write_object_begin(w); 9150 9151 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 9152 9153 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 9154 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 9155 9156 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 9157 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 9158 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 9159 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 9160 9161 spdk_json_write_named_object_begin(w, "transport"); 9162 spdk_json_write_named_string(w, "trtype", trid->trstring); 9163 
spdk_json_write_named_string(w, "traddr", trid->traddr); 9164 if (trid->trsvcid[0] != '\0') { 9165 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 9166 } 9167 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 9168 if (adrfam_str) { 9169 spdk_json_write_named_string(w, "adrfam", adrfam_str); 9170 } 9171 spdk_json_write_object_end(w); 9172 9173 spdk_json_write_object_end(w); 9174 } 9175 9176 void 9177 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 9178 { 9179 struct discovery_ctx *ctx; 9180 struct discovery_entry_ctx *entry_ctx; 9181 9182 spdk_json_write_array_begin(w); 9183 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9184 spdk_json_write_object_begin(w); 9185 spdk_json_write_named_string(w, "name", ctx->name); 9186 9187 spdk_json_write_named_object_begin(w, "trid"); 9188 nvme_bdev_dump_trid_json(&ctx->trid, w); 9189 spdk_json_write_object_end(w); 9190 9191 spdk_json_write_named_array_begin(w, "referrals"); 9192 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 9193 spdk_json_write_object_begin(w); 9194 spdk_json_write_named_object_begin(w, "trid"); 9195 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 9196 spdk_json_write_object_end(w); 9197 spdk_json_write_object_end(w); 9198 } 9199 spdk_json_write_array_end(w); 9200 9201 spdk_json_write_object_end(w); 9202 } 9203 spdk_json_write_array_end(w); 9204 } 9205 9206 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 9207 9208 static void 9209 bdev_nvme_trace(void) 9210 { 9211 struct spdk_trace_tpoint_opts opts[] = { 9212 { 9213 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 9214 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 9215 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9216 }, 9217 { 9218 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 9219 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 9220 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9221 } 9222 }; 9223 9224 9225 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 9226 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9227 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9228 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9229 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9230 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9231 } 9232 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 9233
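/*
 * Illustrative sketch only (not part of the build): the per-path object written by
 * nvme_io_path_info_json() above has roughly the following shape. The key names come
 * from that function; all values below are made up for illustration.
 *
 *   {
 *     "bdev_name": "Nvme0n1",
 *     "cntlid": 1,
 *     "current": true,
 *     "connected": true,
 *     "accessible": true,
 *     "transport": {
 *       "trtype": "TCP",
 *       "traddr": "192.168.0.10",
 *       "trsvcid": "4420",
 *       "adrfam": "IPv4"
 *     }
 *   }
 */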