1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define CTRLR_STRING(nvme_ctrlr) \ 36 (spdk_nvme_trtype_is_fabrics(nvme_ctrlr->active_path_id->trid.trtype) ? \ 37 nvme_ctrlr->active_path_id->trid.subnqn : nvme_ctrlr->active_path_id->trid.traddr) 38 39 #define CTRLR_ID(nvme_ctrlr) (spdk_nvme_ctrlr_get_id(nvme_ctrlr->ctrlr)) 40 41 #define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \ 42 SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 43 44 #define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \ 45 SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 46 47 #define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \ 48 SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 49 50 #define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \ 51 SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 52 53 #ifdef DEBUG 54 #define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \ 55 SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 56 #else 57 #define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0) 58 #endif 59 60 #define BDEV_STRING(nbdev) (nbdev->disk.name) 61 62 #define NVME_BDEV_ERRLOG(nbdev, format, ...) \ 63 SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 64 65 #define NVME_BDEV_WARNLOG(nbdev, format, ...) \ 66 SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 67 68 #define NVME_BDEV_NOTICELOG(nbdev, format, ...) \ 69 SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 70 71 #define NVME_BDEV_INFOLOG(nbdev, format, ...) \ 72 SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 73 74 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 75 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 76 77 #define NSID_STR_LEN 10 78 79 #define SPDK_CONTROLLER_NAME_MAX 512 80 81 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 82 83 struct nvme_bdev_io { 84 /** array of iovecs to transfer. */ 85 struct iovec *iovs; 86 87 /** Number of iovecs in iovs array. */ 88 int iovcnt; 89 90 /** Current iovec position. */ 91 int iovpos; 92 93 /** Offset in current iovec. */ 94 uint32_t iov_offset; 95 96 /** Offset in current fused iovec. */ 97 uint32_t fused_iov_offset; 98 99 /** array of iovecs to transfer for the fused command. */ 100 struct iovec *fused_iovs; 101 102 /** Number of iovecs in fused_iovs array. */ 103 int fused_iovcnt; 104 105 /** Current fused iovec position. 
*/ 106 int fused_iovpos; 107 108 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 109 * being reset in a reset I/O. 110 */ 111 struct nvme_io_path *io_path; 112 113 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 114 struct spdk_nvme_cpl cpl; 115 116 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 117 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 118 119 /** Keeps track if first of fused commands was submitted */ 120 bool first_fused_submitted; 121 122 /** Keeps track if first of fused commands was completed */ 123 bool first_fused_completed; 124 125 /* How many times the current I/O was retried. */ 126 int32_t retry_count; 127 128 /** Expiration value in ticks to retry the current I/O. */ 129 uint64_t retry_ticks; 130 131 /** Temporary pointer to zone report buffer */ 132 struct spdk_nvme_zns_zone_report *zone_report_buf; 133 134 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 135 uint64_t handled_zones; 136 137 /* Current tsc at submit time. */ 138 uint64_t submit_tsc; 139 140 /* Used to put nvme_bdev_io into the list */ 141 TAILQ_ENTRY(nvme_bdev_io) retry_link; 142 }; 143 144 struct nvme_probe_skip_entry { 145 struct spdk_nvme_transport_id trid; 146 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 147 }; 148 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 149 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 150 g_skipped_nvme_ctrlrs); 151 152 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 153 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 154 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 155 156 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 157 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 158 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 159 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 160 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 161 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 162 163 static struct spdk_bdev_nvme_opts g_opts = { 164 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 165 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 166 .timeout_us = 0, 167 .timeout_admin_us = 0, 168 .transport_retry_count = 4, 169 .arbitration_burst = 0, 170 .low_priority_weight = 0, 171 .medium_priority_weight = 0, 172 .high_priority_weight = 0, 173 .io_queue_requests = 0, 174 .nvme_adminq_poll_period_us = 10000ULL, 175 .nvme_ioq_poll_period_us = 0, 176 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 177 .bdev_retry_count = 3, 178 .ctrlr_loss_timeout_sec = 0, 179 .reconnect_delay_sec = 0, 180 .fast_io_fail_timeout_sec = 0, 181 .transport_ack_timeout = 0, 182 .disable_auto_failback = false, 183 .generate_uuids = false, 184 .transport_tos = 0, 185 .nvme_error_stat = false, 186 .io_path_stat = false, 187 .allow_accel_sequence = false, 188 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 189 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 190 .rdma_umr_per_io = false, 191 }; 192 193 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 194 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 195 196 static int g_hot_insert_nvme_controller_index = 0; 197 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 198 static bool g_nvme_hotplug_enabled = false; 199 struct spdk_thread *g_bdev_nvme_init_thread; 200 static struct spdk_poller 
*g_hotplug_poller; 201 static struct spdk_poller *g_hotplug_probe_poller; 202 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 203 204 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 205 struct nvme_async_probe_ctx *ctx); 206 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 207 struct nvme_async_probe_ctx *ctx); 208 static int bdev_nvme_library_init(void); 209 static void bdev_nvme_library_fini(void); 210 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 211 struct spdk_bdev_io *bdev_io); 212 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 213 struct spdk_bdev_io *bdev_io); 214 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 215 void *md, uint64_t lba_count, uint64_t lba, 216 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 217 struct spdk_accel_sequence *seq); 218 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 219 void *md, uint64_t lba_count, uint64_t lba); 220 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 221 void *md, uint64_t lba_count, uint64_t lba, 222 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 223 struct spdk_accel_sequence *seq, 224 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13); 225 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 226 void *md, uint64_t lba_count, 227 uint64_t zslba, uint32_t flags); 228 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 229 void *md, uint64_t lba_count, uint64_t lba, 230 uint32_t flags); 231 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 232 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 233 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 234 uint32_t flags); 235 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 236 uint32_t num_zones, struct spdk_bdev_zone_info *info); 237 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 238 enum spdk_bdev_zone_action action); 239 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 240 struct nvme_bdev_io *bio, 241 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 242 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 243 void *buf, size_t nbytes); 244 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 245 void *buf, size_t nbytes, void *md_buf, size_t md_len); 246 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 247 struct iovec *iov, int iovcnt, size_t nbytes, 248 void *md_buf, size_t md_len); 249 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 250 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 251 static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio); 252 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 253 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 254 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 255 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 256 257 static struct nvme_ns *nvme_ns_alloc(void); 258 static void nvme_ns_free(struct nvme_ns *ns); 259 260 static int 261 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 262 { 263 return ns1->id < ns2->id 
? -1 : ns1->id > ns2->id; 264 } 265 266 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 267 268 struct spdk_nvme_qpair * 269 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 270 { 271 struct nvme_ctrlr_channel *ctrlr_ch; 272 273 assert(ctrlr_io_ch != NULL); 274 275 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 276 277 return ctrlr_ch->qpair->qpair; 278 } 279 280 static int 281 bdev_nvme_get_ctx_size(void) 282 { 283 return sizeof(struct nvme_bdev_io); 284 } 285 286 static struct spdk_bdev_module nvme_if = { 287 .name = "nvme", 288 .async_fini = true, 289 .module_init = bdev_nvme_library_init, 290 .module_fini = bdev_nvme_library_fini, 291 .config_json = bdev_nvme_config_json, 292 .get_ctx_size = bdev_nvme_get_ctx_size, 293 294 }; 295 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 296 297 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 298 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 299 bool g_bdev_nvme_module_finish; 300 301 struct nvme_bdev_ctrlr * 302 nvme_bdev_ctrlr_get_by_name(const char *name) 303 { 304 struct nvme_bdev_ctrlr *nbdev_ctrlr; 305 306 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 307 if (strcmp(name, nbdev_ctrlr->name) == 0) { 308 break; 309 } 310 } 311 312 return nbdev_ctrlr; 313 } 314 315 static struct nvme_ctrlr * 316 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 317 const struct spdk_nvme_transport_id *trid, const char *hostnqn) 318 { 319 const struct spdk_nvme_ctrlr_opts *opts; 320 struct nvme_ctrlr *nvme_ctrlr; 321 322 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 323 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 324 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 && 325 strcmp(hostnqn, opts->hostnqn) == 0) { 326 break; 327 } 328 } 329 330 return nvme_ctrlr; 331 } 332 333 struct nvme_ctrlr * 334 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 335 uint16_t cntlid) 336 { 337 struct nvme_ctrlr *nvme_ctrlr; 338 const struct spdk_nvme_ctrlr_data *cdata; 339 340 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 341 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 342 if (cdata->cntlid == cntlid) { 343 break; 344 } 345 } 346 347 return nvme_ctrlr; 348 } 349 350 static struct nvme_bdev * 351 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 352 { 353 struct nvme_bdev *nbdev; 354 355 pthread_mutex_lock(&g_bdev_nvme_mutex); 356 TAILQ_FOREACH(nbdev, &nbdev_ctrlr->bdevs, tailq) { 357 if (nbdev->nsid == nsid) { 358 break; 359 } 360 } 361 pthread_mutex_unlock(&g_bdev_nvme_mutex); 362 363 return nbdev; 364 } 365 366 struct nvme_ns * 367 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 368 { 369 struct nvme_ns ns; 370 371 assert(nsid > 0); 372 373 ns.id = nsid; 374 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 375 } 376 377 struct nvme_ns * 378 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 379 { 380 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 381 } 382 383 struct nvme_ns * 384 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 385 { 386 if (ns == NULL) { 387 return NULL; 388 } 389 390 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 391 } 392 393 static struct nvme_ctrlr * 394 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn) 395 { 396 struct nvme_bdev_ctrlr *nbdev_ctrlr; 397 struct nvme_ctrlr *nvme_ctrlr = NULL; 398 399 
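	/* A controller connection is identified by the (transport ID, host NQN) pair, so walk
	 * every registered nvme_bdev_ctrlr under g_bdev_nvme_mutex and compare both fields.
	 */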
pthread_mutex_lock(&g_bdev_nvme_mutex); 400 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 401 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn); 402 if (nvme_ctrlr != NULL) { 403 break; 404 } 405 } 406 pthread_mutex_unlock(&g_bdev_nvme_mutex); 407 408 return nvme_ctrlr; 409 } 410 411 struct nvme_ctrlr * 412 nvme_ctrlr_get_by_name(const char *name) 413 { 414 struct nvme_bdev_ctrlr *nbdev_ctrlr; 415 struct nvme_ctrlr *nvme_ctrlr = NULL; 416 417 if (name == NULL) { 418 return NULL; 419 } 420 421 pthread_mutex_lock(&g_bdev_nvme_mutex); 422 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 423 if (nbdev_ctrlr != NULL) { 424 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 425 } 426 pthread_mutex_unlock(&g_bdev_nvme_mutex); 427 428 return nvme_ctrlr; 429 } 430 431 void 432 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 433 { 434 struct nvme_bdev_ctrlr *nbdev_ctrlr; 435 436 pthread_mutex_lock(&g_bdev_nvme_mutex); 437 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 438 fn(nbdev_ctrlr, ctx); 439 } 440 pthread_mutex_unlock(&g_bdev_nvme_mutex); 441 } 442 443 struct nvme_ctrlr_channel_iter { 444 nvme_ctrlr_for_each_channel_msg fn; 445 nvme_ctrlr_for_each_channel_done cpl; 446 struct spdk_io_channel_iter *i; 447 void *ctx; 448 }; 449 450 void 451 nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status) 452 { 453 spdk_for_each_channel_continue(iter->i, status); 454 } 455 456 static void 457 nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i) 458 { 459 struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 460 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 461 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 462 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 463 464 iter->i = i; 465 iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx); 466 } 467 468 static void 469 nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 470 { 471 struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 472 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 473 474 iter->i = i; 475 iter->cpl(nvme_ctrlr, iter->ctx, status); 476 477 free(iter); 478 } 479 480 void 481 nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr, 482 nvme_ctrlr_for_each_channel_msg fn, void *ctx, 483 nvme_ctrlr_for_each_channel_done cpl) 484 { 485 struct nvme_ctrlr_channel_iter *iter; 486 487 assert(nvme_ctrlr != NULL && fn != NULL); 488 489 iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter)); 490 if (iter == NULL) { 491 SPDK_ERRLOG("Unable to allocate iterator\n"); 492 assert(false); 493 return; 494 } 495 496 iter->fn = fn; 497 iter->cpl = cpl; 498 iter->ctx = ctx; 499 500 spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg, 501 iter, nvme_ctrlr_each_channel_cpl); 502 } 503 504 struct nvme_bdev_channel_iter { 505 nvme_bdev_for_each_channel_msg fn; 506 nvme_bdev_for_each_channel_done cpl; 507 struct spdk_io_channel_iter *i; 508 void *ctx; 509 }; 510 511 void 512 nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status) 513 { 514 spdk_for_each_channel_continue(iter->i, status); 515 } 516 517 static void 518 nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i) 519 { 520 struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 521 struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); 522 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 523 struct 
nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 524 525 iter->i = i; 526 iter->fn(iter, nbdev, nbdev_ch, iter->ctx); 527 } 528 529 static void 530 nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 531 { 532 struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 533 struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); 534 535 iter->i = i; 536 iter->cpl(nbdev, iter->ctx, status); 537 538 free(iter); 539 } 540 541 void 542 nvme_bdev_for_each_channel(struct nvme_bdev *nbdev, 543 nvme_bdev_for_each_channel_msg fn, void *ctx, 544 nvme_bdev_for_each_channel_done cpl) 545 { 546 struct nvme_bdev_channel_iter *iter; 547 548 assert(nbdev != NULL && fn != NULL); 549 550 iter = calloc(1, sizeof(struct nvme_bdev_channel_iter)); 551 if (iter == NULL) { 552 SPDK_ERRLOG("Unable to allocate iterator\n"); 553 assert(false); 554 return; 555 } 556 557 iter->fn = fn; 558 iter->cpl = cpl; 559 iter->ctx = ctx; 560 561 spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter, 562 nvme_bdev_each_channel_cpl); 563 } 564 565 void 566 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 567 { 568 const char *trtype_str; 569 const char *adrfam_str; 570 571 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 572 if (trtype_str) { 573 spdk_json_write_named_string(w, "trtype", trtype_str); 574 } 575 576 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 577 if (adrfam_str) { 578 spdk_json_write_named_string(w, "adrfam", adrfam_str); 579 } 580 581 if (trid->traddr[0] != '\0') { 582 spdk_json_write_named_string(w, "traddr", trid->traddr); 583 } 584 585 if (trid->trsvcid[0] != '\0') { 586 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 587 } 588 589 if (trid->subnqn[0] != '\0') { 590 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 591 } 592 } 593 594 static void 595 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 596 struct nvme_ctrlr *nvme_ctrlr) 597 { 598 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 599 pthread_mutex_lock(&g_bdev_nvme_mutex); 600 601 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 602 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 603 pthread_mutex_unlock(&g_bdev_nvme_mutex); 604 605 return; 606 } 607 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 608 609 pthread_mutex_unlock(&g_bdev_nvme_mutex); 610 611 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 612 613 free(nbdev_ctrlr->name); 614 free(nbdev_ctrlr); 615 } 616 617 static void 618 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 619 { 620 struct nvme_path_id *path_id, *tmp_path; 621 struct nvme_ns *ns, *tmp_ns; 622 623 free(nvme_ctrlr->copied_ana_desc); 624 spdk_free(nvme_ctrlr->ana_log_page); 625 626 if (nvme_ctrlr->opal_dev) { 627 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 628 nvme_ctrlr->opal_dev = NULL; 629 } 630 631 if (nvme_ctrlr->nbdev_ctrlr) { 632 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 633 } 634 635 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 636 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 637 nvme_ns_free(ns); 638 } 639 640 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 641 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 642 free(path_id); 643 } 644 645 pthread_mutex_destroy(&nvme_ctrlr->mutex); 646 spdk_keyring_put_key(nvme_ctrlr->psk); 647 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 648 spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key); 649 free(nvme_ctrlr); 650 
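	/* If the module is finishing and this was the last controller, unregister the
	 * g_nvme_bdev_ctrlrs io_device and signal completion of the asynchronous module
	 * fini (nvme_if has .async_fini = true).
	 */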
651 pthread_mutex_lock(&g_bdev_nvme_mutex); 652 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 653 pthread_mutex_unlock(&g_bdev_nvme_mutex); 654 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 655 spdk_bdev_module_fini_done(); 656 return; 657 } 658 pthread_mutex_unlock(&g_bdev_nvme_mutex); 659 } 660 661 static int 662 nvme_detach_poller(void *arg) 663 { 664 struct nvme_ctrlr *nvme_ctrlr = arg; 665 int rc; 666 667 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 668 if (rc != -EAGAIN) { 669 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 670 _nvme_ctrlr_delete(nvme_ctrlr); 671 } 672 673 return SPDK_POLLER_BUSY; 674 } 675 676 static void 677 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 678 { 679 int rc; 680 681 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 682 683 if (spdk_interrupt_mode_is_enabled()) { 684 spdk_interrupt_unregister(&nvme_ctrlr->intr); 685 } 686 687 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 688 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 689 690 /* If we got here, the reset/detach poller cannot be active */ 691 assert(nvme_ctrlr->reset_detach_poller == NULL); 692 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 693 nvme_ctrlr, 1000); 694 if (nvme_ctrlr->reset_detach_poller == NULL) { 695 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n"); 696 goto error; 697 } 698 699 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 700 if (rc != 0) { 701 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n"); 702 goto error; 703 } 704 705 return; 706 error: 707 /* We don't have a good way to handle errors here, so just do what we can and delete the 708 * controller without detaching the underlying NVMe device. 
709 */ 710 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 711 _nvme_ctrlr_delete(nvme_ctrlr); 712 } 713 714 static void 715 nvme_ctrlr_unregister_cb(void *io_device) 716 { 717 struct nvme_ctrlr *nvme_ctrlr = io_device; 718 719 nvme_ctrlr_delete(nvme_ctrlr); 720 } 721 722 static void 723 nvme_ctrlr_unregister(void *ctx) 724 { 725 struct nvme_ctrlr *nvme_ctrlr = ctx; 726 727 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 728 } 729 730 static bool 731 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 732 { 733 if (!nvme_ctrlr->destruct) { 734 return false; 735 } 736 737 if (nvme_ctrlr->ref > 0) { 738 return false; 739 } 740 741 if (nvme_ctrlr->resetting) { 742 return false; 743 } 744 745 if (nvme_ctrlr->ana_log_page_updating) { 746 return false; 747 } 748 749 if (nvme_ctrlr->io_path_cache_clearing) { 750 return false; 751 } 752 753 return true; 754 } 755 756 static void 757 nvme_ctrlr_put_ref(struct nvme_ctrlr *nvme_ctrlr) 758 { 759 pthread_mutex_lock(&nvme_ctrlr->mutex); 760 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 761 762 assert(nvme_ctrlr->ref > 0); 763 nvme_ctrlr->ref--; 764 765 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 766 pthread_mutex_unlock(&nvme_ctrlr->mutex); 767 return; 768 } 769 770 pthread_mutex_unlock(&nvme_ctrlr->mutex); 771 772 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 773 } 774 775 static void 776 nvme_ctrlr_get_ref(struct nvme_ctrlr *nvme_ctrlr) 777 { 778 pthread_mutex_lock(&nvme_ctrlr->mutex); 779 nvme_ctrlr->ref++; 780 pthread_mutex_unlock(&nvme_ctrlr->mutex); 781 } 782 783 static void 784 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 785 { 786 nbdev_ch->current_io_path = NULL; 787 nbdev_ch->rr_counter = 0; 788 } 789 790 static struct nvme_io_path * 791 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 792 { 793 struct nvme_io_path *io_path; 794 795 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 796 if (io_path->nvme_ns == nvme_ns) { 797 break; 798 } 799 } 800 801 return io_path; 802 } 803 804 static struct nvme_io_path * 805 nvme_io_path_alloc(void) 806 { 807 struct nvme_io_path *io_path; 808 809 io_path = calloc(1, sizeof(*io_path)); 810 if (io_path == NULL) { 811 SPDK_ERRLOG("Failed to alloc io_path.\n"); 812 return NULL; 813 } 814 815 if (g_opts.io_path_stat) { 816 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 817 if (io_path->stat == NULL) { 818 free(io_path); 819 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 820 return NULL; 821 } 822 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 823 } 824 825 return io_path; 826 } 827 828 static void 829 nvme_io_path_free(struct nvme_io_path *io_path) 830 { 831 free(io_path->stat); 832 free(io_path); 833 } 834 835 static int 836 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 837 { 838 struct nvme_io_path *io_path; 839 struct spdk_io_channel *ch; 840 struct nvme_ctrlr_channel *ctrlr_ch; 841 struct nvme_qpair *nvme_qpair; 842 843 io_path = nvme_io_path_alloc(); 844 if (io_path == NULL) { 845 return -ENOMEM; 846 } 847 848 io_path->nvme_ns = nvme_ns; 849 850 ch = spdk_get_io_channel(nvme_ns->ctrlr); 851 if (ch == NULL) { 852 nvme_io_path_free(io_path); 853 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 854 return -ENOMEM; 855 } 856 857 ctrlr_ch = spdk_io_channel_get_ctx(ch); 858 859 nvme_qpair = ctrlr_ch->qpair; 860 assert(nvme_qpair != NULL); 861 862 
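	/* Link the io_path in two directions: to the nvme_qpair's io_path_list, so that it can
	 * be freed together with the qpair, and to the nvme_bdev_channel's io_path_list, so
	 * that the path selector can see it.
	 */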
io_path->qpair = nvme_qpair; 863 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 864 865 io_path->nbdev_ch = nbdev_ch; 866 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 867 868 bdev_nvme_clear_current_io_path(nbdev_ch); 869 870 return 0; 871 } 872 873 static void 874 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 875 struct nvme_io_path *io_path) 876 { 877 struct nvme_bdev_io *bio; 878 879 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 880 if (bio->io_path == io_path) { 881 bio->io_path = NULL; 882 } 883 } 884 } 885 886 static void 887 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 888 { 889 struct spdk_io_channel *ch; 890 struct nvme_qpair *nvme_qpair; 891 struct nvme_ctrlr_channel *ctrlr_ch; 892 struct nvme_bdev *nbdev; 893 894 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 895 896 /* Add the statistics to nvme_ns before this path is destroyed. */ 897 pthread_mutex_lock(&nbdev->mutex); 898 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 899 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 900 } 901 pthread_mutex_unlock(&nbdev->mutex); 902 903 bdev_nvme_clear_current_io_path(nbdev_ch); 904 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 905 906 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 907 io_path->nbdev_ch = NULL; 908 909 nvme_qpair = io_path->qpair; 910 assert(nvme_qpair != NULL); 911 912 ctrlr_ch = nvme_qpair->ctrlr_ch; 913 assert(ctrlr_ch != NULL); 914 915 ch = spdk_io_channel_from_ctx(ctrlr_ch); 916 spdk_put_io_channel(ch); 917 918 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 919 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 920 * io_path here but free the io_path when the associated qpair is freed. It is ensured 921 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 922 */ 923 } 924 925 static void 926 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 927 { 928 struct nvme_io_path *io_path, *tmp_io_path; 929 930 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 931 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 932 } 933 } 934 935 static int 936 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 937 { 938 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 939 struct nvme_bdev *nbdev = io_device; 940 struct nvme_ns *nvme_ns; 941 int rc; 942 943 STAILQ_INIT(&nbdev_ch->io_path_list); 944 TAILQ_INIT(&nbdev_ch->retry_io_list); 945 946 pthread_mutex_lock(&nbdev->mutex); 947 948 nbdev_ch->mp_policy = nbdev->mp_policy; 949 nbdev_ch->mp_selector = nbdev->mp_selector; 950 nbdev_ch->rr_min_io = nbdev->rr_min_io; 951 952 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 953 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 954 if (rc != 0) { 955 pthread_mutex_unlock(&nbdev->mutex); 956 957 _bdev_nvme_delete_io_paths(nbdev_ch); 958 return rc; 959 } 960 } 961 pthread_mutex_unlock(&nbdev->mutex); 962 963 return 0; 964 } 965 966 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 967 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
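 *
 * For example, completions that carry an NVMe completion entry pass 'cpl' so that the
 * sct/sc/cdw0 values reach the bdev layer, while I/Os aborted internally (e.g. from the
 * retry list) pass cpl == NULL with SPDK_BDEV_IO_STATUS_ABORTED.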
968 */ 969 static inline void 970 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 971 const struct spdk_nvme_cpl *cpl) 972 { 973 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 974 (uintptr_t)bdev_io); 975 if (cpl) { 976 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 977 } else { 978 spdk_bdev_io_complete(bdev_io, status); 979 } 980 } 981 982 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 983 984 static void 985 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 986 { 987 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 988 989 bdev_nvme_abort_retry_ios(nbdev_ch); 990 _bdev_nvme_delete_io_paths(nbdev_ch); 991 } 992 993 static inline bool 994 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 995 { 996 switch (io_type) { 997 case SPDK_BDEV_IO_TYPE_RESET: 998 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 999 case SPDK_BDEV_IO_TYPE_ABORT: 1000 return true; 1001 default: 1002 break; 1003 } 1004 1005 return false; 1006 } 1007 1008 static inline bool 1009 nvme_ns_is_active(struct nvme_ns *nvme_ns) 1010 { 1011 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 1012 return false; 1013 } 1014 1015 if (spdk_unlikely(nvme_ns->ns == NULL)) { 1016 return false; 1017 } 1018 1019 return true; 1020 } 1021 1022 static inline bool 1023 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 1024 { 1025 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 1026 return false; 1027 } 1028 1029 switch (nvme_ns->ana_state) { 1030 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1031 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1032 return true; 1033 default: 1034 break; 1035 } 1036 1037 return false; 1038 } 1039 1040 static inline bool 1041 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 1042 { 1043 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 1044 return false; 1045 } 1046 1047 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1048 SPDK_NVME_QPAIR_FAILURE_NONE)) { 1049 return false; 1050 } 1051 1052 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 1053 return false; 1054 } 1055 1056 return true; 1057 } 1058 1059 static inline bool 1060 nvme_io_path_is_available(struct nvme_io_path *io_path) 1061 { 1062 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1063 return false; 1064 } 1065 1066 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 1067 return false; 1068 } 1069 1070 return true; 1071 } 1072 1073 static inline bool 1074 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 1075 { 1076 if (nvme_ctrlr->destruct) { 1077 return true; 1078 } 1079 1080 if (nvme_ctrlr->fast_io_fail_timedout) { 1081 return true; 1082 } 1083 1084 if (nvme_ctrlr->resetting) { 1085 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 1086 return false; 1087 } else { 1088 return true; 1089 } 1090 } 1091 1092 if (nvme_ctrlr->reconnect_is_delayed) { 1093 return false; 1094 } 1095 1096 if (nvme_ctrlr->disabled) { 1097 return true; 1098 } 1099 1100 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 1101 return true; 1102 } else { 1103 return false; 1104 } 1105 } 1106 1107 static bool 1108 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 1109 { 1110 if (nvme_ctrlr->destruct) { 1111 return false; 1112 } 1113 1114 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 1115 return false; 1116 } 1117 1118 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 1119 return false; 1120 } 1121 1122 if (nvme_ctrlr->disabled) { 1123 return false; 1124 } 
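	/* Note: unlike nvme_ctrlr_is_failed(), a resetting or reconnect-delayed controller is
	 * reported as unavailable here even though it may still recover later.
	 */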
1125 1126 return true; 1127 } 1128 1129 /* Simulate circular linked list. */ 1130 static inline struct nvme_io_path * 1131 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 1132 { 1133 struct nvme_io_path *next_path; 1134 1135 if (prev_path != NULL) { 1136 next_path = STAILQ_NEXT(prev_path, stailq); 1137 if (next_path != NULL) { 1138 return next_path; 1139 } 1140 } 1141 1142 return STAILQ_FIRST(&nbdev_ch->io_path_list); 1143 } 1144 1145 static struct nvme_io_path * 1146 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1147 { 1148 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 1149 1150 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 1151 1152 io_path = start; 1153 do { 1154 if (spdk_likely(nvme_io_path_is_available(io_path))) { 1155 switch (io_path->nvme_ns->ana_state) { 1156 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1157 nbdev_ch->current_io_path = io_path; 1158 return io_path; 1159 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1160 if (non_optimized == NULL) { 1161 non_optimized = io_path; 1162 } 1163 break; 1164 default: 1165 assert(false); 1166 break; 1167 } 1168 } 1169 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 1170 } while (io_path != start); 1171 1172 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 1173 /* We come here only if there is no optimized path. Cache even non_optimized 1174 * path for load balance across multiple non_optimized paths. 1175 */ 1176 nbdev_ch->current_io_path = non_optimized; 1177 } 1178 1179 return non_optimized; 1180 } 1181 1182 static struct nvme_io_path * 1183 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1184 { 1185 struct nvme_io_path *io_path; 1186 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1187 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1188 uint32_t num_outstanding_reqs; 1189 1190 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1191 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1192 /* The device is currently resetting. 
*/ 1193 continue; 1194 } 1195 1196 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1197 continue; 1198 } 1199 1200 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1201 switch (io_path->nvme_ns->ana_state) { 1202 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1203 if (num_outstanding_reqs < opt_min_qd) { 1204 opt_min_qd = num_outstanding_reqs; 1205 optimized = io_path; 1206 } 1207 break; 1208 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1209 if (num_outstanding_reqs < non_opt_min_qd) { 1210 non_opt_min_qd = num_outstanding_reqs; 1211 non_optimized = io_path; 1212 } 1213 break; 1214 default: 1215 break; 1216 } 1217 } 1218 1219 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1220 if (optimized != NULL) { 1221 return optimized; 1222 } 1223 1224 return non_optimized; 1225 } 1226 1227 static inline struct nvme_io_path * 1228 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1229 { 1230 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1231 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1232 return nbdev_ch->current_io_path; 1233 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1234 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1235 return nbdev_ch->current_io_path; 1236 } 1237 nbdev_ch->rr_counter = 0; 1238 } 1239 } 1240 1241 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1242 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1243 return _bdev_nvme_find_io_path(nbdev_ch); 1244 } else { 1245 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1246 } 1247 } 1248 1249 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1250 * or false otherwise. 1251 * 1252 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1253 * is likely to be non-accessible now but may become accessible. 1254 * 1255 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1256 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1257 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1258 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
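 *
 * The retry logic uses this to decide whether a failed I/O should be queued and retried
 * later (true) or completed with an error immediately (false).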
1259 */ 1260 static bool 1261 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1262 { 1263 struct nvme_io_path *io_path; 1264 1265 if (nbdev_ch->resetting) { 1266 return false; 1267 } 1268 1269 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1270 if (io_path->nvme_ns->ana_transition_timedout) { 1271 continue; 1272 } 1273 1274 if (nvme_qpair_is_connected(io_path->qpair) || 1275 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1276 return true; 1277 } 1278 } 1279 1280 return false; 1281 } 1282 1283 static void 1284 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1285 { 1286 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1287 struct spdk_io_channel *ch; 1288 1289 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1290 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1291 } else { 1292 ch = spdk_io_channel_from_ctx(nbdev_ch); 1293 bdev_nvme_submit_request(ch, bdev_io); 1294 } 1295 } 1296 1297 static int 1298 bdev_nvme_retry_ios(void *arg) 1299 { 1300 struct nvme_bdev_channel *nbdev_ch = arg; 1301 struct nvme_bdev_io *bio, *tmp_bio; 1302 uint64_t now, delay_us; 1303 1304 now = spdk_get_ticks(); 1305 1306 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1307 if (bio->retry_ticks > now) { 1308 break; 1309 } 1310 1311 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1312 1313 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1314 } 1315 1316 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1317 1318 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1319 if (bio != NULL) { 1320 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1321 1322 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1323 delay_us); 1324 } 1325 1326 return SPDK_POLLER_BUSY; 1327 } 1328 1329 static void 1330 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1331 struct nvme_bdev_io *bio, uint64_t delay_ms) 1332 { 1333 struct nvme_bdev_io *tmp_bio; 1334 1335 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1336 1337 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1338 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1339 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1340 retry_link); 1341 return; 1342 } 1343 } 1344 1345 /* No earlier I/Os were found. This I/O must be the new head. 
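 * Re-arm the poller below so that it fires when this new head expires; the list is kept
 * sorted by retry_ticks, which lets bdev_nvme_retry_ios() stop at the first I/O that is
 * not yet due.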
*/ 1346 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1347 1348 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1349 1350 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1351 delay_ms * 1000ULL); 1352 } 1353 1354 static void 1355 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1356 { 1357 struct nvme_bdev_io *bio, *tmp_bio; 1358 1359 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1360 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1361 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1362 } 1363 1364 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1365 } 1366 1367 static int 1368 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1369 struct nvme_bdev_io *bio_to_abort) 1370 { 1371 struct nvme_bdev_io *bio; 1372 1373 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1374 if (bio == bio_to_abort) { 1375 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1376 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1377 return 0; 1378 } 1379 } 1380 1381 return -ENOENT; 1382 } 1383 1384 static void 1385 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1386 { 1387 struct nvme_bdev *nbdev; 1388 uint16_t sct, sc; 1389 1390 assert(spdk_nvme_cpl_is_error(cpl)); 1391 1392 nbdev = bdev_io->bdev->ctxt; 1393 1394 if (nbdev->err_stat == NULL) { 1395 return; 1396 } 1397 1398 sct = cpl->status.sct; 1399 sc = cpl->status.sc; 1400 1401 pthread_mutex_lock(&nbdev->mutex); 1402 1403 nbdev->err_stat->status_type[sct]++; 1404 switch (sct) { 1405 case SPDK_NVME_SCT_GENERIC: 1406 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1407 case SPDK_NVME_SCT_MEDIA_ERROR: 1408 case SPDK_NVME_SCT_PATH: 1409 nbdev->err_stat->status[sct][sc]++; 1410 break; 1411 default: 1412 break; 1413 } 1414 1415 pthread_mutex_unlock(&nbdev->mutex); 1416 } 1417 1418 static inline void 1419 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1420 { 1421 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1422 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1423 uint32_t blocklen = bdev_io->bdev->blocklen; 1424 struct spdk_bdev_io_stat *stat; 1425 uint64_t tsc_diff; 1426 1427 if (bio->io_path->stat == NULL) { 1428 return; 1429 } 1430 1431 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1432 stat = bio->io_path->stat; 1433 1434 switch (bdev_io->type) { 1435 case SPDK_BDEV_IO_TYPE_READ: 1436 stat->bytes_read += num_blocks * blocklen; 1437 stat->num_read_ops++; 1438 stat->read_latency_ticks += tsc_diff; 1439 if (stat->max_read_latency_ticks < tsc_diff) { 1440 stat->max_read_latency_ticks = tsc_diff; 1441 } 1442 if (stat->min_read_latency_ticks > tsc_diff) { 1443 stat->min_read_latency_ticks = tsc_diff; 1444 } 1445 break; 1446 case SPDK_BDEV_IO_TYPE_WRITE: 1447 stat->bytes_written += num_blocks * blocklen; 1448 stat->num_write_ops++; 1449 stat->write_latency_ticks += tsc_diff; 1450 if (stat->max_write_latency_ticks < tsc_diff) { 1451 stat->max_write_latency_ticks = tsc_diff; 1452 } 1453 if (stat->min_write_latency_ticks > tsc_diff) { 1454 stat->min_write_latency_ticks = tsc_diff; 1455 } 1456 break; 1457 case SPDK_BDEV_IO_TYPE_UNMAP: 1458 stat->bytes_unmapped += num_blocks * blocklen; 1459 stat->num_unmap_ops++; 1460 stat->unmap_latency_ticks += tsc_diff; 1461 if (stat->max_unmap_latency_ticks < tsc_diff) { 1462 stat->max_unmap_latency_ticks = tsc_diff; 1463 } 1464 if (stat->min_unmap_latency_ticks > 
tsc_diff) { 1465 stat->min_unmap_latency_ticks = tsc_diff; 1466 } 1467 break; 1468 case SPDK_BDEV_IO_TYPE_ZCOPY: 1469 /* Track the data in the start phase only */ 1470 if (!bdev_io->u.bdev.zcopy.start) { 1471 break; 1472 } 1473 if (bdev_io->u.bdev.zcopy.populate) { 1474 stat->bytes_read += num_blocks * blocklen; 1475 stat->num_read_ops++; 1476 stat->read_latency_ticks += tsc_diff; 1477 if (stat->max_read_latency_ticks < tsc_diff) { 1478 stat->max_read_latency_ticks = tsc_diff; 1479 } 1480 if (stat->min_read_latency_ticks > tsc_diff) { 1481 stat->min_read_latency_ticks = tsc_diff; 1482 } 1483 } else { 1484 stat->bytes_written += num_blocks * blocklen; 1485 stat->num_write_ops++; 1486 stat->write_latency_ticks += tsc_diff; 1487 if (stat->max_write_latency_ticks < tsc_diff) { 1488 stat->max_write_latency_ticks = tsc_diff; 1489 } 1490 if (stat->min_write_latency_ticks > tsc_diff) { 1491 stat->min_write_latency_ticks = tsc_diff; 1492 } 1493 } 1494 break; 1495 case SPDK_BDEV_IO_TYPE_COPY: 1496 stat->bytes_copied += num_blocks * blocklen; 1497 stat->num_copy_ops++; 1498 stat->copy_latency_ticks += tsc_diff; 1499 if (stat->max_copy_latency_ticks < tsc_diff) { 1500 stat->max_copy_latency_ticks = tsc_diff; 1501 } 1502 if (stat->min_copy_latency_ticks > tsc_diff) { 1503 stat->min_copy_latency_ticks = tsc_diff; 1504 } 1505 break; 1506 default: 1507 break; 1508 } 1509 } 1510 1511 static bool 1512 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1513 const struct spdk_nvme_cpl *cpl, 1514 struct nvme_bdev_channel *nbdev_ch, 1515 uint64_t *_delay_ms) 1516 { 1517 struct nvme_io_path *io_path = bio->io_path; 1518 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1519 const struct spdk_nvme_ctrlr_data *cdata; 1520 1521 if (spdk_nvme_cpl_is_path_error(cpl) || 1522 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1523 !nvme_io_path_is_available(io_path) || 1524 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1525 bdev_nvme_clear_current_io_path(nbdev_ch); 1526 bio->io_path = NULL; 1527 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1528 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1529 io_path->nvme_ns->ana_state_updating = true; 1530 } 1531 } 1532 if (!any_io_path_may_become_available(nbdev_ch)) { 1533 return false; 1534 } 1535 *_delay_ms = 0; 1536 } else { 1537 bio->retry_count++; 1538 1539 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1540 1541 if (cpl->status.crd != 0) { 1542 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1543 } else { 1544 *_delay_ms = 0; 1545 } 1546 } 1547 1548 return true; 1549 } 1550 1551 static inline void 1552 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1553 const struct spdk_nvme_cpl *cpl) 1554 { 1555 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1556 struct nvme_bdev_channel *nbdev_ch; 1557 uint64_t delay_ms; 1558 1559 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1560 1561 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1562 bdev_nvme_update_io_path_stat(bio); 1563 goto complete; 1564 } 1565 1566 /* Update error counts before deciding if retry is needed. 1567 * Hence, error counts may be more than the number of I/O errors. 
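 * For example, an I/O that fails twice and then succeeds on a later retry still adds two
 * entries to nbdev->err_stat.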
1568 */ 1569 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1570 1571 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1572 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1573 goto complete; 1574 } 1575 1576 /* At this point we don't know whether the sequence was successfully executed or not, so we 1577 * cannot retry the IO */ 1578 if (bdev_io->u.bdev.accel_sequence != NULL) { 1579 goto complete; 1580 } 1581 1582 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1583 1584 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1585 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1586 return; 1587 } 1588 1589 complete: 1590 bio->retry_count = 0; 1591 bio->submit_tsc = 0; 1592 bdev_io->u.bdev.accel_sequence = NULL; 1593 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1594 } 1595 1596 static inline void 1597 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1598 { 1599 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1600 struct nvme_bdev_channel *nbdev_ch; 1601 enum spdk_bdev_io_status io_status; 1602 1603 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1604 1605 switch (rc) { 1606 case 0: 1607 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1608 break; 1609 case -ENOMEM: 1610 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1611 break; 1612 case -ENXIO: 1613 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1614 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1615 1616 bdev_nvme_clear_current_io_path(nbdev_ch); 1617 bio->io_path = NULL; 1618 1619 if (any_io_path_may_become_available(nbdev_ch)) { 1620 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1621 return; 1622 } 1623 } 1624 1625 /* fallthrough */ 1626 default: 1627 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1628 bdev_io->u.bdev.accel_sequence = NULL; 1629 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1630 break; 1631 } 1632 1633 bio->retry_count = 0; 1634 bio->submit_tsc = 0; 1635 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1636 } 1637 1638 static inline void 1639 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1640 { 1641 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1642 enum spdk_bdev_io_status io_status; 1643 1644 switch (rc) { 1645 case 0: 1646 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1647 break; 1648 case -ENOMEM: 1649 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1650 break; 1651 case -ENXIO: 1652 /* fallthrough */ 1653 default: 1654 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1655 break; 1656 } 1657 1658 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1659 } 1660 1661 static void 1662 bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr, 1663 void *ctx, int status) 1664 { 1665 pthread_mutex_lock(&nvme_ctrlr->mutex); 1666 1667 assert(nvme_ctrlr->io_path_cache_clearing == true); 1668 nvme_ctrlr->io_path_cache_clearing = false; 1669 1670 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1671 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1672 return; 1673 } 1674 1675 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1676 1677 nvme_ctrlr_unregister(nvme_ctrlr); 1678 } 1679 1680 static void 1681 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1682 { 1683 struct nvme_io_path *io_path; 1684 1685 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1686 if (io_path->nbdev_ch == NULL) { 1687 continue; 1688 } 1689 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1690 } 1691 } 1692 1693 static void 1694 
bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i, 1695 struct nvme_ctrlr *nvme_ctrlr, 1696 struct nvme_ctrlr_channel *ctrlr_ch, 1697 void *ctx) 1698 { 1699 assert(ctrlr_ch->qpair != NULL); 1700 1701 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1702 1703 nvme_ctrlr_for_each_channel_continue(i, 0); 1704 } 1705 1706 static void 1707 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1708 { 1709 pthread_mutex_lock(&nvme_ctrlr->mutex); 1710 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1711 nvme_ctrlr->io_path_cache_clearing) { 1712 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1713 return; 1714 } 1715 1716 nvme_ctrlr->io_path_cache_clearing = true; 1717 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1718 1719 nvme_ctrlr_for_each_channel(nvme_ctrlr, 1720 bdev_nvme_clear_io_path_cache, 1721 NULL, 1722 bdev_nvme_clear_io_path_caches_done); 1723 } 1724 1725 static struct nvme_qpair * 1726 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1727 { 1728 struct nvme_qpair *nvme_qpair; 1729 1730 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1731 if (nvme_qpair->qpair == qpair) { 1732 break; 1733 } 1734 } 1735 1736 return nvme_qpair; 1737 } 1738 1739 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1740 1741 static void 1742 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1743 { 1744 struct nvme_poll_group *group = poll_group_ctx; 1745 struct nvme_qpair *nvme_qpair; 1746 struct nvme_ctrlr *nvme_ctrlr; 1747 struct nvme_ctrlr_channel *ctrlr_ch; 1748 int status; 1749 1750 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1751 if (nvme_qpair == NULL) { 1752 return; 1753 } 1754 1755 if (nvme_qpair->qpair != NULL) { 1756 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1757 nvme_qpair->qpair = NULL; 1758 } 1759 1760 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1761 1762 nvme_ctrlr = nvme_qpair->ctrlr; 1763 ctrlr_ch = nvme_qpair->ctrlr_ch; 1764 1765 if (ctrlr_ch != NULL) { 1766 if (ctrlr_ch->reset_iter != NULL) { 1767 /* We are in a full reset sequence. */ 1768 if (ctrlr_ch->connect_poller != NULL) { 1769 /* The qpair failed to connect. Abort the reset sequence. */ 1770 NVME_CTRLR_INFOLOG(nvme_ctrlr, 1771 "qpair %p failed to connect. Aborting the ctrlr reset sequence.\n", 1772 qpair); 1773 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1774 status = -1; 1775 } else { 1776 /* The qpair finished disconnecting. Just move to the next ctrlr_channel. */ 1777 NVME_CTRLR_INFOLOG(nvme_ctrlr, 1778 "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1779 qpair); 1780 status = 0; 1781 } 1782 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1783 ctrlr_ch->reset_iter = NULL; 1784 } else { 1785 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1786 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. reset controller.\n", 1787 qpair); 1788 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1789 } 1790 } else { 1791 /* In this case, the ctrlr_channel has already been deleted. */ 1792 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", 1793 qpair); 1794 nvme_qpair_delete(nvme_qpair); 1795 } 1796 } 1797 1798 static void 1799 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1800 { 1801 struct nvme_qpair *nvme_qpair; 1802 1803 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1804 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1805 continue; 1806 } 1807 1808 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1809 SPDK_NVME_QPAIR_FAILURE_NONE) { 1810 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1811 } 1812 } 1813 } 1814 1815 static int 1816 bdev_nvme_poll(void *arg) 1817 { 1818 struct nvme_poll_group *group = arg; 1819 int64_t num_completions; 1820 1821 if (group->collect_spin_stat && group->start_ticks == 0) { 1822 group->start_ticks = spdk_get_ticks(); 1823 } 1824 1825 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1826 bdev_nvme_disconnected_qpair_cb); 1827 if (group->collect_spin_stat) { 1828 if (num_completions > 0) { 1829 if (group->end_ticks != 0) { 1830 group->spin_ticks += (group->end_ticks - group->start_ticks); 1831 group->end_ticks = 0; 1832 } 1833 group->start_ticks = 0; 1834 } else { 1835 group->end_ticks = spdk_get_ticks(); 1836 } 1837 } 1838 1839 if (spdk_unlikely(num_completions < 0)) { 1840 bdev_nvme_check_io_qpairs(group); 1841 } 1842 1843 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1844 } 1845 1846 static int bdev_nvme_poll_adminq(void *arg); 1847 1848 static void 1849 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1850 { 1851 if (spdk_interrupt_mode_is_enabled()) { 1852 return; 1853 } 1854 1855 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1856 1857 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1858 nvme_ctrlr, new_period_us); 1859 } 1860 1861 static int 1862 bdev_nvme_poll_adminq(void *arg) 1863 { 1864 int32_t rc; 1865 struct nvme_ctrlr *nvme_ctrlr = arg; 1866 nvme_ctrlr_disconnected_cb disconnected_cb; 1867 1868 assert(nvme_ctrlr != NULL); 1869 1870 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1871 if (rc < 0) { 1872 disconnected_cb = nvme_ctrlr->disconnected_cb; 1873 nvme_ctrlr->disconnected_cb = NULL; 1874 1875 if (disconnected_cb != NULL) { 1876 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1877 g_opts.nvme_adminq_poll_period_us); 1878 disconnected_cb(nvme_ctrlr); 1879 } else { 1880 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1881 } 1882 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1883 SPDK_NVME_QPAIR_FAILURE_NONE) { 1884 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1885 } 1886 1887 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 1888 } 1889 1890 static void 1891 nvme_bdev_free(void *io_device) 1892 { 1893 struct nvme_bdev *nbdev = io_device; 1894 1895 pthread_mutex_destroy(&nbdev->mutex); 1896 free(nbdev->disk.name); 1897 free(nbdev->err_stat); 1898 free(nbdev); 1899 } 1900 1901 static int 1902 bdev_nvme_destruct(void *ctx) 1903 { 1904 struct nvme_bdev *nbdev = ctx; 1905 struct nvme_ns *nvme_ns, *tmp_nvme_ns; 1906 1907 SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nbdev->nbdev_ctrlr->name, nbdev->nsid); 1908 1909 pthread_mutex_lock(&nbdev->mutex); 1910 1911 TAILQ_FOREACH_SAFE(nvme_ns, &nbdev->nvme_ns_list, tailq, tmp_nvme_ns) { 1912 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 1913 1914 nvme_ns->bdev = NULL; 1915 1916 assert(nvme_ns->id > 0); 1917 1918 if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) { 1919 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1920 1921 nvme_ctrlr_put_ref(nvme_ns->ctrlr); 1922 nvme_ns_free(nvme_ns); 1923 } else { 1924 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 1925 } 1926 } 1927 1928 pthread_mutex_unlock(&nbdev->mutex); 1929 1930 pthread_mutex_lock(&g_bdev_nvme_mutex); 1931 TAILQ_REMOVE(&nbdev->nbdev_ctrlr->bdevs, nbdev, tailq); 1932 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1933 1934 spdk_io_device_unregister(nbdev, nvme_bdev_free); 1935 1936 return 0; 1937 } 1938 1939 static int 1940 bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair) 1941 { 1942 struct nvme_ctrlr *nvme_ctrlr; 1943 struct spdk_nvme_io_qpair_opts opts; 1944 struct spdk_nvme_qpair *qpair; 1945 int rc; 1946 1947 nvme_ctrlr = nvme_qpair->ctrlr; 1948 1949 spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1950 opts.create_only = true; 1951 /* In interrupt mode qpairs must be created in sync mode, else it will never be connected. 1952 * delay_cmd_submit must be false as in interrupt mode requests cannot be submitted in 1953 * completion context. 
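 * In polled mode the qpair is created in async mode instead, so the connect completes in
 * the background while the poll group processes completions.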
1954 */ 1955 if (!spdk_interrupt_mode_is_enabled()) { 1956 opts.async_mode = true; 1957 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1958 } 1959 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1960 g_opts.io_queue_requests = opts.io_queue_requests; 1961 1962 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1963 if (qpair == NULL) { 1964 return -1; 1965 } 1966 1967 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1968 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1969 1970 assert(nvme_qpair->group != NULL); 1971 1972 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1973 if (rc != 0) { 1974 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n"); 1975 goto err; 1976 } 1977 1978 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1979 if (rc != 0) { 1980 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n"); 1981 goto err; 1982 } 1983 1984 nvme_qpair->qpair = qpair; 1985 1986 if (!g_opts.disable_auto_failback) { 1987 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1988 } 1989 1990 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n", 1991 qpair, spdk_nvme_qpair_get_id(qpair)); 1992 1993 return 0; 1994 1995 err: 1996 spdk_nvme_ctrlr_free_io_qpair(qpair); 1997 1998 return rc; 1999 } 2000 2001 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 2002 2003 static void 2004 bdev_nvme_complete_pending_resets(struct nvme_ctrlr *nvme_ctrlr, bool success) 2005 { 2006 int rc = 0; 2007 struct nvme_bdev_io *bio; 2008 2009 if (!success) { 2010 rc = -1; 2011 } 2012 2013 while (!TAILQ_EMPTY(&nvme_ctrlr->pending_resets)) { 2014 bio = TAILQ_FIRST(&nvme_ctrlr->pending_resets); 2015 TAILQ_REMOVE(&nvme_ctrlr->pending_resets, bio, retry_link); 2016 2017 bdev_nvme_reset_io_continue(bio, rc); 2018 } 2019 } 2020 2021 /* This function marks the current trid as failed by storing the current ticks 2022 * and then sets the next trid to the active trid within a controller if exists. 2023 * 2024 * The purpose of the boolean return value is to request the caller to disconnect 2025 * the current trid now to try connecting the next trid. 2026 */ 2027 static bool 2028 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 2029 { 2030 struct nvme_path_id *path_id, *next_path; 2031 int rc __attribute__((unused)); 2032 2033 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 2034 assert(path_id); 2035 assert(path_id == nvme_ctrlr->active_path_id); 2036 next_path = TAILQ_NEXT(path_id, link); 2037 2038 /* Update the last failed time. It means the trid is failed if its last 2039 * failed time is non-zero. 2040 */ 2041 path_id->last_failed_tsc = spdk_get_ticks(); 2042 2043 if (next_path == NULL) { 2044 /* There is no alternate trid within a controller. */ 2045 return false; 2046 } 2047 2048 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2049 /* Connect is not retried in a controller reset sequence. Connecting 2050 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 
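* This branch is reached from a plain reset (start == false) with no reconnect
* backoff configured (reconnect_delay_sec == 0); keep the current trid active
* and report to the caller that no disconnect is needed now.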
2051 */ 2052 return false; 2053 } 2054 2055 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 2056 2057 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n", 2058 path_id->trid.traddr, path_id->trid.trsvcid, 2059 next_path->trid.traddr, next_path->trid.trsvcid); 2060 2061 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2062 nvme_ctrlr->active_path_id = next_path; 2063 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 2064 assert(rc == 0); 2065 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 2066 if (!remove) { 2067 /** Shuffle the old trid to the end of the list and use the new one. 2068 * Allows for round robin through multiple connections. 2069 */ 2070 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 2071 } else { 2072 free(path_id); 2073 } 2074 2075 if (start || next_path->last_failed_tsc == 0) { 2076 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 2077 * or used yet. Try the next trid now. 2078 */ 2079 return true; 2080 } 2081 2082 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 2083 nvme_ctrlr->opts.reconnect_delay_sec) { 2084 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 2085 return true; 2086 } 2087 2088 /* The next trid will be tried after reconnect_delay_sec seconds. */ 2089 return false; 2090 } 2091 2092 static bool 2093 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 2094 { 2095 int32_t elapsed; 2096 2097 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 2098 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 2099 return false; 2100 } 2101 2102 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2103 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 2104 return true; 2105 } else { 2106 return false; 2107 } 2108 } 2109 2110 static bool 2111 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 2112 { 2113 uint32_t elapsed; 2114 2115 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 2116 return false; 2117 } 2118 2119 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2120 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 2121 return true; 2122 } else { 2123 return false; 2124 } 2125 } 2126 2127 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 2128 2129 static void 2130 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 2131 { 2132 int rc; 2133 2134 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n"); 2135 2136 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 2137 if (rc != 0) { 2138 NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n"); 2139 2140 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 2141 * fail the reset sequence immediately. 2142 */ 2143 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2144 return; 2145 } 2146 2147 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 2148 * Set callback here to execute the specified operation after ctrlr is really disconnected. 2149 */ 2150 assert(nvme_ctrlr->disconnected_cb == NULL); 2151 nvme_ctrlr->disconnected_cb = cb_fn; 2152 2153 /* During disconnection, reduce the period to poll adminq more often. 
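* A period of 0 makes the admin queue poller run on every iteration so that
* bdev_nvme_poll_adminq() notices the completed disconnect promptly; the normal
* period is restored there before disconnected_cb is invoked.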
*/ 2154 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 2155 } 2156 2157 enum bdev_nvme_op_after_reset { 2158 OP_NONE, 2159 OP_COMPLETE_PENDING_DESTRUCT, 2160 OP_DESTRUCT, 2161 OP_DELAYED_RECONNECT, 2162 OP_FAILOVER, 2163 }; 2164 2165 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 2166 2167 static _bdev_nvme_op_after_reset 2168 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 2169 { 2170 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 2171 /* Complete pending destruct after reset completes. */ 2172 return OP_COMPLETE_PENDING_DESTRUCT; 2173 } else if (nvme_ctrlr->pending_failover) { 2174 nvme_ctrlr->pending_failover = false; 2175 nvme_ctrlr->reset_start_tsc = 0; 2176 return OP_FAILOVER; 2177 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2178 nvme_ctrlr->reset_start_tsc = 0; 2179 return OP_NONE; 2180 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2181 return OP_DESTRUCT; 2182 } else { 2183 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 2184 nvme_ctrlr->fast_io_fail_timedout = true; 2185 } 2186 return OP_DELAYED_RECONNECT; 2187 } 2188 } 2189 2190 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 2191 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 2192 2193 static int 2194 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2195 { 2196 struct nvme_ctrlr *nvme_ctrlr = ctx; 2197 2198 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2199 pthread_mutex_lock(&nvme_ctrlr->mutex); 2200 2201 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2202 2203 if (!nvme_ctrlr->reconnect_is_delayed) { 2204 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2205 return SPDK_POLLER_BUSY; 2206 } 2207 2208 nvme_ctrlr->reconnect_is_delayed = false; 2209 2210 if (nvme_ctrlr->destruct) { 2211 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2212 return SPDK_POLLER_BUSY; 2213 } 2214 2215 assert(nvme_ctrlr->resetting == false); 2216 nvme_ctrlr->resetting = true; 2217 2218 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2219 2220 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2221 2222 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2223 return SPDK_POLLER_BUSY; 2224 } 2225 2226 static void 2227 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2228 { 2229 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2230 2231 assert(nvme_ctrlr->reconnect_is_delayed == false); 2232 nvme_ctrlr->reconnect_is_delayed = true; 2233 2234 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2235 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2236 nvme_ctrlr, 2237 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2238 } 2239 2240 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2241 2242 static void 2243 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2244 { 2245 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2246 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2247 enum bdev_nvme_op_after_reset op_after_reset; 2248 2249 assert(nvme_ctrlr->thread == spdk_get_thread()); 2250 2251 pthread_mutex_lock(&nvme_ctrlr->mutex); 2252 if (!success) { 2253 /* Connecting the active trid failed. Set the next alternate trid to the 2254 * active trid if it exists. 2255 */ 2256 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2257 /* The next alternate trid exists and is ready to try. Try it now. 
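* bdev_nvme_failover_trid() has already made the alternate the active trid, so
* disconnect the ctrlr again with bdev_nvme_reconnect_ctrlr() as the
* disconnected callback and retry the connection against it.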
*/ 2258 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2259 2260 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n", 2261 nvme_ctrlr->active_path_id->trid.traddr, 2262 nvme_ctrlr->active_path_id->trid.trsvcid); 2263 2264 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2265 return; 2266 } 2267 2268 /* We came here if there is no alternate trid or if the next trid exists but 2269 * is not ready to try. We will try the active trid after reconnect_delay_sec 2270 * seconds if it is non-zero or at the next reset call otherwise. 2271 */ 2272 } else { 2273 /* Connecting the active trid succeeded. Clear the last failed time because it 2274 * means the trid is failed if its last failed time is non-zero. 2275 */ 2276 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2277 } 2278 2279 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n"); 2280 2281 /* Make sure we clear any pending resets before returning. */ 2282 bdev_nvme_complete_pending_resets(nvme_ctrlr, success); 2283 2284 if (!success) { 2285 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n"); 2286 } else { 2287 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n"); 2288 } 2289 2290 nvme_ctrlr->resetting = false; 2291 nvme_ctrlr->dont_retry = false; 2292 nvme_ctrlr->in_failover = false; 2293 2294 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2295 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2296 2297 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2298 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2299 2300 /* Delay callbacks when the next operation is a failover. */ 2301 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2302 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1); 2303 } 2304 2305 switch (op_after_reset) { 2306 case OP_COMPLETE_PENDING_DESTRUCT: 2307 nvme_ctrlr_unregister(nvme_ctrlr); 2308 break; 2309 case OP_DESTRUCT: 2310 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2311 remove_discovery_entry(nvme_ctrlr); 2312 break; 2313 case OP_DELAYED_RECONNECT: 2314 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2315 break; 2316 case OP_FAILOVER: 2317 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2318 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2319 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2320 break; 2321 default: 2322 break; 2323 } 2324 } 2325 2326 static void 2327 bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2328 { 2329 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2330 } 2331 2332 static void 2333 bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i, 2334 struct nvme_ctrlr *nvme_ctrlr, 2335 struct nvme_ctrlr_channel *ctrlr_ch, void *ctx) 2336 { 2337 struct nvme_qpair *nvme_qpair; 2338 struct spdk_nvme_qpair *qpair; 2339 2340 nvme_qpair = ctrlr_ch->qpair; 2341 assert(nvme_qpair != NULL); 2342 2343 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2344 2345 qpair = nvme_qpair->qpair; 2346 if (qpair != NULL) { 2347 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n", 2348 qpair, spdk_nvme_qpair_get_id(qpair)); 2349 2350 if (nvme_qpair->ctrlr->dont_retry) { 2351 spdk_nvme_qpair_set_abort_dnr(qpair, true); 2352 } 2353 spdk_nvme_ctrlr_disconnect_io_qpair(qpair); 2354 2355 /* The current full reset sequence will move to the next 2356 * ctrlr_channel after the qpair is actually disconnected. 
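* The iterator is parked in ctrlr_ch->reset_iter and resumed later via
* nvme_ctrlr_for_each_channel_continue(), either when the disconnect completes
* or from bdev_nvme_destroy_ctrlr_channel_cb() if the channel is destroyed first.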
2357 */ 2358 assert(ctrlr_ch->reset_iter == NULL); 2359 ctrlr_ch->reset_iter = i; 2360 } else { 2361 nvme_ctrlr_for_each_channel_continue(i, 0); 2362 } 2363 } 2364 2365 static void 2366 bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2367 { 2368 if (status == 0) { 2369 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n"); 2370 2371 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2372 } else { 2373 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs failed to be created after ctrlr reset.\n"); 2374 2375 /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ 2376 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2377 bdev_nvme_reset_destroy_qpair, 2378 NULL, 2379 bdev_nvme_reset_create_qpairs_failed); 2380 } 2381 } 2382 2383 static int 2384 bdev_nvme_reset_check_qpair_connected(void *ctx) 2385 { 2386 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2387 struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair; 2388 struct spdk_nvme_qpair *qpair; 2389 2390 if (ctrlr_ch->reset_iter == NULL) { 2391 /* The qpair already failed to connect and the reset sequence is being aborted. */ 2392 assert(ctrlr_ch->connect_poller == NULL); 2393 assert(nvme_qpair->qpair == NULL); 2394 2395 NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, 2396 "qpair already failed to connect. Reset is being aborted.\n"); 2397 return SPDK_POLLER_BUSY; 2398 } 2399 2400 qpair = nvme_qpair->qpair; 2401 assert(qpair != NULL); 2402 2403 if (!spdk_nvme_qpair_is_connected(qpair)) { 2404 return SPDK_POLLER_BUSY; 2405 } 2406 2407 NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n", 2408 qpair, spdk_nvme_qpair_get_id(qpair)); 2409 2410 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2411 2412 /* The qpair finished connecting. Move to the next ctrlr_channel. */ 2413 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2414 ctrlr_ch->reset_iter = NULL; 2415 2416 if (!g_opts.disable_auto_failback) { 2417 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2418 } 2419 2420 return SPDK_POLLER_BUSY; 2421 } 2422 2423 static void 2424 bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i, 2425 struct nvme_ctrlr *nvme_ctrlr, 2426 struct nvme_ctrlr_channel *ctrlr_ch, 2427 void *ctx) 2428 { 2429 struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair; 2430 struct spdk_nvme_qpair *qpair; 2431 int rc = 0; 2432 2433 if (nvme_qpair->qpair == NULL) { 2434 rc = bdev_nvme_create_qpair(nvme_qpair); 2435 } 2436 if (rc == 0) { 2437 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2438 ctrlr_ch, 0); 2439 2440 qpair = nvme_qpair->qpair; 2441 2442 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking whether qpair %p:%u is connected.\n", 2443 qpair, spdk_nvme_qpair_get_id(qpair)); 2444 2445 /* The current full reset sequence will move to the next 2446 * ctrlr_channel after the qpair is actually connected. 2447 */ 2448 assert(ctrlr_ch->reset_iter == NULL); 2449 ctrlr_ch->reset_iter = i; 2450 } else { 2451 nvme_ctrlr_for_each_channel_continue(i, rc); 2452 } 2453 } 2454 2455 static void 2456 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2457 { 2458 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2459 struct nvme_ns *nvme_ns; 2460 2461 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2462 nvme_ns != NULL; 2463 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2464 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2465 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2466 /* NS can be added again.
Just nullify nvme_ns->ns. */ 2467 nvme_ns->ns = NULL; 2468 } 2469 } 2470 } 2471 2472 2473 static int 2474 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2475 { 2476 struct nvme_ctrlr *nvme_ctrlr = arg; 2477 struct spdk_nvme_transport_id *trid; 2478 int rc = -ETIMEDOUT; 2479 2480 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2481 /* Mark the ctrlr as failed. The next call to 2482 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2483 * do the necessary cleanup and return failure. 2484 */ 2485 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2486 } 2487 2488 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2489 if (rc == -EAGAIN) { 2490 return SPDK_POLLER_BUSY; 2491 } 2492 2493 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2494 if (rc == 0) { 2495 trid = &nvme_ctrlr->active_path_id->trid; 2496 2497 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 2498 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n", 2499 trid->traddr, trid->trsvcid); 2500 } else { 2501 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n"); 2502 } 2503 2504 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2505 2506 /* Recreate all of the I/O queue pairs */ 2507 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2508 bdev_nvme_reset_create_qpair, 2509 NULL, 2510 bdev_nvme_reset_create_qpairs_done); 2511 } else { 2512 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n"); 2513 2514 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2515 } 2516 return SPDK_POLLER_BUSY; 2517 } 2518 2519 static void 2520 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2521 { 2522 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n"); 2523 2524 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2525 2526 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2527 assert(nvme_ctrlr->reset_detach_poller == NULL); 2528 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2529 nvme_ctrlr, 0); 2530 } 2531 2532 static void 2533 bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2534 { 2535 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2536 assert(status == 0); 2537 2538 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n"); 2539 2540 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2541 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2542 } else { 2543 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2544 } 2545 } 2546 2547 static void 2548 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2549 { 2550 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n"); 2551 2552 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2553 bdev_nvme_reset_destroy_qpair, 2554 NULL, 2555 bdev_nvme_reset_destroy_qpair_done); 2556 } 2557 2558 static void 2559 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2560 { 2561 struct nvme_ctrlr *nvme_ctrlr = ctx; 2562 2563 assert(nvme_ctrlr->resetting == true); 2564 assert(nvme_ctrlr->thread == spdk_get_thread()); 2565 2566 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2567 2568 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2569 2570 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2571 } 2572 2573 static void 2574 _bdev_nvme_reset_ctrlr(void *ctx) 2575 { 2576 struct nvme_ctrlr *nvme_ctrlr = ctx; 2577 2578 assert(nvme_ctrlr->resetting == true); 2579 assert(nvme_ctrlr->thread == spdk_get_thread()); 2580 2581 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2582 nvme_ctrlr_disconnect(nvme_ctrlr, 
bdev_nvme_reset_destroy_qpairs); 2583 } else { 2584 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2585 } 2586 } 2587 2588 static int 2589 bdev_nvme_reset_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, spdk_msg_fn *msg_fn) 2590 { 2591 if (nvme_ctrlr->destruct) { 2592 return -ENXIO; 2593 } 2594 2595 if (nvme_ctrlr->resetting) { 2596 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n"); 2597 return -EBUSY; 2598 } 2599 2600 if (nvme_ctrlr->disabled) { 2601 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n"); 2602 return -EALREADY; 2603 } 2604 2605 nvme_ctrlr->resetting = true; 2606 nvme_ctrlr->dont_retry = true; 2607 2608 if (nvme_ctrlr->reconnect_is_delayed) { 2609 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 2610 *msg_fn = bdev_nvme_reconnect_ctrlr_now; 2611 nvme_ctrlr->reconnect_is_delayed = false; 2612 } else { 2613 *msg_fn = _bdev_nvme_reset_ctrlr; 2614 assert(nvme_ctrlr->reset_start_tsc == 0); 2615 } 2616 2617 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2618 2619 return 0; 2620 } 2621 2622 static int 2623 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2624 { 2625 spdk_msg_fn msg_fn; 2626 int rc; 2627 2628 pthread_mutex_lock(&nvme_ctrlr->mutex); 2629 rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn); 2630 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2631 2632 if (rc == 0) { 2633 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2634 } 2635 2636 return rc; 2637 } 2638 2639 static int 2640 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2641 { 2642 pthread_mutex_lock(&nvme_ctrlr->mutex); 2643 if (nvme_ctrlr->destruct) { 2644 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2645 return -ENXIO; 2646 } 2647 2648 if (nvme_ctrlr->resetting) { 2649 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2650 return -EBUSY; 2651 } 2652 2653 if (!nvme_ctrlr->disabled) { 2654 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2655 return -EALREADY; 2656 } 2657 2658 nvme_ctrlr->disabled = false; 2659 nvme_ctrlr->resetting = true; 2660 2661 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2662 2663 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2664 2665 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2666 return 0; 2667 } 2668 2669 static void 2670 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2671 { 2672 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2673 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2674 enum bdev_nvme_op_after_reset op_after_disable; 2675 2676 assert(nvme_ctrlr->thread == spdk_get_thread()); 2677 2678 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2679 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2680 2681 pthread_mutex_lock(&nvme_ctrlr->mutex); 2682 2683 nvme_ctrlr->resetting = false; 2684 nvme_ctrlr->dont_retry = false; 2685 2686 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2687 2688 nvme_ctrlr->disabled = true; 2689 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2690 2691 /* Make sure we clear any pending resets before returning. 
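* Queued resets are completed below with success == true; a disable that
* finishes cleanly is treated as a successful outcome for any reset that was
* waiting on it.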
*/ 2692 bdev_nvme_complete_pending_resets(nvme_ctrlr, true); 2693 2694 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2695 2696 if (ctrlr_op_cb_fn) { 2697 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2698 } 2699 2700 switch (op_after_disable) { 2701 case OP_COMPLETE_PENDING_DESTRUCT: 2702 nvme_ctrlr_unregister(nvme_ctrlr); 2703 break; 2704 default: 2705 break; 2706 } 2707 } 2708 2709 static void 2710 bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2711 { 2712 assert(status == 0); 2713 2714 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2715 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2716 } else { 2717 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2718 } 2719 } 2720 2721 static void 2722 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2723 { 2724 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2725 bdev_nvme_reset_destroy_qpair, 2726 NULL, 2727 bdev_nvme_disable_destroy_qpairs_done); 2728 } 2729 2730 static void 2731 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2732 { 2733 struct nvme_ctrlr *nvme_ctrlr = ctx; 2734 2735 assert(nvme_ctrlr->resetting == true); 2736 assert(nvme_ctrlr->thread == spdk_get_thread()); 2737 2738 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2739 2740 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2741 } 2742 2743 static void 2744 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2745 { 2746 struct nvme_ctrlr *nvme_ctrlr = ctx; 2747 2748 assert(nvme_ctrlr->resetting == true); 2749 assert(nvme_ctrlr->thread == spdk_get_thread()); 2750 2751 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2752 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2753 } else { 2754 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2755 } 2756 } 2757 2758 static int 2759 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2760 { 2761 spdk_msg_fn msg_fn; 2762 2763 pthread_mutex_lock(&nvme_ctrlr->mutex); 2764 if (nvme_ctrlr->destruct) { 2765 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2766 return -ENXIO; 2767 } 2768 2769 if (nvme_ctrlr->resetting) { 2770 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2771 return -EBUSY; 2772 } 2773 2774 if (nvme_ctrlr->disabled) { 2775 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2776 return -EALREADY; 2777 } 2778 2779 nvme_ctrlr->resetting = true; 2780 nvme_ctrlr->dont_retry = true; 2781 2782 if (nvme_ctrlr->reconnect_is_delayed) { 2783 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2784 nvme_ctrlr->reconnect_is_delayed = false; 2785 } else { 2786 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2787 } 2788 2789 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2790 2791 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2792 2793 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2794 return 0; 2795 } 2796 2797 static int 2798 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2799 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2800 { 2801 int rc; 2802 2803 switch (op) { 2804 case NVME_CTRLR_OP_RESET: 2805 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2806 break; 2807 case NVME_CTRLR_OP_ENABLE: 2808 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2809 break; 2810 case NVME_CTRLR_OP_DISABLE: 2811 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2812 break; 2813 default: 2814 rc = -EINVAL; 2815 break; 2816 } 2817 2818 if (rc == 0) { 2819 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2820 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2821 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2822 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2823 } 2824 return rc; 2825 } 2826 
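/* Context used to run a controller operation requested over RPC and to report
 * the result back on the requesting thread. nvme_ctrlr_op_rpc() drives a single
 * nvme_ctrlr, while nvme_bdev_ctrlr_op_rpc() walks every nvme_ctrlr of an
 * nvme_bdev_ctrlr sequentially and stops at the first hard failure (-EALREADY
 * is treated as success).
 *
 * Illustrative sketch only; rpc_done and request are hypothetical names for an
 * RPC handler's completion callback and its argument, not part of this module:
 *
 *	nvme_bdev_ctrlr_op_rpc(nbdev_ctrlr, NVME_CTRLR_OP_RESET, rpc_done, request);
 *
 * rpc_done(request, rc) is then invoked on the thread that issued this call.
 */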
2827 struct nvme_ctrlr_op_rpc_ctx { 2828 struct nvme_ctrlr *nvme_ctrlr; 2829 struct spdk_thread *orig_thread; 2830 enum nvme_ctrlr_op op; 2831 int rc; 2832 bdev_nvme_ctrlr_op_cb cb_fn; 2833 void *cb_arg; 2834 }; 2835 2836 static void 2837 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2838 { 2839 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2840 2841 assert(ctx != NULL); 2842 assert(ctx->cb_fn != NULL); 2843 2844 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2845 2846 free(ctx); 2847 } 2848 2849 static void 2850 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2851 { 2852 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2853 2854 ctx->rc = rc; 2855 2856 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2857 } 2858 2859 void 2860 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2861 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2862 { 2863 struct nvme_ctrlr_op_rpc_ctx *ctx; 2864 int rc; 2865 2866 assert(cb_fn != NULL); 2867 2868 ctx = calloc(1, sizeof(*ctx)); 2869 if (ctx == NULL) { 2870 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2871 cb_fn(cb_arg, -ENOMEM); 2872 return; 2873 } 2874 2875 ctx->orig_thread = spdk_get_thread(); 2876 ctx->cb_fn = cb_fn; 2877 ctx->cb_arg = cb_arg; 2878 2879 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2880 if (rc == 0) { 2881 return; 2882 } else if (rc == -EALREADY) { 2883 rc = 0; 2884 } 2885 2886 nvme_ctrlr_op_rpc_complete(ctx, rc); 2887 } 2888 2889 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2890 2891 static void 2892 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2893 { 2894 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2895 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2896 int rc; 2897 2898 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2899 ctx->nvme_ctrlr = NULL; 2900 2901 if (ctx->rc != 0) { 2902 goto complete; 2903 } 2904 2905 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2906 if (next_nvme_ctrlr == NULL) { 2907 goto complete; 2908 } 2909 2910 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2911 if (rc == 0) { 2912 ctx->nvme_ctrlr = next_nvme_ctrlr; 2913 return; 2914 } else if (rc == -EALREADY) { 2915 ctx->nvme_ctrlr = next_nvme_ctrlr; 2916 rc = 0; 2917 } 2918 2919 ctx->rc = rc; 2920 2921 complete: 2922 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2923 free(ctx); 2924 } 2925 2926 static void 2927 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2928 { 2929 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2930 2931 ctx->rc = rc; 2932 2933 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2934 } 2935 2936 void 2937 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2938 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2939 { 2940 struct nvme_ctrlr_op_rpc_ctx *ctx; 2941 struct nvme_ctrlr *nvme_ctrlr; 2942 int rc; 2943 2944 assert(cb_fn != NULL); 2945 2946 ctx = calloc(1, sizeof(*ctx)); 2947 if (ctx == NULL) { 2948 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2949 cb_fn(cb_arg, -ENOMEM); 2950 return; 2951 } 2952 2953 ctx->orig_thread = spdk_get_thread(); 2954 ctx->op = op; 2955 ctx->cb_fn = cb_fn; 2956 ctx->cb_arg = cb_arg; 2957 2958 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2959 assert(nvme_ctrlr != NULL); 2960 2961 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2962 if (rc == 0) { 2963 ctx->nvme_ctrlr = nvme_ctrlr; 2964 return; 2965 } else if (rc == -EALREADY) { 2966 ctx->nvme_ctrlr = nvme_ctrlr; 2967 rc = 0; 2968 } 2969 2970 
nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2971 } 2972 2973 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2974 2975 static void 2976 bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 2977 { 2978 struct nvme_bdev_io *bio = ctx; 2979 enum spdk_bdev_io_status io_status; 2980 2981 if (bio->cpl.cdw0 == 0) { 2982 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2983 } else { 2984 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2985 } 2986 2987 NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status); 2988 2989 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2990 } 2991 2992 static void 2993 bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i, 2994 struct nvme_bdev *nbdev, 2995 struct nvme_bdev_channel *nbdev_ch, void *ctx) 2996 { 2997 bdev_nvme_abort_retry_ios(nbdev_ch); 2998 nbdev_ch->resetting = false; 2999 3000 nvme_bdev_for_each_channel_continue(i, 0); 3001 } 3002 3003 static void 3004 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 3005 { 3006 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3007 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3008 3009 /* Abort all queued I/Os for retry. */ 3010 nvme_bdev_for_each_channel(nbdev, 3011 bdev_nvme_unfreeze_bdev_channel, 3012 bio, 3013 bdev_nvme_unfreeze_bdev_channel_done); 3014 } 3015 3016 static void 3017 _bdev_nvme_reset_io_continue(void *ctx) 3018 { 3019 struct nvme_bdev_io *bio = ctx; 3020 struct nvme_io_path *prev_io_path, *next_io_path; 3021 int rc; 3022 3023 prev_io_path = bio->io_path; 3024 bio->io_path = NULL; 3025 3026 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 3027 if (next_io_path == NULL) { 3028 goto complete; 3029 } 3030 3031 rc = _bdev_nvme_reset_io(next_io_path, bio); 3032 if (rc == 0) { 3033 return; 3034 } 3035 3036 complete: 3037 bdev_nvme_reset_io_complete(bio); 3038 } 3039 3040 static void 3041 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 3042 { 3043 struct nvme_bdev_io *bio = cb_arg; 3044 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3045 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3046 3047 NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc); 3048 3049 /* Reset status is initialized as "failed". Set to "success" once we have at least one 3050 * successfully reset nvme_ctrlr. 3051 */ 3052 if (rc == 0) { 3053 bio->cpl.cdw0 = 0; 3054 } 3055 3056 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 3057 } 3058 3059 static int 3060 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 3061 { 3062 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3063 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3064 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 3065 spdk_msg_fn msg_fn; 3066 int rc; 3067 3068 assert(bio->io_path == NULL); 3069 bio->io_path = io_path; 3070 3071 pthread_mutex_lock(&nvme_ctrlr->mutex); 3072 rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn); 3073 if (rc == -EBUSY) { 3074 /* 3075 * Reset call is queued only if it is from the app framework. This is on purpose so that 3076 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 3077 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
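* In that case the bio is linked onto nvme_ctrlr->pending_resets below and is
* completed later from bdev_nvme_complete_pending_resets(), inheriting the
* outcome of the reset that is already in progress.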
3078 */ 3079 TAILQ_INSERT_TAIL(&nvme_ctrlr->pending_resets, bio, retry_link); 3080 } 3081 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3082 3083 if (rc == 0) { 3084 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 3085 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 3086 nvme_ctrlr->ctrlr_op_cb_fn = bdev_nvme_reset_io_continue; 3087 nvme_ctrlr->ctrlr_op_cb_arg = bio; 3088 3089 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 3090 3091 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n", 3092 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3093 } else if (rc == -EBUSY) { 3094 rc = 0; 3095 3096 NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n", 3097 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3098 } else { 3099 NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n", 3100 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc); 3101 } 3102 3103 return rc; 3104 } 3105 3106 static void 3107 bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 3108 { 3109 struct nvme_bdev_io *bio = ctx; 3110 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3111 struct nvme_bdev_channel *nbdev_ch; 3112 struct nvme_io_path *io_path; 3113 int rc; 3114 3115 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 3116 3117 /* Initialize with failed status. With multipath it is enough to have at least one successful 3118 * nvme_ctrlr reset. If there is none, reset status will remain failed. 3119 */ 3120 bio->cpl.cdw0 = 1; 3121 3122 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 3123 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 3124 assert(io_path != NULL); 3125 3126 rc = _bdev_nvme_reset_io(io_path, bio); 3127 if (rc != 0) { 3128 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 3129 rc = (rc == -EALREADY) ? 0 : rc; 3130 3131 bdev_nvme_reset_io_continue(bio, rc); 3132 } 3133 } 3134 3135 static void 3136 bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i, 3137 struct nvme_bdev *nbdev, 3138 struct nvme_bdev_channel *nbdev_ch, void *ctx) 3139 { 3140 nbdev_ch->resetting = true; 3141 3142 nvme_bdev_for_each_channel_continue(i, 0); 3143 } 3144 3145 static void 3146 bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio) 3147 { 3148 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio); 3149 3150 nvme_bdev_for_each_channel(nbdev, 3151 bdev_nvme_freeze_bdev_channel, 3152 bio, 3153 bdev_nvme_freeze_bdev_channel_done); 3154 } 3155 3156 static int 3157 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 3158 { 3159 if (nvme_ctrlr->destruct) { 3160 /* Don't bother resetting if the controller is in the process of being destructed. */ 3161 return -ENXIO; 3162 } 3163 3164 if (nvme_ctrlr->resetting) { 3165 if (!nvme_ctrlr->in_failover) { 3166 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 3167 "Reset is already in progress. Defer failover until reset completes.\n"); 3168 3169 /* Defer failover until reset completes. */ 3170 nvme_ctrlr->pending_failover = true; 3171 return -EINPROGRESS; 3172 } else { 3173 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n"); 3174 return -EBUSY; 3175 } 3176 } 3177 3178 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 3179 3180 if (nvme_ctrlr->reconnect_is_delayed) { 3181 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 3182 3183 /* We rely on the next reconnect for the failover. 
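* If an alternate trid exists, bdev_nvme_failover_trid() above has already made
* it active, so the delayed reconnect will target it once the timer fires.
* -EALREADY tells the caller that nothing else needs to be scheduled.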
*/ 3184 return -EALREADY; 3185 } 3186 3187 if (nvme_ctrlr->disabled) { 3188 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n"); 3189 3190 /* We rely on the enablement for the failover. */ 3191 return -EALREADY; 3192 } 3193 3194 nvme_ctrlr->resetting = true; 3195 nvme_ctrlr->in_failover = true; 3196 3197 assert(nvme_ctrlr->reset_start_tsc == 0); 3198 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 3199 3200 return 0; 3201 } 3202 3203 static int 3204 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 3205 { 3206 int rc; 3207 3208 pthread_mutex_lock(&nvme_ctrlr->mutex); 3209 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 3210 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3211 3212 if (rc == 0) { 3213 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 3214 } else if (rc == -EALREADY) { 3215 rc = 0; 3216 } 3217 3218 return rc; 3219 } 3220 3221 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3222 uint64_t num_blocks); 3223 3224 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3225 uint64_t num_blocks); 3226 3227 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 3228 uint64_t src_offset_blocks, 3229 uint64_t num_blocks); 3230 3231 static void 3232 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3233 bool success) 3234 { 3235 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3236 int ret; 3237 3238 if (!success) { 3239 ret = -EINVAL; 3240 goto exit; 3241 } 3242 3243 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 3244 ret = -ENXIO; 3245 goto exit; 3246 } 3247 3248 ret = bdev_nvme_readv(bio, 3249 bdev_io->u.bdev.iovs, 3250 bdev_io->u.bdev.iovcnt, 3251 bdev_io->u.bdev.md_buf, 3252 bdev_io->u.bdev.num_blocks, 3253 bdev_io->u.bdev.offset_blocks, 3254 bdev_io->u.bdev.dif_check_flags, 3255 bdev_io->u.bdev.memory_domain, 3256 bdev_io->u.bdev.memory_domain_ctx, 3257 bdev_io->u.bdev.accel_sequence); 3258 3259 exit: 3260 if (spdk_unlikely(ret != 0)) { 3261 bdev_nvme_io_complete(bio, ret); 3262 } 3263 } 3264 3265 static inline void 3266 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 3267 { 3268 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3269 struct spdk_bdev *bdev = bdev_io->bdev; 3270 struct nvme_bdev_io *nbdev_io_to_abort; 3271 int rc = 0; 3272 3273 switch (bdev_io->type) { 3274 case SPDK_BDEV_IO_TYPE_READ: 3275 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 3276 3277 rc = bdev_nvme_readv(nbdev_io, 3278 bdev_io->u.bdev.iovs, 3279 bdev_io->u.bdev.iovcnt, 3280 bdev_io->u.bdev.md_buf, 3281 bdev_io->u.bdev.num_blocks, 3282 bdev_io->u.bdev.offset_blocks, 3283 bdev_io->u.bdev.dif_check_flags, 3284 bdev_io->u.bdev.memory_domain, 3285 bdev_io->u.bdev.memory_domain_ctx, 3286 bdev_io->u.bdev.accel_sequence); 3287 } else { 3288 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3289 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3290 rc = 0; 3291 } 3292 break; 3293 case SPDK_BDEV_IO_TYPE_WRITE: 3294 rc = bdev_nvme_writev(nbdev_io, 3295 bdev_io->u.bdev.iovs, 3296 bdev_io->u.bdev.iovcnt, 3297 bdev_io->u.bdev.md_buf, 3298 bdev_io->u.bdev.num_blocks, 3299 bdev_io->u.bdev.offset_blocks, 3300 bdev_io->u.bdev.dif_check_flags, 3301 bdev_io->u.bdev.memory_domain, 3302 bdev_io->u.bdev.memory_domain_ctx, 3303 bdev_io->u.bdev.accel_sequence, 3304 bdev_io->u.bdev.nvme_cdw12, 3305 bdev_io->u.bdev.nvme_cdw13); 3306 break; 3307 case 
SPDK_BDEV_IO_TYPE_COMPARE: 3308 rc = bdev_nvme_comparev(nbdev_io, 3309 bdev_io->u.bdev.iovs, 3310 bdev_io->u.bdev.iovcnt, 3311 bdev_io->u.bdev.md_buf, 3312 bdev_io->u.bdev.num_blocks, 3313 bdev_io->u.bdev.offset_blocks, 3314 bdev_io->u.bdev.dif_check_flags); 3315 break; 3316 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3317 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3318 bdev_io->u.bdev.iovs, 3319 bdev_io->u.bdev.iovcnt, 3320 bdev_io->u.bdev.fused_iovs, 3321 bdev_io->u.bdev.fused_iovcnt, 3322 bdev_io->u.bdev.md_buf, 3323 bdev_io->u.bdev.num_blocks, 3324 bdev_io->u.bdev.offset_blocks, 3325 bdev_io->u.bdev.dif_check_flags); 3326 break; 3327 case SPDK_BDEV_IO_TYPE_UNMAP: 3328 rc = bdev_nvme_unmap(nbdev_io, 3329 bdev_io->u.bdev.offset_blocks, 3330 bdev_io->u.bdev.num_blocks); 3331 break; 3332 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3333 rc = bdev_nvme_write_zeroes(nbdev_io, 3334 bdev_io->u.bdev.offset_blocks, 3335 bdev_io->u.bdev.num_blocks); 3336 break; 3337 case SPDK_BDEV_IO_TYPE_RESET: 3338 nbdev_io->io_path = NULL; 3339 bdev_nvme_reset_io(bdev->ctxt, nbdev_io); 3340 return; 3341 3342 case SPDK_BDEV_IO_TYPE_FLUSH: 3343 bdev_nvme_io_complete(nbdev_io, 0); 3344 return; 3345 3346 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3347 rc = bdev_nvme_zone_appendv(nbdev_io, 3348 bdev_io->u.bdev.iovs, 3349 bdev_io->u.bdev.iovcnt, 3350 bdev_io->u.bdev.md_buf, 3351 bdev_io->u.bdev.num_blocks, 3352 bdev_io->u.bdev.offset_blocks, 3353 bdev_io->u.bdev.dif_check_flags); 3354 break; 3355 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3356 rc = bdev_nvme_get_zone_info(nbdev_io, 3357 bdev_io->u.zone_mgmt.zone_id, 3358 bdev_io->u.zone_mgmt.num_zones, 3359 bdev_io->u.zone_mgmt.buf); 3360 break; 3361 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3362 rc = bdev_nvme_zone_management(nbdev_io, 3363 bdev_io->u.zone_mgmt.zone_id, 3364 bdev_io->u.zone_mgmt.zone_action); 3365 break; 3366 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3367 nbdev_io->io_path = NULL; 3368 bdev_nvme_admin_passthru(nbdev_ch, 3369 nbdev_io, 3370 &bdev_io->u.nvme_passthru.cmd, 3371 bdev_io->u.nvme_passthru.buf, 3372 bdev_io->u.nvme_passthru.nbytes); 3373 return; 3374 3375 case SPDK_BDEV_IO_TYPE_NVME_IO: 3376 rc = bdev_nvme_io_passthru(nbdev_io, 3377 &bdev_io->u.nvme_passthru.cmd, 3378 bdev_io->u.nvme_passthru.buf, 3379 bdev_io->u.nvme_passthru.nbytes); 3380 break; 3381 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3382 rc = bdev_nvme_io_passthru_md(nbdev_io, 3383 &bdev_io->u.nvme_passthru.cmd, 3384 bdev_io->u.nvme_passthru.buf, 3385 bdev_io->u.nvme_passthru.nbytes, 3386 bdev_io->u.nvme_passthru.md_buf, 3387 bdev_io->u.nvme_passthru.md_len); 3388 break; 3389 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3390 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3391 &bdev_io->u.nvme_passthru.cmd, 3392 bdev_io->u.nvme_passthru.iovs, 3393 bdev_io->u.nvme_passthru.iovcnt, 3394 bdev_io->u.nvme_passthru.nbytes, 3395 bdev_io->u.nvme_passthru.md_buf, 3396 bdev_io->u.nvme_passthru.md_len); 3397 break; 3398 case SPDK_BDEV_IO_TYPE_ABORT: 3399 nbdev_io->io_path = NULL; 3400 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3401 bdev_nvme_abort(nbdev_ch, 3402 nbdev_io, 3403 nbdev_io_to_abort); 3404 return; 3405 3406 case SPDK_BDEV_IO_TYPE_COPY: 3407 rc = bdev_nvme_copy(nbdev_io, 3408 bdev_io->u.bdev.offset_blocks, 3409 bdev_io->u.bdev.copy.src_offset_blocks, 3410 bdev_io->u.bdev.num_blocks); 3411 break; 3412 default: 3413 rc = -EINVAL; 3414 break; 3415 } 3416 3417 if (spdk_unlikely(rc != 0)) { 3418 bdev_nvme_io_complete(nbdev_io, rc); 3419 } 3420 } 3421 3422 static void 3423 
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3424 { 3425 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3426 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3427 3428 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3429 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3430 } else { 3431 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3432 * We need to update submit_tsc here. 3433 */ 3434 nbdev_io->submit_tsc = spdk_get_ticks(); 3435 } 3436 3437 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3438 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3439 if (spdk_unlikely(!nbdev_io->io_path)) { 3440 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3441 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3442 return; 3443 } 3444 3445 /* Admin commands do not use the optimal I/O path. 3446 * Simply fall through even if it is not found. 3447 */ 3448 } 3449 3450 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3451 } 3452 3453 static bool 3454 bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi) 3455 { 3456 switch (csi) { 3457 case SPDK_NVME_CSI_NVM: 3458 return true; 3459 case SPDK_NVME_CSI_ZNS: 3460 return true; 3461 default: 3462 return false; 3463 } 3464 } 3465 3466 static bool 3467 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3468 { 3469 struct nvme_bdev *nbdev = ctx; 3470 struct nvme_ns *nvme_ns; 3471 struct spdk_nvme_ns *ns; 3472 struct spdk_nvme_ctrlr *ctrlr; 3473 const struct spdk_nvme_ctrlr_data *cdata; 3474 3475 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3476 assert(nvme_ns != NULL); 3477 ns = nvme_ns->ns; 3478 if (ns == NULL) { 3479 return false; 3480 } 3481 3482 if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) { 3483 switch (io_type) { 3484 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3485 case SPDK_BDEV_IO_TYPE_NVME_IO: 3486 return true; 3487 3488 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3489 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3490 3491 default: 3492 return false; 3493 } 3494 } 3495 3496 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3497 3498 switch (io_type) { 3499 case SPDK_BDEV_IO_TYPE_READ: 3500 case SPDK_BDEV_IO_TYPE_WRITE: 3501 case SPDK_BDEV_IO_TYPE_RESET: 3502 case SPDK_BDEV_IO_TYPE_FLUSH: 3503 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3504 case SPDK_BDEV_IO_TYPE_NVME_IO: 3505 case SPDK_BDEV_IO_TYPE_ABORT: 3506 return true; 3507 3508 case SPDK_BDEV_IO_TYPE_COMPARE: 3509 return spdk_nvme_ns_supports_compare(ns); 3510 3511 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3512 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3513 3514 case SPDK_BDEV_IO_TYPE_UNMAP: 3515 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3516 return cdata->oncs.dsm; 3517 3518 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3519 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3520 return cdata->oncs.write_zeroes; 3521 3522 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3523 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3524 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3525 return true; 3526 } 3527 return false; 3528 3529 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3530 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3531 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3532 3533 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3534 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3535 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3536 3537 case SPDK_BDEV_IO_TYPE_COPY: 3538 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3539 return cdata->oncs.copy; 3540 3541 default: 3542 return false; 3543 } 3544 } 3545 3546 static int 3547 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3548 { 3549 struct nvme_qpair *nvme_qpair; 3550 struct spdk_io_channel *pg_ch; 3551 int rc; 3552 3553 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3554 if (!nvme_qpair) { 3555 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n"); 3556 return -1; 3557 } 3558 3559 TAILQ_INIT(&nvme_qpair->io_path_list); 3560 3561 nvme_qpair->ctrlr = nvme_ctrlr; 3562 nvme_qpair->ctrlr_ch = ctrlr_ch; 3563 3564 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3565 if (!pg_ch) { 3566 free(nvme_qpair); 3567 return -1; 3568 } 3569 3570 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3571 3572 #ifdef SPDK_CONFIG_VTUNE 3573 nvme_qpair->group->collect_spin_stat = true; 3574 #else 3575 nvme_qpair->group->collect_spin_stat = false; 3576 #endif 3577 3578 if (!nvme_ctrlr->disabled) { 3579 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3580 * be created when it's enabled. 3581 */ 3582 rc = bdev_nvme_create_qpair(nvme_qpair); 3583 if (rc != 0) { 3584 /* nvme_ctrlr can't create IO qpair if connection is down. 3585 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3586 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3587 * submitted IO will be queued until IO qpair is successfully created. 3588 * 3589 * Hence, if both are satisfied, ignore the failure. 
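* Ignoring the failure means the nvme_qpair is still added to the poll group
* and attached to the ctrlr_channel below with nvme_qpair->qpair left NULL; a
* later reconnect recreates the actual I/O qpair while bdev-level retries keep
* submitted I/O queued in the meantime.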
3590 */ 3591 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3592 spdk_put_io_channel(pg_ch); 3593 free(nvme_qpair); 3594 return rc; 3595 } 3596 } 3597 } 3598 3599 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3600 3601 ctrlr_ch->qpair = nvme_qpair; 3602 3603 nvme_ctrlr_get_ref(nvme_ctrlr); 3604 3605 return 0; 3606 } 3607 3608 static int 3609 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3610 { 3611 struct nvme_ctrlr *nvme_ctrlr = io_device; 3612 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3613 3614 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3615 } 3616 3617 static void 3618 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3619 { 3620 struct nvme_io_path *io_path, *next; 3621 3622 assert(nvme_qpair->group != NULL); 3623 3624 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3625 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3626 nvme_io_path_free(io_path); 3627 } 3628 3629 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3630 3631 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3632 3633 nvme_ctrlr_put_ref(nvme_qpair->ctrlr); 3634 3635 free(nvme_qpair); 3636 } 3637 3638 static void 3639 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3640 { 3641 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3642 struct nvme_qpair *nvme_qpair; 3643 3644 nvme_qpair = ctrlr_ch->qpair; 3645 assert(nvme_qpair != NULL); 3646 3647 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3648 3649 if (nvme_qpair->qpair != NULL) { 3650 /* Always try to disconnect the qpair, even if a reset is in progress. 3651 * The qpair may have been created after the reset process started. 3652 */ 3653 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3654 if (ctrlr_ch->reset_iter) { 3655 /* Skip current ctrlr_channel in a full reset sequence because 3656 * it is being deleted now. 3657 */ 3658 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3659 } 3660 3661 /* We cannot release a reference to the poll group now. 3662 * The qpair may be disconnected asynchronously later. 3663 * We need to poll it until it is actually disconnected. 3664 * Just detach the qpair from the deleting ctrlr_channel. 
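* Clearing nvme_qpair->ctrlr_ch below marks the qpair as orphaned; the poll
* group keeps polling it and eventually frees it through nvme_qpair_delete()
* once the disconnect is observed.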
3665 */ 3666 nvme_qpair->ctrlr_ch = NULL; 3667 } else { 3668 assert(ctrlr_ch->reset_iter == NULL); 3669 3670 nvme_qpair_delete(nvme_qpair); 3671 } 3672 } 3673 3674 static inline struct spdk_io_channel * 3675 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3676 { 3677 if (spdk_unlikely(!group->accel_channel)) { 3678 group->accel_channel = spdk_accel_get_io_channel(); 3679 if (!group->accel_channel) { 3680 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3681 group); 3682 return NULL; 3683 } 3684 } 3685 3686 return group->accel_channel; 3687 } 3688 3689 static void 3690 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3691 { 3692 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3693 } 3694 3695 static void 3696 bdev_nvme_abort_sequence(void *seq) 3697 { 3698 spdk_accel_sequence_abort(seq); 3699 } 3700 3701 static void 3702 bdev_nvme_reverse_sequence(void *seq) 3703 { 3704 spdk_accel_sequence_reverse(seq); 3705 } 3706 3707 static int 3708 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3709 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3710 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3711 { 3712 struct spdk_io_channel *ch; 3713 struct nvme_poll_group *group = ctx; 3714 3715 ch = bdev_nvme_get_accel_channel(group); 3716 if (spdk_unlikely(ch == NULL)) { 3717 return -ENOMEM; 3718 } 3719 3720 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3721 domain, domain_ctx, seed, cb_fn, cb_arg); 3722 } 3723 3724 static int 3725 bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt, 3726 struct spdk_memory_domain *dst_domain, void *dst_domain_ctx, 3727 struct iovec *src_iovs, uint32_t src_iovcnt, 3728 struct spdk_memory_domain *src_domain, void *src_domain_ctx, 3729 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3730 { 3731 struct spdk_io_channel *ch; 3732 struct nvme_poll_group *group = ctx; 3733 3734 ch = bdev_nvme_get_accel_channel(group); 3735 if (spdk_unlikely(ch == NULL)) { 3736 return -ENOMEM; 3737 } 3738 3739 return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch, 3740 dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx, 3741 src_iovs, src_iovcnt, src_domain, src_domain_ctx, 3742 cb_fn, cb_arg); 3743 } 3744 3745 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3746 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3747 .append_crc32c = bdev_nvme_append_crc32c, 3748 .append_copy = bdev_nvme_append_copy, 3749 .finish_sequence = bdev_nvme_finish_sequence, 3750 .reverse_sequence = bdev_nvme_reverse_sequence, 3751 .abort_sequence = bdev_nvme_abort_sequence, 3752 }; 3753 3754 static void 3755 bdev_nvme_poll_group_interrupt_cb(struct spdk_nvme_poll_group *group, void *ctx) 3756 { 3757 bdev_nvme_poll(ctx); 3758 } 3759 3760 static int 3761 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3762 { 3763 struct nvme_poll_group *group = ctx_buf; 3764 struct spdk_fd_group *fgrp; 3765 uint64_t period; 3766 int rc; 3767 3768 TAILQ_INIT(&group->qpair_list); 3769 3770 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3771 if (group->group == NULL) { 3772 return -1; 3773 } 3774 3775 period = spdk_interrupt_mode_is_enabled() ? 
0 : g_opts.nvme_ioq_poll_period_us; 3776 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, period); 3777 3778 if (group->poller == NULL) { 3779 spdk_nvme_poll_group_destroy(group->group); 3780 return -1; 3781 } 3782 3783 if (spdk_interrupt_mode_is_enabled()) { 3784 spdk_poller_register_interrupt(group->poller, NULL, NULL); 3785 3786 fgrp = spdk_nvme_poll_group_get_fd_group(group->group); 3787 if (fgrp == NULL) { 3788 spdk_nvme_poll_group_destroy(group->group); 3789 return -1; 3790 } 3791 3792 rc = spdk_nvme_poll_group_set_interrupt_callback(group->group, 3793 bdev_nvme_poll_group_interrupt_cb, group); 3794 if (rc != 0) { 3795 spdk_nvme_poll_group_destroy(group->group); 3796 return -1; 3797 } 3798 3799 group->intr = spdk_interrupt_register_fd_group(fgrp, "bdev_nvme_interrupt"); 3800 if (!group->intr) { 3801 spdk_nvme_poll_group_destroy(group->group); 3802 return -1; 3803 } 3804 } 3805 3806 return 0; 3807 } 3808 3809 static void 3810 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3811 { 3812 struct nvme_poll_group *group = ctx_buf; 3813 3814 assert(TAILQ_EMPTY(&group->qpair_list)); 3815 3816 if (group->accel_channel) { 3817 spdk_put_io_channel(group->accel_channel); 3818 } 3819 3820 if (spdk_interrupt_mode_is_enabled()) { 3821 spdk_interrupt_unregister(&group->intr); 3822 } 3823 3824 spdk_poller_unregister(&group->poller); 3825 if (spdk_nvme_poll_group_destroy(group->group)) { 3826 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3827 assert(false); 3828 } 3829 } 3830 3831 static struct spdk_io_channel * 3832 bdev_nvme_get_io_channel(void *ctx) 3833 { 3834 struct nvme_bdev *nbdev = ctx; 3835 3836 return spdk_get_io_channel(nbdev); 3837 } 3838 3839 static void * 3840 bdev_nvme_get_module_ctx(void *ctx) 3841 { 3842 struct nvme_bdev *nbdev = ctx; 3843 struct nvme_ns *nvme_ns; 3844 3845 if (!nbdev || nbdev->disk.module != &nvme_if) { 3846 return NULL; 3847 } 3848 3849 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3850 if (!nvme_ns) { 3851 return NULL; 3852 } 3853 3854 return nvme_ns->ns; 3855 } 3856 3857 static const char * 3858 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3859 { 3860 switch (ana_state) { 3861 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3862 return "optimized"; 3863 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3864 return "non_optimized"; 3865 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3866 return "inaccessible"; 3867 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3868 return "persistent_loss"; 3869 case SPDK_NVME_ANA_CHANGE_STATE: 3870 return "change"; 3871 default: 3872 return NULL; 3873 } 3874 } 3875 3876 static int 3877 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3878 { 3879 struct spdk_memory_domain **_domains = NULL; 3880 struct nvme_bdev *nbdev = ctx; 3881 struct nvme_ns *nvme_ns; 3882 int i = 0, _array_size = array_size; 3883 int rc = 0; 3884 3885 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3886 if (domains && array_size >= i) { 3887 _domains = &domains[i]; 3888 } else { 3889 _domains = NULL; 3890 } 3891 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3892 if (rc > 0) { 3893 i += rc; 3894 if (_array_size >= rc) { 3895 _array_size -= rc; 3896 } else { 3897 _array_size = 0; 3898 } 3899 } else if (rc < 0) { 3900 return rc; 3901 } 3902 } 3903 3904 return i; 3905 } 3906 3907 static const char * 3908 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3909 { 3910 if (nvme_ctrlr->destruct) { 3911 return "deleting"; 3912 } else if 
(spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3913 return "failed"; 3914 } else if (nvme_ctrlr->resetting) { 3915 return "resetting"; 3916 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3917 return "reconnect_is_delayed"; 3918 } else if (nvme_ctrlr->disabled) { 3919 return "disabled"; 3920 } else { 3921 return "enabled"; 3922 } 3923 } 3924 3925 void 3926 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3927 { 3928 struct spdk_nvme_transport_id *trid; 3929 const struct spdk_nvme_ctrlr_opts *opts; 3930 const struct spdk_nvme_ctrlr_data *cdata; 3931 struct nvme_path_id *path_id; 3932 int32_t numa_id; 3933 3934 spdk_json_write_object_begin(w); 3935 3936 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3937 3938 #ifdef SPDK_CONFIG_NVME_CUSE 3939 size_t cuse_name_size = 128; 3940 char cuse_name[cuse_name_size]; 3941 3942 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3943 if (rc == 0) { 3944 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3945 } 3946 #endif 3947 trid = &nvme_ctrlr->active_path_id->trid; 3948 spdk_json_write_named_object_begin(w, "trid"); 3949 nvme_bdev_dump_trid_json(trid, w); 3950 spdk_json_write_object_end(w); 3951 3952 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3953 if (path_id != NULL) { 3954 spdk_json_write_named_array_begin(w, "alternate_trids"); 3955 do { 3956 trid = &path_id->trid; 3957 spdk_json_write_object_begin(w); 3958 nvme_bdev_dump_trid_json(trid, w); 3959 spdk_json_write_object_end(w); 3960 3961 path_id = TAILQ_NEXT(path_id, link); 3962 } while (path_id != NULL); 3963 spdk_json_write_array_end(w); 3964 } 3965 3966 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3967 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3968 3969 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3970 spdk_json_write_named_object_begin(w, "host"); 3971 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3972 spdk_json_write_named_string(w, "addr", opts->src_addr); 3973 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3974 spdk_json_write_object_end(w); 3975 3976 numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr); 3977 if (numa_id != SPDK_ENV_NUMA_ID_ANY) { 3978 spdk_json_write_named_uint32(w, "numa_id", numa_id); 3979 } 3980 spdk_json_write_object_end(w); 3981 } 3982 3983 static void 3984 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3985 struct nvme_ns *nvme_ns) 3986 { 3987 struct spdk_nvme_ns *ns; 3988 struct spdk_nvme_ctrlr *ctrlr; 3989 const struct spdk_nvme_ctrlr_data *cdata; 3990 const struct spdk_nvme_transport_id *trid; 3991 union spdk_nvme_vs_register vs; 3992 const struct spdk_nvme_ns_data *nsdata; 3993 char buf[128]; 3994 3995 ns = nvme_ns->ns; 3996 if (ns == NULL) { 3997 return; 3998 } 3999 4000 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 4001 4002 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4003 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 4004 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 4005 4006 spdk_json_write_object_begin(w); 4007 4008 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4009 spdk_json_write_named_string(w, "pci_address", trid->traddr); 4010 } 4011 4012 spdk_json_write_named_object_begin(w, "trid"); 4013 4014 nvme_bdev_dump_trid_json(trid, w); 4015 4016 spdk_json_write_object_end(w); 4017 4018 #ifdef SPDK_CONFIG_NVME_CUSE 4019 size_t cuse_name_size = 128; 4020 char cuse_name[cuse_name_size]; 4021 4022 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 4023 cuse_name, 
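/* Minimal sketch of the spdk_json_write_*() nesting used by
 * nvme_ctrlr_info_json() above. The "host" object and its values are made-up
 * examples; the point is only that every *_begin() call is paired with an
 * end call, producing e.g.
 *   {"state":"enabled","host":{"nqn":"nqn.2014-08.org.example:host"}}
 */
#if 0
static void
example_write_info(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "state", "enabled");

	spdk_json_write_named_object_begin(w, "host");
	spdk_json_write_named_string(w, "nqn", "nqn.2014-08.org.example:host");
	spdk_json_write_object_end(w);		/* closes "host" */

	spdk_json_write_object_end(w);		/* closes the outer object */
}
#endif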
&cuse_name_size); 4024 if (rc == 0) { 4025 spdk_json_write_named_string(w, "cuse_device", cuse_name); 4026 } 4027 #endif 4028 4029 spdk_json_write_named_object_begin(w, "ctrlr_data"); 4030 4031 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 4032 4033 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 4034 4035 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 4036 spdk_str_trim(buf); 4037 spdk_json_write_named_string(w, "model_number", buf); 4038 4039 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 4040 spdk_str_trim(buf); 4041 spdk_json_write_named_string(w, "serial_number", buf); 4042 4043 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 4044 spdk_str_trim(buf); 4045 spdk_json_write_named_string(w, "firmware_revision", buf); 4046 4047 if (cdata->subnqn[0] != '\0') { 4048 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 4049 } 4050 4051 spdk_json_write_named_object_begin(w, "oacs"); 4052 4053 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 4054 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 4055 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 4056 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 4057 4058 spdk_json_write_object_end(w); 4059 4060 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 4061 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 4062 4063 spdk_json_write_object_end(w); 4064 4065 spdk_json_write_named_object_begin(w, "vs"); 4066 4067 spdk_json_write_name(w, "nvme_version"); 4068 if (vs.bits.ter) { 4069 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 4070 } else { 4071 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 4072 } 4073 4074 spdk_json_write_object_end(w); 4075 4076 nsdata = spdk_nvme_ns_get_data(ns); 4077 4078 spdk_json_write_named_object_begin(w, "ns_data"); 4079 4080 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 4081 4082 if (cdata->cmic.ana_reporting) { 4083 spdk_json_write_named_string(w, "ana_state", 4084 _nvme_ana_state_str(nvme_ns->ana_state)); 4085 } 4086 4087 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 4088 4089 spdk_json_write_object_end(w); 4090 4091 if (cdata->oacs.security) { 4092 spdk_json_write_named_object_begin(w, "security"); 4093 4094 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 4095 4096 spdk_json_write_object_end(w); 4097 } 4098 4099 spdk_json_write_object_end(w); 4100 } 4101 4102 static const char * 4103 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 4104 { 4105 switch (nbdev->mp_policy) { 4106 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 4107 return "active_passive"; 4108 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 4109 return "active_active"; 4110 default: 4111 assert(false); 4112 return "invalid"; 4113 } 4114 } 4115 4116 static const char * 4117 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 4118 { 4119 switch (nbdev->mp_selector) { 4120 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 4121 return "round_robin"; 4122 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 4123 return "queue_depth"; 4124 default: 4125 assert(false); 4126 return "invalid"; 4127 } 4128 } 4129 4130 static int 4131 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 4132 { 4133 struct nvme_bdev *nbdev = ctx; 4134 struct nvme_ns *nvme_ns; 4135 4136 pthread_mutex_lock(&nbdev->mutex); 4137 spdk_json_write_named_array_begin(w, "nvme"); 4138 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, 
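/* Sketch of the version-register formatting used in nvme_namespace_info_json()
 * above: a controller reporting VS 1.3.0 (mjr=1, mnr=3, ter=0) is written as
 * "1.3", while 1.4.1 would be written as "1.4.1". example_print_version() is
 * illustrative only.
 */
#if 0
static void
example_print_version(struct spdk_nvme_ctrlr *ctrlr)
{
	union spdk_nvme_vs_register vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);

	if (vs.bits.ter) {
		printf("%u.%u.%u\n", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
	} else {
		printf("%u.%u\n", vs.bits.mjr, vs.bits.mnr);
	}
}
#endif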
tailq) { 4139 nvme_namespace_info_json(w, nvme_ns); 4140 } 4141 spdk_json_write_array_end(w); 4142 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nbdev)); 4143 if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 4144 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nbdev)); 4145 if (nbdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4146 spdk_json_write_named_uint32(w, "rr_min_io", nbdev->rr_min_io); 4147 } 4148 } 4149 pthread_mutex_unlock(&nbdev->mutex); 4150 4151 return 0; 4152 } 4153 4154 static void 4155 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 4156 { 4157 /* No config per bdev needed */ 4158 } 4159 4160 static uint64_t 4161 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 4162 { 4163 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 4164 struct nvme_io_path *io_path; 4165 struct nvme_poll_group *group; 4166 uint64_t spin_time = 0; 4167 4168 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4169 group = io_path->qpair->group; 4170 4171 if (!group || !group->collect_spin_stat) { 4172 continue; 4173 } 4174 4175 if (group->end_ticks != 0) { 4176 group->spin_ticks += (group->end_ticks - group->start_ticks); 4177 group->end_ticks = 0; 4178 } 4179 4180 spin_time += group->spin_ticks; 4181 group->start_ticks = 0; 4182 group->spin_ticks = 0; 4183 } 4184 4185 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 4186 } 4187 4188 static void 4189 bdev_nvme_reset_device_stat(void *ctx) 4190 { 4191 struct nvme_bdev *nbdev = ctx; 4192 4193 if (nbdev->err_stat != NULL) { 4194 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 4195 } 4196 } 4197 4198 /* JSON string should be lowercases and underscore delimited string. 
*/ 4199 static void 4200 bdev_nvme_format_nvme_status(char *dst, const char *src) 4201 { 4202 char tmp[256]; 4203 4204 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 4205 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 4206 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 4207 spdk_strlwr(dst); 4208 } 4209 4210 static void 4211 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 4212 { 4213 struct nvme_bdev *nbdev = ctx; 4214 struct spdk_nvme_status status = {}; 4215 uint16_t sct, sc; 4216 char status_json[256]; 4217 const char *status_str; 4218 4219 if (nbdev->err_stat == NULL) { 4220 return; 4221 } 4222 4223 spdk_json_write_named_object_begin(w, "nvme_error"); 4224 4225 spdk_json_write_named_object_begin(w, "status_type"); 4226 for (sct = 0; sct < 8; sct++) { 4227 if (nbdev->err_stat->status_type[sct] == 0) { 4228 continue; 4229 } 4230 status.sct = sct; 4231 4232 status_str = spdk_nvme_cpl_get_status_type_string(&status); 4233 assert(status_str != NULL); 4234 bdev_nvme_format_nvme_status(status_json, status_str); 4235 4236 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 4237 } 4238 spdk_json_write_object_end(w); 4239 4240 spdk_json_write_named_object_begin(w, "status_code"); 4241 for (sct = 0; sct < 4; sct++) { 4242 status.sct = sct; 4243 for (sc = 0; sc < 256; sc++) { 4244 if (nbdev->err_stat->status[sct][sc] == 0) { 4245 continue; 4246 } 4247 status.sc = sc; 4248 4249 status_str = spdk_nvme_cpl_get_status_string(&status); 4250 assert(status_str != NULL); 4251 bdev_nvme_format_nvme_status(status_json, status_str); 4252 4253 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 4254 } 4255 } 4256 spdk_json_write_object_end(w); 4257 4258 spdk_json_write_object_end(w); 4259 } 4260 4261 static bool 4262 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 4263 { 4264 struct nvme_bdev *nbdev = ctx; 4265 struct nvme_ns *nvme_ns; 4266 struct spdk_nvme_ctrlr *ctrlr; 4267 4268 if (!g_opts.allow_accel_sequence) { 4269 return false; 4270 } 4271 4272 switch (type) { 4273 case SPDK_BDEV_IO_TYPE_WRITE: 4274 case SPDK_BDEV_IO_TYPE_READ: 4275 break; 4276 default: 4277 return false; 4278 } 4279 4280 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 4281 assert(nvme_ns != NULL); 4282 4283 ctrlr = nvme_ns->ctrlr->ctrlr; 4284 assert(ctrlr != NULL); 4285 4286 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 4287 } 4288 4289 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 4290 .destruct = bdev_nvme_destruct, 4291 .submit_request = bdev_nvme_submit_request, 4292 .io_type_supported = bdev_nvme_io_type_supported, 4293 .get_io_channel = bdev_nvme_get_io_channel, 4294 .dump_info_json = bdev_nvme_dump_info_json, 4295 .write_config_json = bdev_nvme_write_config_json, 4296 .get_spin_time = bdev_nvme_get_spin_time, 4297 .get_module_ctx = bdev_nvme_get_module_ctx, 4298 .get_memory_domains = bdev_nvme_get_memory_domains, 4299 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 4300 .reset_device_stat = bdev_nvme_reset_device_stat, 4301 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 4302 }; 4303 4304 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 4305 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 4306 4307 static int 4308 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4309 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 4310 { 4311 struct spdk_nvme_ana_group_descriptor *copied_desc; 4312 uint8_t *orig_desc; 4313 uint32_t 
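/* Usage sketch for bdev_nvme_format_nvme_status() defined above. The status
 * strings returned by spdk_nvme_cpl_get_status_string() are upper-case,
 * space/dash separated phrases; after the replacements and spdk_strlwr() they
 * become JSON-friendly keys. The input literal below is an assumed example,
 * not taken from the driver.
 */
#if 0
static void
example_format_status_key(void)
{
	char key[256];

	bdev_nvme_format_nvme_status(key, "INVALID OPCODE");
	assert(strcmp(key, "invalid_opcode") == 0);
}
#endif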
i, desc_size, copy_len; 4314 int rc = 0; 4315 4316 if (nvme_ctrlr->ana_log_page == NULL) { 4317 return -EINVAL; 4318 } 4319 4320 copied_desc = nvme_ctrlr->copied_ana_desc; 4321 4322 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 4323 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 4324 4325 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 4326 memcpy(copied_desc, orig_desc, copy_len); 4327 4328 rc = cb_fn(copied_desc, cb_arg); 4329 if (rc != 0) { 4330 break; 4331 } 4332 4333 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 4334 copied_desc->num_of_nsid * sizeof(uint32_t); 4335 orig_desc += desc_size; 4336 copy_len -= desc_size; 4337 } 4338 4339 return rc; 4340 } 4341 4342 static int 4343 nvme_ns_ana_transition_timedout(void *ctx) 4344 { 4345 struct nvme_ns *nvme_ns = ctx; 4346 4347 spdk_poller_unregister(&nvme_ns->anatt_timer); 4348 nvme_ns->ana_transition_timedout = true; 4349 4350 return SPDK_POLLER_BUSY; 4351 } 4352 4353 static void 4354 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4355 const struct spdk_nvme_ana_group_descriptor *desc) 4356 { 4357 const struct spdk_nvme_ctrlr_data *cdata; 4358 4359 nvme_ns->ana_group_id = desc->ana_group_id; 4360 nvme_ns->ana_state = desc->ana_state; 4361 nvme_ns->ana_state_updating = false; 4362 4363 switch (nvme_ns->ana_state) { 4364 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4365 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4366 nvme_ns->ana_transition_timedout = false; 4367 spdk_poller_unregister(&nvme_ns->anatt_timer); 4368 break; 4369 4370 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4371 case SPDK_NVME_ANA_CHANGE_STATE: 4372 if (nvme_ns->anatt_timer != NULL) { 4373 break; 4374 } 4375 4376 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4377 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4378 nvme_ns, 4379 cdata->anatt * SPDK_SEC_TO_USEC); 4380 break; 4381 default: 4382 break; 4383 } 4384 } 4385 4386 static int 4387 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4388 { 4389 struct nvme_ns *nvme_ns = cb_arg; 4390 uint32_t i; 4391 4392 assert(nvme_ns->ns != NULL); 4393 4394 for (i = 0; i < desc->num_of_nsid; i++) { 4395 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4396 continue; 4397 } 4398 4399 _nvme_ns_set_ana_state(nvme_ns, desc); 4400 return 1; 4401 } 4402 4403 return 0; 4404 } 4405 4406 static int 4407 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4408 { 4409 int rc = 0; 4410 struct spdk_uuid new_uuid, namespace_uuid; 4411 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4412 /* This namespace UUID was generated using uuid_generate() method. 
*/ 4413 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4414 int size; 4415 4416 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4417 4418 spdk_uuid_set_null(&new_uuid); 4419 spdk_uuid_set_null(&namespace_uuid); 4420 4421 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4422 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4423 return -EINVAL; 4424 } 4425 4426 spdk_uuid_parse(&namespace_uuid, namespace_str); 4427 4428 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4429 if (rc == 0) { 4430 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4431 } 4432 4433 return rc; 4434 } 4435 4436 static int 4437 nbdev_create(struct spdk_bdev *disk, const char *base_name, 4438 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4439 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx) 4440 { 4441 const struct spdk_uuid *uuid; 4442 const uint8_t *nguid; 4443 const struct spdk_nvme_ctrlr_data *cdata; 4444 const struct spdk_nvme_ns_data *nsdata; 4445 const struct spdk_nvme_ctrlr_opts *opts; 4446 enum spdk_nvme_csi csi; 4447 uint32_t atomic_bs, phys_bs, bs; 4448 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4449 int rc; 4450 4451 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4452 csi = spdk_nvme_ns_get_csi(ns); 4453 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4454 4455 switch (csi) { 4456 case SPDK_NVME_CSI_NVM: 4457 disk->product_name = "NVMe disk"; 4458 break; 4459 case SPDK_NVME_CSI_ZNS: 4460 disk->product_name = "NVMe ZNS disk"; 4461 disk->zoned = true; 4462 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4463 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4464 spdk_nvme_ns_get_extended_sector_size(ns); 4465 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4466 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4467 break; 4468 default: 4469 if (bdev_opts->allow_unrecognized_csi) { 4470 disk->product_name = "NVMe Passthrough disk"; 4471 break; 4472 } 4473 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4474 return -ENOTSUP; 4475 } 4476 4477 nguid = spdk_nvme_ns_get_nguid(ns); 4478 if (!nguid) { 4479 uuid = spdk_nvme_ns_get_uuid(ns); 4480 if (uuid) { 4481 disk->uuid = *uuid; 4482 } else if (g_opts.generate_uuids) { 4483 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4484 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4485 if (rc < 0) { 4486 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4487 return rc; 4488 } 4489 } 4490 } else { 4491 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4492 } 4493 4494 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4495 if (!disk->name) { 4496 return -ENOMEM; 4497 } 4498 4499 disk->write_cache = 0; 4500 if (cdata->vwc.present) { 4501 /* Enable if the Volatile Write Cache exists */ 4502 disk->write_cache = 1; 4503 } 4504 if (cdata->oncs.write_zeroes) { 4505 disk->max_write_zeroes = UINT16_MAX + 1; 4506 } 4507 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4508 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4509 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4510 disk->ctratt.raw = cdata->ctratt.raw; 4511 disk->nsid = spdk_nvme_ns_get_id(ns); 4512 /* NVMe driver will split one request into multiple requests 4513 * based on MDTS and stripe boundary, the bdev layer will use 4514 * max_segment_size and max_num_segments to split one big IO 4515 * into multiple requests, then small request 
can't run out 4516 * of NVMe internal requests data structure. 4517 */ 4518 if (opts && opts->io_queue_requests) { 4519 disk->max_num_segments = opts->io_queue_requests / 2; 4520 } 4521 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4522 /* The nvme driver will try to split I/O that have too many 4523 * SGEs, but it doesn't work if that last SGE doesn't end on 4524 * an aggregate total that is block aligned. The bdev layer has 4525 * a more robust splitting framework, so use that instead for 4526 * this case. (See issue #3269.) 4527 */ 4528 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4529 4530 if (disk->max_num_segments == 0) { 4531 disk->max_num_segments = max_sges; 4532 } else { 4533 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4534 } 4535 } 4536 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4537 4538 nsdata = spdk_nvme_ns_get_data(ns); 4539 bs = spdk_nvme_ns_get_sector_size(ns); 4540 atomic_bs = bs; 4541 phys_bs = bs; 4542 if (nsdata->nabo == 0) { 4543 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4544 atomic_bs = bs * (1 + nsdata->nawupf); 4545 } else { 4546 atomic_bs = bs * (1 + cdata->awupf); 4547 } 4548 } 4549 if (nsdata->nsfeat.optperf) { 4550 phys_bs = bs * (1 + nsdata->npwg); 4551 } 4552 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4553 4554 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4555 if (disk->md_len != 0) { 4556 disk->md_interleave = nsdata->flbas.extended; 4557 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4558 if (disk->dif_type != SPDK_DIF_DISABLE) { 4559 disk->dif_is_head_of_md = nsdata->dps.md_start; 4560 disk->dif_check_flags = bdev_opts->prchk_flags; 4561 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4562 } 4563 } 4564 4565 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4566 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4567 disk->acwu = 0; 4568 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4569 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4570 } else { 4571 disk->acwu = cdata->acwu + 1; /* 0-based */ 4572 } 4573 4574 if (cdata->oncs.copy) { 4575 /* For now bdev interface allows only single segment copy */ 4576 disk->max_copy = nsdata->mssrl; 4577 } 4578 4579 disk->ctxt = ctx; 4580 disk->fn_table = &nvmelib_fn_table; 4581 disk->module = &nvme_if; 4582 4583 disk->numa.id_valid = 1; 4584 disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 4585 4586 return 0; 4587 } 4588 4589 static struct nvme_bdev * 4590 nvme_bdev_alloc(void) 4591 { 4592 struct nvme_bdev *nbdev; 4593 int rc; 4594 4595 nbdev = calloc(1, sizeof(*nbdev)); 4596 if (!nbdev) { 4597 SPDK_ERRLOG("nbdev calloc() failed\n"); 4598 return NULL; 4599 } 4600 4601 if (g_opts.nvme_error_stat) { 4602 nbdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4603 if (!nbdev->err_stat) { 4604 SPDK_ERRLOG("err_stat calloc() failed\n"); 4605 free(nbdev); 4606 return NULL; 4607 } 4608 } 4609 4610 rc = pthread_mutex_init(&nbdev->mutex, NULL); 4611 if (rc != 0) { 4612 free(nbdev->err_stat); 4613 free(nbdev); 4614 return NULL; 4615 } 4616 4617 nbdev->ref = 1; 4618 nbdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4619 nbdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4620 nbdev->rr_min_io = UINT32_MAX; 4621 TAILQ_INIT(&nbdev->nvme_ns_list); 4622 4623 return nbdev; 4624 } 4625 4626 static int 4627 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4628 { 4629 struct nvme_bdev *nbdev; 4630 struct nvme_bdev_ctrlr *nbdev_ctrlr = 
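/* Worked example of the block-size derivation in nbdev_create() above, using
 * assumed identify values rather than any particular device (with NABO = 0 and
 * NSFEAT.ns_atomic_write_unit set, so the per-namespace fields apply):
 *   bs (LBA size)                      = 512
 *   nsdata->nawupf (0-based)           = 7  -> atomic_bs = 512 * (1 + 7) = 4096
 *   nsdata->npwg   (0-based, optperf)  = 7  -> phys_bs   = 512 * (1 + 7) = 4096
 *   disk->phys_blocklen = spdk_min(phys_bs, atomic_bs) = 4096
 * NAWUPF and NPWG are 0-based fields, which is why 1 is added before scaling.
 */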
nvme_ctrlr->nbdev_ctrlr; 4631 int rc; 4632 4633 nbdev = nvme_bdev_alloc(); 4634 if (nbdev == NULL) { 4635 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4636 return -ENOMEM; 4637 } 4638 4639 nbdev->opal = nvme_ctrlr->opal_dev != NULL; 4640 4641 rc = nbdev_create(&nbdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4642 nvme_ns->ns, &nvme_ctrlr->opts, nbdev); 4643 if (rc != 0) { 4644 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4645 nvme_bdev_free(nbdev); 4646 return rc; 4647 } 4648 4649 spdk_io_device_register(nbdev, 4650 bdev_nvme_create_bdev_channel_cb, 4651 bdev_nvme_destroy_bdev_channel_cb, 4652 sizeof(struct nvme_bdev_channel), 4653 nbdev->disk.name); 4654 4655 nvme_ns->bdev = nbdev; 4656 nbdev->nsid = nvme_ns->id; 4657 TAILQ_INSERT_TAIL(&nbdev->nvme_ns_list, nvme_ns, tailq); 4658 4659 pthread_mutex_lock(&g_bdev_nvme_mutex); 4660 4661 nbdev->nbdev_ctrlr = nbdev_ctrlr; 4662 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, nbdev, tailq); 4663 4664 rc = spdk_bdev_register(&nbdev->disk); 4665 if (rc != 0) { 4666 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4667 spdk_io_device_unregister(nbdev, NULL); 4668 nvme_ns->bdev = NULL; 4669 4670 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, nbdev, tailq); 4671 4672 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4673 4674 nvme_bdev_free(nbdev); 4675 return rc; 4676 } 4677 4678 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4679 4680 return 0; 4681 } 4682 4683 static bool 4684 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4685 { 4686 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4687 const struct spdk_uuid *uuid1, *uuid2; 4688 4689 nsdata1 = spdk_nvme_ns_get_data(ns1); 4690 nsdata2 = spdk_nvme_ns_get_data(ns2); 4691 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4692 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4693 4694 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4695 nsdata1->eui64 == nsdata2->eui64 && 4696 ((uuid1 == NULL && uuid2 == NULL) || 4697 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4698 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4699 } 4700 4701 static bool 4702 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4703 struct spdk_nvme_ctrlr_opts *opts) 4704 { 4705 struct nvme_probe_skip_entry *entry; 4706 4707 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4708 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4709 return false; 4710 } 4711 } 4712 4713 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4714 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4715 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4716 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4717 opts->disable_read_ana_log_page = true; 4718 4719 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4720 4721 return true; 4722 } 4723 4724 static void 4725 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4726 { 4727 struct nvme_ctrlr *nvme_ctrlr = ctx; 4728 4729 if (spdk_nvme_cpl_is_error(cpl)) { 4730 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. 
sc is %u, sct is %u.\n", 4731 cpl->status.sc, cpl->status.sct); 4732 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4733 } else if (cpl->cdw0 & 0x1) { 4734 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n"); 4735 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4736 } 4737 } 4738 4739 static void 4740 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4741 struct spdk_nvme_qpair *qpair, uint16_t cid) 4742 { 4743 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4744 union spdk_nvme_csts_register csts; 4745 int rc; 4746 4747 assert(nvme_ctrlr->ctrlr == ctrlr); 4748 4749 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", 4750 ctrlr, qpair, cid); 4751 4752 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4753 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4754 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4755 * completion recursively. 4756 */ 4757 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4758 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4759 if (csts.bits.cfs) { 4760 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n"); 4761 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4762 return; 4763 } 4764 } 4765 4766 switch (g_opts.action_on_timeout) { 4767 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4768 if (qpair) { 4769 /* Don't send abort to ctrlr when ctrlr is not available. */ 4770 pthread_mutex_lock(&nvme_ctrlr->mutex); 4771 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4772 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4773 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n"); 4774 return; 4775 } 4776 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4777 4778 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4779 nvme_abort_cpl, nvme_ctrlr); 4780 if (rc == 0) { 4781 return; 4782 } 4783 4784 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. 
Resetting, rc is %d.\n", rc); 4785 } 4786 4787 /* FALLTHROUGH */ 4788 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4789 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4790 break; 4791 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4792 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n"); 4793 break; 4794 default: 4795 NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n"); 4796 break; 4797 } 4798 } 4799 4800 static struct nvme_ns * 4801 nvme_ns_alloc(void) 4802 { 4803 struct nvme_ns *nvme_ns; 4804 4805 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4806 if (nvme_ns == NULL) { 4807 return NULL; 4808 } 4809 4810 if (g_opts.io_path_stat) { 4811 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4812 if (nvme_ns->stat == NULL) { 4813 free(nvme_ns); 4814 return NULL; 4815 } 4816 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4817 } 4818 4819 return nvme_ns; 4820 } 4821 4822 static void 4823 nvme_ns_free(struct nvme_ns *nvme_ns) 4824 { 4825 free(nvme_ns->stat); 4826 free(nvme_ns); 4827 } 4828 4829 static void 4830 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4831 { 4832 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4833 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4834 4835 if (rc == 0) { 4836 nvme_ns->probe_ctx = NULL; 4837 nvme_ctrlr_get_ref(nvme_ctrlr); 4838 } else { 4839 pthread_mutex_lock(&nvme_ctrlr->mutex); 4840 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4841 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4842 4843 nvme_ns_free(nvme_ns); 4844 } 4845 4846 if (ctx) { 4847 ctx->populates_in_progress--; 4848 if (ctx->populates_in_progress == 0) { 4849 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4850 } 4851 } 4852 } 4853 4854 static void 4855 bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i, 4856 struct nvme_bdev *nbdev, 4857 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4858 { 4859 struct nvme_ns *nvme_ns = ctx; 4860 int rc; 4861 4862 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4863 if (rc != 0) { 4864 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4865 } 4866 4867 nvme_bdev_for_each_channel_continue(i, rc); 4868 } 4869 4870 static void 4871 bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i, 4872 struct nvme_bdev *nbdev, 4873 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4874 { 4875 struct nvme_ns *nvme_ns = ctx; 4876 struct nvme_io_path *io_path; 4877 4878 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4879 if (io_path != NULL) { 4880 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4881 } 4882 4883 nvme_bdev_for_each_channel_continue(i, 0); 4884 } 4885 4886 static void 4887 bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status) 4888 { 4889 struct nvme_ns *nvme_ns = ctx; 4890 4891 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4892 } 4893 4894 static void 4895 bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4896 { 4897 struct nvme_ns *nvme_ns = ctx; 4898 4899 if (status == 0) { 4900 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4901 } else { 4902 /* Delete the added io_paths and fail populating the namespace. 
*/ 4903 nvme_bdev_for_each_channel(nbdev, 4904 bdev_nvme_delete_io_path, 4905 nvme_ns, 4906 bdev_nvme_add_io_path_failed); 4907 } 4908 } 4909 4910 static int 4911 nvme_bdev_add_ns(struct nvme_bdev *nbdev, struct nvme_ns *nvme_ns) 4912 { 4913 struct nvme_ns *tmp_ns; 4914 const struct spdk_nvme_ns_data *nsdata; 4915 4916 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4917 if (!nsdata->nmic.can_share) { 4918 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4919 return -EINVAL; 4920 } 4921 4922 pthread_mutex_lock(&nbdev->mutex); 4923 4924 tmp_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 4925 assert(tmp_ns != NULL); 4926 4927 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4928 pthread_mutex_unlock(&nbdev->mutex); 4929 SPDK_ERRLOG("Namespaces are not identical.\n"); 4930 return -EINVAL; 4931 } 4932 4933 nbdev->ref++; 4934 TAILQ_INSERT_TAIL(&nbdev->nvme_ns_list, nvme_ns, tailq); 4935 nvme_ns->bdev = nbdev; 4936 4937 pthread_mutex_unlock(&nbdev->mutex); 4938 4939 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 4940 nvme_bdev_for_each_channel(nbdev, 4941 bdev_nvme_add_io_path, 4942 nvme_ns, 4943 bdev_nvme_add_io_path_done); 4944 4945 return 0; 4946 } 4947 4948 static void 4949 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4950 { 4951 struct spdk_nvme_ns *ns; 4952 struct nvme_bdev *bdev; 4953 int rc = 0; 4954 4955 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4956 if (!ns) { 4957 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id); 4958 rc = -EINVAL; 4959 goto done; 4960 } 4961 4962 nvme_ns->ns = ns; 4963 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4964 4965 if (nvme_ctrlr->ana_log_page != NULL) { 4966 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4967 } 4968 4969 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4970 if (bdev == NULL) { 4971 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4972 } else { 4973 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4974 if (rc == 0) { 4975 return; 4976 } 4977 } 4978 done: 4979 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4980 } 4981 4982 static void 4983 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4984 { 4985 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4986 4987 assert(nvme_ctrlr != NULL); 4988 4989 pthread_mutex_lock(&nvme_ctrlr->mutex); 4990 4991 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4992 4993 if (nvme_ns->bdev != NULL) { 4994 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4995 return; 4996 } 4997 4998 nvme_ns_free(nvme_ns); 4999 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5000 5001 nvme_ctrlr_put_ref(nvme_ctrlr); 5002 } 5003 5004 static void 5005 bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 5006 { 5007 struct nvme_ns *nvme_ns = ctx; 5008 5009 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 5010 } 5011 5012 static void 5013 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 5014 { 5015 struct nvme_bdev *nbdev; 5016 5017 if (nvme_ns->depopulating) { 5018 /* Maybe we received 2 AENs in a row */ 5019 return; 5020 } 5021 nvme_ns->depopulating = true; 5022 5023 spdk_poller_unregister(&nvme_ns->anatt_timer); 5024 5025 nbdev = nvme_ns->bdev; 5026 if (nbdev != NULL) { 5027 pthread_mutex_lock(&nbdev->mutex); 5028 5029 assert(nbdev->ref > 0); 5030 nbdev->ref--; 5031 if (nbdev->ref == 0) { 5032 pthread_mutex_unlock(&nbdev->mutex); 5033 5034 spdk_bdev_unregister(&nbdev->disk, NULL, NULL); 5035 } else { 5036 /* 
spdk_bdev_unregister() is not called until the last nvme_ns is 5037 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 5038 * and clear nvme_ns->bdev here. 5039 */ 5040 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5041 5042 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 5043 nvme_ns->bdev = NULL; 5044 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 5045 5046 pthread_mutex_unlock(&nbdev->mutex); 5047 5048 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 5049 * we call depopulate_namespace_done() to avoid use-after-free. 5050 */ 5051 nvme_bdev_for_each_channel(nbdev, 5052 bdev_nvme_delete_io_path, 5053 nvme_ns, 5054 bdev_nvme_delete_io_path_done); 5055 return; 5056 } 5057 } 5058 5059 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 5060 } 5061 5062 static void 5063 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 5064 struct nvme_async_probe_ctx *ctx) 5065 { 5066 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5067 struct nvme_ns *nvme_ns, *next; 5068 struct spdk_nvme_ns *ns; 5069 struct nvme_bdev *nbdev; 5070 uint32_t nsid; 5071 int rc; 5072 uint64_t num_sectors; 5073 5074 if (ctx) { 5075 /* Initialize this count to 1 to handle the populate functions 5076 * calling nvme_ctrlr_populate_namespace_done() immediately. 5077 */ 5078 ctx->populates_in_progress = 1; 5079 } 5080 5081 /* First loop over our existing namespaces and see if they have been 5082 * removed. */ 5083 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5084 while (nvme_ns != NULL) { 5085 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5086 5087 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 5088 /* NS is still there or added again. Its attributes may have changed. */ 5089 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 5090 if (nvme_ns->ns != ns) { 5091 assert(nvme_ns->ns == NULL); 5092 nvme_ns->ns = ns; 5093 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id); 5094 } 5095 5096 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 5097 nbdev = nvme_ns->bdev; 5098 assert(nbdev != NULL); 5099 if (nbdev->disk.blockcnt != num_sectors) { 5100 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 5101 "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 5102 nvme_ns->id, 5103 nbdev->disk.name, 5104 nbdev->disk.blockcnt, 5105 num_sectors); 5106 rc = spdk_bdev_notify_blockcnt_change(&nbdev->disk, num_sectors); 5107 if (rc != 0) { 5108 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5109 "Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 5110 nbdev->disk.name, rc); 5111 } 5112 } 5113 } else { 5114 /* Namespace was removed */ 5115 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5116 } 5117 5118 nvme_ns = next; 5119 } 5120 5121 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 5122 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5123 while (nsid != 0) { 5124 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5125 5126 if (nvme_ns == NULL) { 5127 /* Found a new one */ 5128 nvme_ns = nvme_ns_alloc(); 5129 if (nvme_ns == NULL) { 5130 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n"); 5131 /* This just fails to attach the namespace. It may work on a future attempt. 
*/ 5132 continue; 5133 } 5134 5135 nvme_ns->id = nsid; 5136 nvme_ns->ctrlr = nvme_ctrlr; 5137 5138 nvme_ns->bdev = NULL; 5139 5140 if (ctx) { 5141 ctx->populates_in_progress++; 5142 } 5143 nvme_ns->probe_ctx = ctx; 5144 5145 pthread_mutex_lock(&nvme_ctrlr->mutex); 5146 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 5147 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5148 5149 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 5150 } 5151 5152 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 5153 } 5154 5155 if (ctx) { 5156 /* Decrement this count now that the loop is over to account 5157 * for the one we started with. If the count is then 0, we 5158 * know any populate_namespace functions completed immediately, 5159 * so we'll kick the callback here. 5160 */ 5161 ctx->populates_in_progress--; 5162 if (ctx->populates_in_progress == 0) { 5163 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 5164 } 5165 } 5166 5167 } 5168 5169 static void 5170 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 5171 { 5172 struct nvme_ns *nvme_ns, *tmp; 5173 5174 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 5175 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5176 } 5177 } 5178 5179 static uint32_t 5180 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 5181 { 5182 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5183 const struct spdk_nvme_ctrlr_data *cdata; 5184 uint32_t nsid, ns_count = 0; 5185 5186 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5187 5188 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5189 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 5190 ns_count++; 5191 } 5192 5193 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5194 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 5195 sizeof(uint32_t); 5196 } 5197 5198 static int 5199 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 5200 void *cb_arg) 5201 { 5202 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 5203 struct nvme_ns *nvme_ns; 5204 uint32_t i, nsid; 5205 5206 for (i = 0; i < desc->num_of_nsid; i++) { 5207 nsid = desc->nsid[i]; 5208 if (nsid == 0) { 5209 continue; 5210 } 5211 5212 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5213 5214 if (nvme_ns == NULL) { 5215 /* Target told us that an inactive namespace had an ANA change */ 5216 continue; 5217 } 5218 5219 _nvme_ns_set_ana_state(nvme_ns, desc); 5220 } 5221 5222 return 0; 5223 } 5224 5225 static void 5226 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5227 { 5228 struct nvme_ns *nvme_ns; 5229 5230 spdk_free(nvme_ctrlr->ana_log_page); 5231 nvme_ctrlr->ana_log_page = NULL; 5232 5233 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5234 nvme_ns != NULL; 5235 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 5236 nvme_ns->ana_state_updating = false; 5237 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 5238 } 5239 } 5240 5241 static void 5242 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 5243 { 5244 struct nvme_ctrlr *nvme_ctrlr = ctx; 5245 5246 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 5247 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 5248 nvme_ctrlr); 5249 } else { 5250 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 5251 } 5252 5253 pthread_mutex_lock(&nvme_ctrlr->mutex); 5254 5255 assert(nvme_ctrlr->ana_log_page_updating == true); 5256 nvme_ctrlr->ana_log_page_updating = false; 5257 5258 if 
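/* Worked example of nvme_ctrlr_get_ana_log_page_size() above, with assumed
 * identify values: NANAGRPID = 4 (ANA groups the controller may report) and
 * 8 active namespaces counted by the loop. The returned size is
 *   sizeof(struct spdk_nvme_ana_page)
 *     + 4 * sizeof(struct spdk_nvme_ana_group_descriptor)
 *     + 8 * sizeof(uint32_t)
 * i.e. one log-page header, one fixed-size descriptor header per possible ANA
 * group, and one NSID entry per active namespace; the controller fills only
 * the descriptors it actually has.
 */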
(nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 5259 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5260 5261 nvme_ctrlr_unregister(nvme_ctrlr); 5262 } else { 5263 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5264 5265 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 5266 } 5267 } 5268 5269 static int 5270 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5271 { 5272 uint32_t ana_log_page_size; 5273 int rc; 5274 5275 if (nvme_ctrlr->ana_log_page == NULL) { 5276 return -EINVAL; 5277 } 5278 5279 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5280 5281 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5282 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5283 "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5284 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5285 return -EINVAL; 5286 } 5287 5288 pthread_mutex_lock(&nvme_ctrlr->mutex); 5289 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 5290 nvme_ctrlr->ana_log_page_updating) { 5291 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5292 return -EBUSY; 5293 } 5294 5295 nvme_ctrlr->ana_log_page_updating = true; 5296 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5297 5298 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 5299 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5300 SPDK_NVME_GLOBAL_NS_TAG, 5301 nvme_ctrlr->ana_log_page, 5302 ana_log_page_size, 0, 5303 nvme_ctrlr_read_ana_log_page_done, 5304 nvme_ctrlr); 5305 if (rc != 0) { 5306 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 5307 } 5308 5309 return rc; 5310 } 5311 5312 static void 5313 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5314 { 5315 } 5316 5317 struct bdev_nvme_set_preferred_path_ctx { 5318 struct spdk_bdev_desc *desc; 5319 struct nvme_ns *nvme_ns; 5320 bdev_nvme_set_preferred_path_cb cb_fn; 5321 void *cb_arg; 5322 }; 5323 5324 static void 5325 bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5326 { 5327 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5328 5329 assert(ctx != NULL); 5330 assert(ctx->desc != NULL); 5331 assert(ctx->cb_fn != NULL); 5332 5333 spdk_bdev_close(ctx->desc); 5334 5335 ctx->cb_fn(ctx->cb_arg, status); 5336 5337 free(ctx); 5338 } 5339 5340 static void 5341 _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i, 5342 struct nvme_bdev *nbdev, 5343 struct nvme_bdev_channel *nbdev_ch, void *_ctx) 5344 { 5345 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5346 struct nvme_io_path *io_path, *prev; 5347 5348 prev = NULL; 5349 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5350 if (io_path->nvme_ns == ctx->nvme_ns) { 5351 break; 5352 } 5353 prev = io_path; 5354 } 5355 5356 if (io_path != NULL) { 5357 if (prev != NULL) { 5358 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 5359 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 5360 } 5361 5362 /* We can set io_path to nbdev_ch->current_io_path directly here. 5363 * However, it needs to be conditional. To simplify the code, 5364 * just clear nbdev_ch->current_io_path and let find_io_path() 5365 * fill it. 5366 * 5367 * Automatic failback may be disabled. Hence even if the io_path is 5368 * already at the head, clear nbdev_ch->current_io_path. 
5369 */ 5370 bdev_nvme_clear_current_io_path(nbdev_ch); 5371 } 5372 5373 nvme_bdev_for_each_channel_continue(i, 0); 5374 } 5375 5376 static struct nvme_ns * 5377 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5378 { 5379 struct nvme_ns *nvme_ns, *prev; 5380 const struct spdk_nvme_ctrlr_data *cdata; 5381 5382 prev = NULL; 5383 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5384 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5385 5386 if (cdata->cntlid == cntlid) { 5387 break; 5388 } 5389 prev = nvme_ns; 5390 } 5391 5392 if (nvme_ns != NULL && prev != NULL) { 5393 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5394 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5395 } 5396 5397 return nvme_ns; 5398 } 5399 5400 /* This function supports only multipath mode. There is only a single I/O path 5401 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5402 * head of the I/O path list for each NVMe bdev channel. 5403 * 5404 * NVMe bdev channel may be acquired after completing this function. move the 5405 * matched namespace to the head of the namespace list for the NVMe bdev too. 5406 */ 5407 void 5408 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5409 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5410 { 5411 struct bdev_nvme_set_preferred_path_ctx *ctx; 5412 struct spdk_bdev *bdev; 5413 struct nvme_bdev *nbdev; 5414 int rc = 0; 5415 5416 assert(cb_fn != NULL); 5417 5418 ctx = calloc(1, sizeof(*ctx)); 5419 if (ctx == NULL) { 5420 SPDK_ERRLOG("Failed to alloc context.\n"); 5421 rc = -ENOMEM; 5422 goto err_alloc; 5423 } 5424 5425 ctx->cb_fn = cb_fn; 5426 ctx->cb_arg = cb_arg; 5427 5428 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5429 if (rc != 0) { 5430 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5431 goto err_open; 5432 } 5433 5434 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5435 5436 if (bdev->module != &nvme_if) { 5437 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5438 rc = -ENODEV; 5439 goto err_bdev; 5440 } 5441 5442 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5443 5444 pthread_mutex_lock(&nbdev->mutex); 5445 5446 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5447 if (ctx->nvme_ns == NULL) { 5448 pthread_mutex_unlock(&nbdev->mutex); 5449 5450 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5451 rc = -ENODEV; 5452 goto err_bdev; 5453 } 5454 5455 pthread_mutex_unlock(&nbdev->mutex); 5456 5457 nvme_bdev_for_each_channel(nbdev, 5458 _bdev_nvme_set_preferred_path, 5459 ctx, 5460 bdev_nvme_set_preferred_path_done); 5461 return; 5462 5463 err_bdev: 5464 spdk_bdev_close(ctx->desc); 5465 err_open: 5466 free(ctx); 5467 err_alloc: 5468 cb_fn(cb_arg, rc); 5469 } 5470 5471 struct bdev_nvme_set_multipath_policy_ctx { 5472 struct spdk_bdev_desc *desc; 5473 spdk_bdev_nvme_set_multipath_policy_cb cb_fn; 5474 void *cb_arg; 5475 }; 5476 5477 static void 5478 bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5479 { 5480 struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx; 5481 5482 assert(ctx != NULL); 5483 assert(ctx->desc != NULL); 5484 assert(ctx->cb_fn != NULL); 5485 5486 spdk_bdev_close(ctx->desc); 5487 5488 ctx->cb_fn(ctx->cb_arg, status); 5489 5490 free(ctx); 5491 } 5492 5493 static void 5494 _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i, 5495 struct nvme_bdev *nbdev, 5496 struct nvme_bdev_channel *nbdev_ch, void *ctx) 5497 { 5498 
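/* Hypothetical caller of bdev_nvme_set_preferred_path() defined above, e.g.
 * from an RPC handler. The bdev name "Nvme0n1", the cntlid value and
 * example_preferred_done() are assumptions for illustration.
 */
#if 0
static void
example_preferred_done(void *cb_arg, int rc)
{
	if (rc != 0) {
		SPDK_ERRLOG("Setting preferred path failed: %d\n", rc);
	}
}

static void
example_prefer_ctrlr(void)
{
	/* Prefer the I/O paths that go through the controller with cntlid 1. */
	bdev_nvme_set_preferred_path("Nvme0n1", 1, example_preferred_done, NULL);
}
#endif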
nbdev_ch->mp_policy = nbdev->mp_policy; 5499 nbdev_ch->mp_selector = nbdev->mp_selector; 5500 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5501 bdev_nvme_clear_current_io_path(nbdev_ch); 5502 5503 nvme_bdev_for_each_channel_continue(i, 0); 5504 } 5505 5506 void 5507 spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy, 5508 enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5509 spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5510 { 5511 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5512 struct spdk_bdev *bdev; 5513 struct nvme_bdev *nbdev; 5514 int rc; 5515 5516 assert(cb_fn != NULL); 5517 5518 switch (policy) { 5519 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5520 break; 5521 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5522 switch (selector) { 5523 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5524 if (rr_min_io == UINT32_MAX) { 5525 rr_min_io = 1; 5526 } else if (rr_min_io == 0) { 5527 rc = -EINVAL; 5528 goto exit; 5529 } 5530 break; 5531 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5532 break; 5533 default: 5534 rc = -EINVAL; 5535 goto exit; 5536 } 5537 break; 5538 default: 5539 rc = -EINVAL; 5540 goto exit; 5541 } 5542 5543 ctx = calloc(1, sizeof(*ctx)); 5544 if (ctx == NULL) { 5545 SPDK_ERRLOG("Failed to alloc context.\n"); 5546 rc = -ENOMEM; 5547 goto exit; 5548 } 5549 5550 ctx->cb_fn = cb_fn; 5551 ctx->cb_arg = cb_arg; 5552 5553 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5554 if (rc != 0) { 5555 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5556 rc = -ENODEV; 5557 goto err_open; 5558 } 5559 5560 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5561 if (bdev->module != &nvme_if) { 5562 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5563 rc = -ENODEV; 5564 goto err_module; 5565 } 5566 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5567 5568 pthread_mutex_lock(&nbdev->mutex); 5569 nbdev->mp_policy = policy; 5570 nbdev->mp_selector = selector; 5571 nbdev->rr_min_io = rr_min_io; 5572 pthread_mutex_unlock(&nbdev->mutex); 5573 5574 nvme_bdev_for_each_channel(nbdev, 5575 _bdev_nvme_set_multipath_policy, 5576 ctx, 5577 bdev_nvme_set_multipath_policy_done); 5578 return; 5579 5580 err_module: 5581 spdk_bdev_close(ctx->desc); 5582 err_open: 5583 free(ctx); 5584 exit: 5585 cb_fn(cb_arg, rc); 5586 } 5587 5588 static void 5589 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5590 { 5591 struct nvme_ctrlr *nvme_ctrlr = arg; 5592 union spdk_nvme_async_event_completion event; 5593 5594 if (spdk_nvme_cpl_is_error(cpl)) { 5595 SPDK_WARNLOG("AER request execute failed\n"); 5596 return; 5597 } 5598 5599 event.raw = cpl->cdw0; 5600 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5601 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5602 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5603 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5604 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5605 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5606 } 5607 } 5608 5609 static void 5610 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5611 { 5612 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5613 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5614 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5615 free(ctx->base_name); 5616 free(ctx); 5617 } 5618 5619 static void 5620 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5621 { 5622 if (ctx->cb_fn) { 5623 
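/* Hypothetical caller of spdk_bdev_nvme_set_multipath_policy() defined above.
 * The bdev name "Nvme0n1", the rr_min_io value of 8 and example_policy_done()
 * are assumptions; the enum values and the callback shape come from the code
 * above.
 */
#if 0
static void
example_policy_done(void *cb_arg, int rc)
{
	if (rc != 0) {
		SPDK_ERRLOG("Setting multipath policy failed: %d\n", rc);
	}
}

static void
example_enable_active_active(void)
{
	/* Round-robin across paths, switching after at least 8 I/Os per path. */
	spdk_bdev_nvme_set_multipath_policy("Nvme0n1",
					    BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE,
					    BDEV_NVME_MP_SELECTOR_ROUND_ROBIN,
					    8, example_policy_done, NULL);
}
#endif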
ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5624 } 5625 5626 ctx->namespaces_populated = true; 5627 if (ctx->probe_done) { 5628 /* The probe was already completed, so we need to free the context 5629 * here. This can happen for cases like OCSSD, where we need to 5630 * send additional commands to the SSD after attach. 5631 */ 5632 free_nvme_async_probe_ctx(ctx); 5633 } 5634 } 5635 5636 static int 5637 bdev_nvme_remove_poller(void *ctx) 5638 { 5639 struct spdk_nvme_transport_id trid_pcie; 5640 5641 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5642 spdk_poller_unregister(&g_hotplug_poller); 5643 return SPDK_POLLER_IDLE; 5644 } 5645 5646 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5647 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5648 5649 if (spdk_nvme_scan_attached(&trid_pcie)) { 5650 SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n"); 5651 } 5652 5653 return SPDK_POLLER_BUSY; 5654 } 5655 5656 static void 5657 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5658 struct nvme_async_probe_ctx *ctx) 5659 { 5660 struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid; 5661 5662 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 5663 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n", 5664 trid->traddr, trid->trsvcid); 5665 } else { 5666 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n"); 5667 } 5668 5669 spdk_io_device_register(nvme_ctrlr, 5670 bdev_nvme_create_ctrlr_channel_cb, 5671 bdev_nvme_destroy_ctrlr_channel_cb, 5672 sizeof(struct nvme_ctrlr_channel), 5673 nvme_ctrlr->nbdev_ctrlr->name); 5674 5675 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5676 5677 if (g_hotplug_poller == NULL) { 5678 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5679 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5680 } 5681 } 5682 5683 static void 5684 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5685 { 5686 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5687 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5688 5689 nvme_ctrlr->probe_ctx = NULL; 5690 5691 if (spdk_nvme_cpl_is_error(cpl)) { 5692 nvme_ctrlr_delete(nvme_ctrlr); 5693 5694 if (ctx != NULL) { 5695 ctx->reported_bdevs = 0; 5696 populate_namespaces_cb(ctx, -1); 5697 } 5698 return; 5699 } 5700 5701 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5702 } 5703 5704 static int 5705 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5706 struct nvme_async_probe_ctx *ctx) 5707 { 5708 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5709 const struct spdk_nvme_ctrlr_data *cdata; 5710 uint32_t ana_log_page_size; 5711 5712 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5713 5714 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5715 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5716 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5717 sizeof(uint32_t); 5718 5719 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5720 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 5721 if (nvme_ctrlr->ana_log_page == NULL) { 5722 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n"); 5723 return -ENXIO; 5724 } 5725 5726 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5727 * Hence copy each descriptor to a temporary area when parsing it. 5728 * 5729 * Allocate a buffer whose size is as large as ANA log page buffer because 5730 * we do not know the size of a descriptor until actually reading it. 
5731 */ 5732 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5733 if (nvme_ctrlr->copied_ana_desc == NULL) { 5734 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n"); 5735 return -ENOMEM; 5736 } 5737 5738 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5739 5740 nvme_ctrlr->probe_ctx = ctx; 5741 5742 /* Then, set the read size only to include the current active namespaces. */ 5743 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5744 5745 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5746 NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5747 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5748 return -EINVAL; 5749 } 5750 5751 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5752 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5753 SPDK_NVME_GLOBAL_NS_TAG, 5754 nvme_ctrlr->ana_log_page, 5755 ana_log_page_size, 0, 5756 nvme_ctrlr_init_ana_log_page_done, 5757 nvme_ctrlr); 5758 } 5759 5760 /* hostnqn and subnqn were already verified before attaching a controller. 5761 * Hence check only the multipath capability and cntlid here. 5762 */ 5763 static bool 5764 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5765 { 5766 struct nvme_ctrlr *tmp; 5767 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5768 5769 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5770 5771 if (!cdata->cmic.multi_ctrlr) { 5772 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5773 return false; 5774 } 5775 5776 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5777 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5778 5779 if (!tmp_cdata->cmic.multi_ctrlr) { 5780 NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid); 5781 return false; 5782 } 5783 if (cdata->cntlid == tmp_cdata->cntlid) { 5784 NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5785 return false; 5786 } 5787 } 5788 5789 return true; 5790 } 5791 5792 5793 static int 5794 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5795 { 5796 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5797 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5798 struct nvme_ctrlr *nctrlr; 5799 int rc = 0; 5800 5801 pthread_mutex_lock(&g_bdev_nvme_mutex); 5802 5803 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5804 if (nbdev_ctrlr != NULL) { 5805 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5806 rc = -EINVAL; 5807 goto exit; 5808 } 5809 TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5810 if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) { 5811 /* All controllers with the same name must be configured the same 5812 * way, either for multipath or failover. If the configuration doesn't 5813 * match - report error. 
5814 */ 5815 rc = -EINVAL; 5816 goto exit; 5817 } 5818 } 5819 } else { 5820 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5821 if (nbdev_ctrlr == NULL) { 5822 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n"); 5823 rc = -ENOMEM; 5824 goto exit; 5825 } 5826 nbdev_ctrlr->name = strdup(name); 5827 if (nbdev_ctrlr->name == NULL) { 5828 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n"); 5829 free(nbdev_ctrlr); 5830 goto exit; 5831 } 5832 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5833 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5834 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5835 } 5836 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5837 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5838 exit: 5839 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5840 return rc; 5841 } 5842 5843 static int 5844 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5845 const char *name, 5846 const struct spdk_nvme_transport_id *trid, 5847 struct nvme_async_probe_ctx *ctx) 5848 { 5849 struct nvme_ctrlr *nvme_ctrlr; 5850 struct nvme_path_id *path_id; 5851 const struct spdk_nvme_ctrlr_data *cdata; 5852 struct spdk_event_handler_opts opts = { 5853 .opts_size = SPDK_SIZEOF(&opts, fd_type), 5854 }; 5855 uint64_t period; 5856 int fd, rc; 5857 5858 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5859 if (nvme_ctrlr == NULL) { 5860 SPDK_ERRLOG("Failed to allocate device struct\n"); 5861 return -ENOMEM; 5862 } 5863 5864 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5865 if (rc != 0) { 5866 free(nvme_ctrlr); 5867 return rc; 5868 } 5869 5870 TAILQ_INIT(&nvme_ctrlr->trids); 5871 TAILQ_INIT(&nvme_ctrlr->pending_resets); 5872 RB_INIT(&nvme_ctrlr->namespaces); 5873 5874 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5875 if (ctx != NULL) { 5876 if (ctx->drv_opts.tls_psk != NULL) { 5877 nvme_ctrlr->psk = spdk_keyring_get_key( 5878 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5879 if (nvme_ctrlr->psk == NULL) { 5880 /* Could only happen if the key was removed in the meantime */ 5881 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5882 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5883 rc = -ENOKEY; 5884 goto err; 5885 } 5886 } 5887 5888 if (ctx->drv_opts.dhchap_key != NULL) { 5889 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5890 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5891 if (nvme_ctrlr->dhchap_key == NULL) { 5892 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5893 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5894 rc = -ENOKEY; 5895 goto err; 5896 } 5897 } 5898 5899 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5900 nvme_ctrlr->dhchap_ctrlr_key = 5901 spdk_keyring_get_key( 5902 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5903 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5904 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5905 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5906 rc = -ENOKEY; 5907 goto err; 5908 } 5909 } 5910 } 5911 5912 /* Check if we manage to enable interrupts on the controller. 
*/ 5913 if (spdk_interrupt_mode_is_enabled() && ctx != NULL && !ctx->drv_opts.enable_interrupts) { 5914 SPDK_ERRLOG("Failed to enable interrupts on the controller\n"); 5915 rc = -ENOTSUP; 5916 goto err; 5917 } 5918 5919 path_id = calloc(1, sizeof(*path_id)); 5920 if (path_id == NULL) { 5921 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5922 rc = -ENOMEM; 5923 goto err; 5924 } 5925 5926 path_id->trid = *trid; 5927 if (ctx != NULL) { 5928 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5929 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5930 } 5931 nvme_ctrlr->active_path_id = path_id; 5932 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5933 5934 nvme_ctrlr->thread = spdk_get_thread(); 5935 nvme_ctrlr->ctrlr = ctrlr; 5936 nvme_ctrlr->ref = 1; 5937 5938 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5939 SPDK_ERRLOG("OCSSDs are not supported"); 5940 rc = -ENOTSUP; 5941 goto err; 5942 } 5943 5944 if (ctx != NULL) { 5945 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5946 } else { 5947 spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5948 } 5949 5950 period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_adminq_poll_period_us; 5951 5952 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5953 period); 5954 5955 if (spdk_interrupt_mode_is_enabled()) { 5956 spdk_poller_register_interrupt(nvme_ctrlr->adminq_timer_poller, NULL, NULL); 5957 5958 fd = spdk_nvme_ctrlr_get_admin_qp_fd(nvme_ctrlr->ctrlr, &opts); 5959 if (fd < 0) { 5960 rc = fd; 5961 goto err; 5962 } 5963 5964 nvme_ctrlr->intr = SPDK_INTERRUPT_REGISTER_EXT(fd, bdev_nvme_poll_adminq, 5965 nvme_ctrlr, &opts); 5966 if (!nvme_ctrlr->intr) { 5967 rc = -EINVAL; 5968 goto err; 5969 } 5970 } 5971 5972 if (g_opts.timeout_us > 0) { 5973 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5974 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5975 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5976 g_opts.timeout_us : g_opts.timeout_admin_us; 5977 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5978 adm_timeout_us, timeout_cb, nvme_ctrlr); 5979 } 5980 5981 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5982 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5983 5984 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5985 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5986 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5987 } 5988 5989 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5990 if (rc != 0) { 5991 goto err; 5992 } 5993 5994 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5995 5996 if (cdata->cmic.ana_reporting) { 5997 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5998 if (rc == 0) { 5999 return 0; 6000 } 6001 } else { 6002 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 6003 return 0; 6004 } 6005 6006 err: 6007 nvme_ctrlr_delete(nvme_ctrlr); 6008 return rc; 6009 } 6010 6011 void 6012 spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts) 6013 { 6014 opts->prchk_flags = 0; 6015 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 6016 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 6017 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 6018 opts->multipath = true; 6019 } 6020 6021 static void 6022 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6023 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 6024 { 6025 char *name; 6026 6027 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 6028 if (!name) { 6029 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 6030 return; 6031 } 6032 6033 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 6034 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 6035 } else { 6036 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 6037 } 6038 6039 free(name); 6040 } 6041 6042 static void 6043 _nvme_ctrlr_destruct(void *ctx) 6044 { 6045 struct nvme_ctrlr *nvme_ctrlr = ctx; 6046 6047 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 6048 nvme_ctrlr_put_ref(nvme_ctrlr); 6049 } 6050 6051 static int 6052 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 6053 { 6054 struct nvme_probe_skip_entry *entry; 6055 6056 /* The controller's destruction was already started */ 6057 if (nvme_ctrlr->destruct) { 6058 return -EALREADY; 6059 } 6060 6061 if (!hotplug && 6062 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 6063 entry = calloc(1, sizeof(*entry)); 6064 if (!entry) { 6065 return -ENOMEM; 6066 } 6067 entry->trid = nvme_ctrlr->active_path_id->trid; 6068 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 6069 } 6070 6071 nvme_ctrlr->destruct = true; 6072 return 0; 6073 } 6074 6075 static int 6076 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 6077 { 6078 int rc; 6079 6080 pthread_mutex_lock(&nvme_ctrlr->mutex); 6081 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 6082 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6083 6084 if (rc == 0) { 6085 _nvme_ctrlr_destruct(nvme_ctrlr); 6086 } else if (rc == -EALREADY) { 6087 rc = 0; 6088 } 6089 6090 return rc; 6091 } 6092 6093 static void 6094 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 6095 { 6096 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 6097 6098 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 6099 } 6100 6101 static int 6102 bdev_nvme_hotplug_probe(void *arg) 6103 { 6104 if (g_hotplug_probe_ctx == NULL) { 6105 
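		/* No asynchronous probe is outstanding, so there is nothing left for this
		 * poller to do; unregister it and go idle.
		 */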
spdk_poller_unregister(&g_hotplug_probe_poller); 6106 return SPDK_POLLER_IDLE; 6107 } 6108 6109 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 6110 g_hotplug_probe_ctx = NULL; 6111 spdk_poller_unregister(&g_hotplug_probe_poller); 6112 } 6113 6114 return SPDK_POLLER_BUSY; 6115 } 6116 6117 static int 6118 bdev_nvme_hotplug(void *arg) 6119 { 6120 struct spdk_nvme_transport_id trid_pcie; 6121 6122 if (g_hotplug_probe_ctx) { 6123 return SPDK_POLLER_BUSY; 6124 } 6125 6126 memset(&trid_pcie, 0, sizeof(trid_pcie)); 6127 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 6128 6129 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 6130 hotplug_probe_cb, attach_cb, NULL); 6131 6132 if (g_hotplug_probe_ctx) { 6133 assert(g_hotplug_probe_poller == NULL); 6134 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 6135 } 6136 6137 return SPDK_POLLER_BUSY; 6138 } 6139 6140 void 6141 spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts, size_t opts_size) 6142 { 6143 if (!opts) { 6144 SPDK_ERRLOG("opts should not be NULL\n"); 6145 return; 6146 } 6147 6148 if (!opts_size) { 6149 SPDK_ERRLOG("opts_size should not be zero value\n"); 6150 return; 6151 } 6152 6153 opts->opts_size = opts_size; 6154 6155 #define SET_FIELD(field, defval) \ 6156 opts->field = SPDK_GET_FIELD(&g_opts, field, defval, opts_size); \ 6157 6158 SET_FIELD(action_on_timeout, 0); 6159 SET_FIELD(keep_alive_timeout_ms, 0); 6160 SET_FIELD(timeout_us, 0); 6161 SET_FIELD(timeout_admin_us, 0); 6162 SET_FIELD(transport_retry_count, 0); 6163 SET_FIELD(arbitration_burst, 0); 6164 SET_FIELD(low_priority_weight, 0); 6165 SET_FIELD(medium_priority_weight, 0); 6166 SET_FIELD(high_priority_weight, 0); 6167 SET_FIELD(io_queue_requests, 0); 6168 SET_FIELD(nvme_adminq_poll_period_us, 0); 6169 SET_FIELD(nvme_ioq_poll_period_us, 0); 6170 SET_FIELD(delay_cmd_submit, 0); 6171 SET_FIELD(bdev_retry_count, 0); 6172 SET_FIELD(ctrlr_loss_timeout_sec, 0); 6173 SET_FIELD(reconnect_delay_sec, 0); 6174 SET_FIELD(fast_io_fail_timeout_sec, 0); 6175 SET_FIELD(transport_ack_timeout, 0); 6176 SET_FIELD(disable_auto_failback, false); 6177 SET_FIELD(generate_uuids, false); 6178 SET_FIELD(transport_tos, 0); 6179 SET_FIELD(nvme_error_stat, false); 6180 SET_FIELD(io_path_stat, false); 6181 SET_FIELD(allow_accel_sequence, false); 6182 SET_FIELD(rdma_srq_size, 0); 6183 SET_FIELD(rdma_max_cq_size, 0); 6184 SET_FIELD(rdma_cm_event_timeout_ms, 0); 6185 SET_FIELD(dhchap_digests, 0); 6186 SET_FIELD(dhchap_dhgroups, 0); 6187 SET_FIELD(rdma_umr_per_io, false); 6188 6189 #undef SET_FIELD 6190 6191 /* Do not remove this statement, you should always update this statement when you adding a new field, 6192 * and do not forget to add the SET_FIELD statement for your added field. 
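	 * The SPDK_STATIC_ASSERT below pins sizeof(struct spdk_bdev_nvme_opts), so a
	 * field added without updating it is caught at compile time.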
*/ 6193 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 128, "Incorrect size"); 6194 } 6195 6196 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6197 uint32_t reconnect_delay_sec, 6198 uint32_t fast_io_fail_timeout_sec); 6199 6200 static int 6201 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 6202 { 6203 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 6204 /* Can't set timeout_admin_us without also setting timeout_us */ 6205 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 6206 return -EINVAL; 6207 } 6208 6209 if (opts->bdev_retry_count < -1) { 6210 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 6211 return -EINVAL; 6212 } 6213 6214 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 6215 opts->reconnect_delay_sec, 6216 opts->fast_io_fail_timeout_sec)) { 6217 return -EINVAL; 6218 } 6219 6220 return 0; 6221 } 6222 6223 int 6224 spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 6225 { 6226 struct spdk_nvme_transport_opts drv_opts; 6227 int ret; 6228 6229 if (!opts) { 6230 SPDK_ERRLOG("opts cannot be NULL\n"); 6231 return -1; 6232 } 6233 6234 if (!opts->opts_size) { 6235 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 6236 return -1; 6237 } 6238 6239 ret = bdev_nvme_validate_opts(opts); 6240 if (ret) { 6241 SPDK_WARNLOG("Failed to set nvme opts.\n"); 6242 return ret; 6243 } 6244 6245 if (g_bdev_nvme_init_thread != NULL) { 6246 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6247 return -EPERM; 6248 } 6249 } 6250 6251 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 6252 if (opts->rdma_srq_size != 0) { 6253 drv_opts.rdma_srq_size = opts->rdma_srq_size; 6254 } 6255 if (opts->rdma_max_cq_size != 0) { 6256 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 6257 } 6258 if (opts->rdma_cm_event_timeout_ms != 0) { 6259 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 6260 } 6261 if (drv_opts.rdma_umr_per_io != opts->rdma_umr_per_io) { 6262 drv_opts.rdma_umr_per_io = opts->rdma_umr_per_io; 6263 } 6264 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 6265 if (ret) { 6266 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 6267 return ret; 6268 } 6269 6270 #define SET_FIELD(field, defval) \ 6271 g_opts.field = SPDK_GET_FIELD(opts, field, defval, opts->opts_size); \ 6272 6273 SET_FIELD(action_on_timeout, 0); 6274 SET_FIELD(keep_alive_timeout_ms, 0); 6275 SET_FIELD(timeout_us, 0); 6276 SET_FIELD(timeout_admin_us, 0); 6277 SET_FIELD(transport_retry_count, 0); 6278 SET_FIELD(arbitration_burst, 0); 6279 SET_FIELD(low_priority_weight, 0); 6280 SET_FIELD(medium_priority_weight, 0); 6281 SET_FIELD(high_priority_weight, 0); 6282 SET_FIELD(io_queue_requests, 0); 6283 SET_FIELD(nvme_adminq_poll_period_us, 0); 6284 SET_FIELD(nvme_ioq_poll_period_us, 0); 6285 SET_FIELD(delay_cmd_submit, 0); 6286 SET_FIELD(bdev_retry_count, 0); 6287 SET_FIELD(ctrlr_loss_timeout_sec, 0); 6288 SET_FIELD(reconnect_delay_sec, 0); 6289 SET_FIELD(fast_io_fail_timeout_sec, 0); 6290 SET_FIELD(transport_ack_timeout, 0); 6291 SET_FIELD(disable_auto_failback, false); 6292 SET_FIELD(generate_uuids, false); 6293 SET_FIELD(transport_tos, 0); 6294 SET_FIELD(nvme_error_stat, false); 6295 SET_FIELD(io_path_stat, false); 6296 SET_FIELD(allow_accel_sequence, false); 6297 SET_FIELD(rdma_srq_size, 0); 6298 SET_FIELD(rdma_max_cq_size, 0); 6299 SET_FIELD(rdma_cm_event_timeout_ms, 0); 6300 SET_FIELD(dhchap_digests, 
0); 6301 SET_FIELD(dhchap_dhgroups, 0); 6302 6303 g_opts.opts_size = opts->opts_size; 6304 6305 #undef SET_FIELD 6306 6307 return 0; 6308 } 6309 6310 struct set_nvme_hotplug_ctx { 6311 uint64_t period_us; 6312 bool enabled; 6313 spdk_msg_fn fn; 6314 void *fn_ctx; 6315 }; 6316 6317 static void 6318 set_nvme_hotplug_period_cb(void *_ctx) 6319 { 6320 struct set_nvme_hotplug_ctx *ctx = _ctx; 6321 6322 spdk_poller_unregister(&g_hotplug_poller); 6323 if (ctx->enabled) { 6324 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 6325 } else { 6326 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 6327 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 6328 } 6329 6330 g_nvme_hotplug_poll_period_us = ctx->period_us; 6331 g_nvme_hotplug_enabled = ctx->enabled; 6332 if (ctx->fn) { 6333 ctx->fn(ctx->fn_ctx); 6334 } 6335 6336 free(ctx); 6337 } 6338 6339 int 6340 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 6341 { 6342 struct set_nvme_hotplug_ctx *ctx; 6343 6344 if (enabled == true && !spdk_process_is_primary()) { 6345 return -EPERM; 6346 } 6347 6348 ctx = calloc(1, sizeof(*ctx)); 6349 if (ctx == NULL) { 6350 return -ENOMEM; 6351 } 6352 6353 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 6354 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 6355 ctx->enabled = enabled; 6356 ctx->fn = cb; 6357 ctx->fn_ctx = cb_ctx; 6358 6359 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 6360 return 0; 6361 } 6362 6363 static void 6364 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 6365 struct nvme_async_probe_ctx *ctx) 6366 { 6367 struct nvme_ns *nvme_ns; 6368 struct nvme_bdev *nvme_bdev; 6369 size_t j; 6370 6371 assert(nvme_ctrlr != NULL); 6372 6373 if (ctx->names == NULL) { 6374 ctx->reported_bdevs = 0; 6375 populate_namespaces_cb(ctx, 0); 6376 return; 6377 } 6378 6379 /* 6380 * Report the new bdevs that were created in this call. 6381 * There can be more than one bdev per NVMe controller. 6382 */ 6383 j = 0; 6384 6385 pthread_mutex_lock(&nvme_ctrlr->mutex); 6386 6387 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 6388 while (nvme_ns != NULL) { 6389 nvme_bdev = nvme_ns->bdev; 6390 if (j < ctx->max_bdevs) { 6391 ctx->names[j] = nvme_bdev->disk.name; 6392 j++; 6393 } else { 6394 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6395 6396 NVME_CTRLR_ERRLOG(nvme_ctrlr, 6397 "Maximum number of namespaces supported per NVMe controller is %du. " 6398 "Unable to return all names of created bdevs\n", 6399 ctx->max_bdevs); 6400 ctx->reported_bdevs = 0; 6401 populate_namespaces_cb(ctx, -ERANGE); 6402 return; 6403 } 6404 6405 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 6406 } 6407 6408 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6409 6410 ctx->reported_bdevs = j; 6411 populate_namespaces_cb(ctx, 0); 6412 } 6413 6414 static int 6415 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6416 struct spdk_nvme_ctrlr *new_ctrlr, 6417 struct spdk_nvme_transport_id *trid) 6418 { 6419 struct nvme_path_id *tmp_trid; 6420 6421 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6422 NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n"); 6423 return -ENOTSUP; 6424 } 6425 6426 /* Currently we only support failover to the same transport type. 
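	 * (e.g. a TCP path cannot be paired with an RDMA path).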
*/ 6427 if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) { 6428 NVME_CTRLR_WARNLOG(nvme_ctrlr, 6429 "Failover from trtype: %s to a different trtype: %s is not supported currently\n", 6430 spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype), 6431 spdk_nvme_transport_id_trtype_str(trid->trtype)); 6432 return -EINVAL; 6433 } 6434 6435 6436 /* Currently we only support failover to the same NQN. */ 6437 if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 6438 NVME_CTRLR_WARNLOG(nvme_ctrlr, 6439 "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n", 6440 nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn); 6441 return -EINVAL; 6442 } 6443 6444 /* Skip all the other checks if we've already registered this path. */ 6445 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 6446 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 6447 NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n", 6448 trid->traddr, trid->subnqn); 6449 return -EALREADY; 6450 } 6451 } 6452 6453 return 0; 6454 } 6455 6456 static int 6457 bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr, 6458 struct spdk_nvme_ctrlr *new_ctrlr) 6459 { 6460 struct nvme_ns *nvme_ns; 6461 struct spdk_nvme_ns *new_ns; 6462 6463 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 6464 while (nvme_ns != NULL) { 6465 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id); 6466 assert(new_ns != NULL); 6467 6468 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 6469 return -EINVAL; 6470 } 6471 6472 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 6473 } 6474 6475 return 0; 6476 } 6477 6478 static int 6479 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6480 struct spdk_nvme_transport_id *trid) 6481 { 6482 struct nvme_path_id *active_id, *new_trid, *tmp_trid; 6483 6484 new_trid = calloc(1, sizeof(*new_trid)); 6485 if (new_trid == NULL) { 6486 return -ENOMEM; 6487 } 6488 new_trid->trid = *trid; 6489 6490 active_id = nvme_ctrlr->active_path_id; 6491 assert(active_id != NULL); 6492 assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 6493 6494 /* Skip the active trid not to replace it until it is failed. */ 6495 tmp_trid = TAILQ_NEXT(active_id, link); 6496 if (tmp_trid == NULL) { 6497 goto add_tail; 6498 } 6499 6500 /* It means the trid is faled if its last failed time is non-zero. 6501 * Insert the new alternate trid before any failed trid. 6502 */ 6503 TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) { 6504 if (tmp_trid->last_failed_tsc != 0) { 6505 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 6506 return 0; 6507 } 6508 } 6509 6510 add_tail: 6511 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 6512 return 0; 6513 } 6514 6515 /* This is the case that a secondary path is added to an existing 6516 * nvme_ctrlr for failover. After checking if it can access the same 6517 * namespaces as the primary path, it is disconnected until failover occurs. 
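 * Only the transport ID is recorded; the temporary controller handle used for
 * these checks is detached unconditionally below.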
6518 */ 6519 static int 6520 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6521 struct spdk_nvme_ctrlr *new_ctrlr, 6522 struct spdk_nvme_transport_id *trid) 6523 { 6524 int rc; 6525 6526 assert(nvme_ctrlr != NULL); 6527 6528 pthread_mutex_lock(&nvme_ctrlr->mutex); 6529 6530 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 6531 if (rc != 0) { 6532 goto exit; 6533 } 6534 6535 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 6536 if (rc != 0) { 6537 goto exit; 6538 } 6539 6540 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 6541 6542 exit: 6543 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6544 6545 spdk_nvme_detach(new_ctrlr); 6546 6547 return rc; 6548 } 6549 6550 static void 6551 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6552 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6553 { 6554 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6555 struct nvme_async_probe_ctx *ctx; 6556 int rc; 6557 6558 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6559 ctx->ctrlr_attached = true; 6560 6561 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6562 if (rc != 0) { 6563 ctx->reported_bdevs = 0; 6564 populate_namespaces_cb(ctx, rc); 6565 } 6566 } 6567 6568 6569 static void 6570 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6571 struct spdk_nvme_ctrlr *ctrlr, 6572 const struct spdk_nvme_ctrlr_opts *opts) 6573 { 6574 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6575 struct nvme_ctrlr *nvme_ctrlr; 6576 struct nvme_async_probe_ctx *ctx; 6577 int rc; 6578 6579 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6580 ctx->ctrlr_attached = true; 6581 6582 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6583 if (nvme_ctrlr) { 6584 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6585 } else { 6586 rc = -ENODEV; 6587 } 6588 6589 ctx->reported_bdevs = 0; 6590 populate_namespaces_cb(ctx, rc); 6591 } 6592 6593 static int 6594 bdev_nvme_async_poll(void *arg) 6595 { 6596 struct nvme_async_probe_ctx *ctx = arg; 6597 int rc; 6598 6599 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6600 if (spdk_unlikely(rc != -EAGAIN)) { 6601 ctx->probe_done = true; 6602 spdk_poller_unregister(&ctx->poller); 6603 if (!ctx->ctrlr_attached) { 6604 /* The probe is done, but no controller was attached. 6605 * That means we had a failure, so report -EIO back to 6606 * the caller (usually the RPC). populate_namespaces_cb() 6607 * will take care of freeing the nvme_async_probe_ctx. 6608 */ 6609 ctx->reported_bdevs = 0; 6610 populate_namespaces_cb(ctx, -EIO); 6611 } else if (ctx->namespaces_populated) { 6612 /* The namespaces for the attached controller were all 6613 * populated and the response was already sent to the 6614 * caller (usually the RPC). So free the context here. 
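			 * Otherwise the context is freed later by populate_namespaces_cb(),
			 * which sees probe_done set once namespace population finishes.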
 */
			free_nvme_async_probe_ctx(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		      const char *base_name,
		      const char **names,
		      uint32_t count,
		      spdk_bdev_nvme_create_cb cb_fn,
		      void *cb_ctx,
		      struct spdk_nvme_ctrlr_opts *drv_opts,
		      struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	struct nvme_ctrlr *nvme_ctrlr;
	int len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
6682 */ 6683 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6684 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6685 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6686 return -EEXIST; 6687 } 6688 6689 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6690 6691 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6692 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6693 return -EINVAL; 6694 } 6695 6696 if (bdev_opts != NULL && 6697 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6698 bdev_opts->reconnect_delay_sec, 6699 bdev_opts->fast_io_fail_timeout_sec)) { 6700 return -EINVAL; 6701 } 6702 6703 ctx = calloc(1, sizeof(*ctx)); 6704 if (!ctx) { 6705 return -ENOMEM; 6706 } 6707 ctx->base_name = strdup(base_name); 6708 if (!ctx->base_name) { 6709 free(ctx); 6710 return -ENOMEM; 6711 } 6712 ctx->names = names; 6713 ctx->max_bdevs = count; 6714 ctx->cb_fn = cb_fn; 6715 ctx->cb_ctx = cb_ctx; 6716 ctx->trid = *trid; 6717 6718 if (bdev_opts) { 6719 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6720 } else { 6721 spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6722 } 6723 6724 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6725 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6726 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6727 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6728 free(entry); 6729 break; 6730 } 6731 } 6732 } 6733 6734 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6735 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6736 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6737 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6738 ctx->drv_opts.disable_read_ana_log_page = true; 6739 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6740 6741 if (spdk_interrupt_mode_is_enabled()) { 6742 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6743 ctx->drv_opts.enable_interrupts = true; 6744 } else { 6745 SPDK_ERRLOG("Interrupt mode is only supported with PCIe transport\n"); 6746 free_nvme_async_probe_ctx(ctx); 6747 return -ENOTSUP; 6748 } 6749 } 6750 6751 if (ctx->bdev_opts.psk != NULL) { 6752 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6753 if (ctx->drv_opts.tls_psk == NULL) { 6754 SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk); 6755 free_nvme_async_probe_ctx(ctx); 6756 return -ENOKEY; 6757 } 6758 } 6759 6760 if (ctx->bdev_opts.dhchap_key != NULL) { 6761 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6762 if (ctx->drv_opts.dhchap_key == NULL) { 6763 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6764 ctx->bdev_opts.dhchap_key); 6765 free_nvme_async_probe_ctx(ctx); 6766 return -ENOKEY; 6767 } 6768 6769 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6770 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6771 } 6772 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6773 ctx->drv_opts.dhchap_ctrlr_key = 6774 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6775 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6776 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6777 ctx->bdev_opts.dhchap_ctrlr_key); 6778 free_nvme_async_probe_ctx(ctx); 6779 return -ENOKEY; 6780 } 6781 } 6782 6783 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) { 6784 attach_cb = connect_attach_cb; 6785 } else { 6786 attach_cb = connect_set_failover_cb; 
6787 } 6788 6789 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6790 if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) { 6791 /* All controllers with the same name must be configured the same 6792 * way, either for multipath or failover. If the configuration doesn't 6793 * match - report error. 6794 */ 6795 free_nvme_async_probe_ctx(ctx); 6796 return -EINVAL; 6797 } 6798 6799 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6800 if (ctx->probe_ctx == NULL) { 6801 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6802 free_nvme_async_probe_ctx(ctx); 6803 return -ENODEV; 6804 } 6805 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6806 6807 return 0; 6808 } 6809 6810 struct bdev_nvme_delete_ctx { 6811 char *name; 6812 struct nvme_path_id path_id; 6813 bdev_nvme_delete_done_fn delete_done; 6814 void *delete_done_ctx; 6815 uint64_t timeout_ticks; 6816 struct spdk_poller *poller; 6817 }; 6818 6819 static void 6820 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6821 { 6822 if (ctx != NULL) { 6823 free(ctx->name); 6824 free(ctx); 6825 } 6826 } 6827 6828 static bool 6829 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6830 { 6831 if (path_id->trid.trtype != 0) { 6832 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6833 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6834 return false; 6835 } 6836 } else { 6837 if (path_id->trid.trtype != p->trid.trtype) { 6838 return false; 6839 } 6840 } 6841 } 6842 6843 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6844 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6845 return false; 6846 } 6847 } 6848 6849 if (path_id->trid.adrfam != 0) { 6850 if (path_id->trid.adrfam != p->trid.adrfam) { 6851 return false; 6852 } 6853 } 6854 6855 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6856 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6857 return false; 6858 } 6859 } 6860 6861 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6862 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6863 return false; 6864 } 6865 } 6866 6867 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6868 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6869 return false; 6870 } 6871 } 6872 6873 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6874 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6875 return false; 6876 } 6877 } 6878 6879 return true; 6880 } 6881 6882 static bool 6883 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6884 { 6885 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6886 struct nvme_ctrlr *ctrlr; 6887 struct nvme_path_id *p; 6888 6889 pthread_mutex_lock(&g_bdev_nvme_mutex); 6890 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6891 if (!nbdev_ctrlr) { 6892 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6893 return false; 6894 } 6895 6896 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6897 pthread_mutex_lock(&ctrlr->mutex); 6898 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6899 if (nvme_path_id_compare(p, path_id)) { 6900 pthread_mutex_unlock(&ctrlr->mutex); 6901 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6902 return true; 6903 } 6904 } 6905 pthread_mutex_unlock(&ctrlr->mutex); 6906 } 6907 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6908 6909 return 
false; 6910 } 6911 6912 static int 6913 bdev_nvme_delete_complete_poll(void *arg) 6914 { 6915 struct bdev_nvme_delete_ctx *ctx = arg; 6916 int rc = 0; 6917 6918 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6919 if (ctx->timeout_ticks > spdk_get_ticks()) { 6920 return SPDK_POLLER_BUSY; 6921 } 6922 6923 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6924 rc = -ETIMEDOUT; 6925 } 6926 6927 spdk_poller_unregister(&ctx->poller); 6928 6929 ctx->delete_done(ctx->delete_done_ctx, rc); 6930 free_bdev_nvme_delete_ctx(ctx); 6931 6932 return SPDK_POLLER_BUSY; 6933 } 6934 6935 static int 6936 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6937 { 6938 struct nvme_path_id *p, *t; 6939 spdk_msg_fn msg_fn; 6940 int rc = -ENXIO; 6941 6942 pthread_mutex_lock(&nvme_ctrlr->mutex); 6943 6944 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6945 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6946 break; 6947 } 6948 6949 if (!nvme_path_id_compare(p, path_id)) { 6950 continue; 6951 } 6952 6953 /* We are not using the specified path. */ 6954 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6955 free(p); 6956 rc = 0; 6957 } 6958 6959 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6960 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6961 return rc; 6962 } 6963 6964 /* If we made it here, then this path is a match! Now we need to remove it. */ 6965 6966 /* This is the active path in use right now. The active path is always the first in the list. */ 6967 assert(p == nvme_ctrlr->active_path_id); 6968 6969 if (!TAILQ_NEXT(p, link)) { 6970 /* The current path is the only path. */ 6971 msg_fn = _nvme_ctrlr_destruct; 6972 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6973 } else { 6974 /* There is an alternative path. */ 6975 msg_fn = _bdev_nvme_reset_ctrlr; 6976 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6977 } 6978 6979 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6980 6981 if (rc == 0) { 6982 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6983 } else if (rc == -EALREADY) { 6984 rc = 0; 6985 } 6986 6987 return rc; 6988 } 6989 6990 int 6991 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6992 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6993 { 6994 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6995 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6996 struct bdev_nvme_delete_ctx *ctx = NULL; 6997 int rc = -ENXIO, _rc; 6998 6999 if (name == NULL || path_id == NULL) { 7000 rc = -EINVAL; 7001 goto exit; 7002 } 7003 7004 pthread_mutex_lock(&g_bdev_nvme_mutex); 7005 7006 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 7007 if (nbdev_ctrlr == NULL) { 7008 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7009 7010 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 7011 rc = -ENODEV; 7012 goto exit; 7013 } 7014 7015 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 7016 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 7017 if (_rc < 0 && _rc != -ENXIO) { 7018 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7019 rc = _rc; 7020 goto exit; 7021 } else if (_rc == 0) { 7022 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 7023 * was deleted successfully. To remember the successful deletion, 7024 * overwrite rc only if _rc is zero. 
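			 * (rc starts at -ENXIO, so if no nvme_ctrlr had a matching path the
			 * caller still gets -ENXIO.)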
7025 */ 7026 rc = 0; 7027 } 7028 } 7029 7030 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7031 7032 if (rc != 0 || delete_done == NULL) { 7033 goto exit; 7034 } 7035 7036 ctx = calloc(1, sizeof(*ctx)); 7037 if (ctx == NULL) { 7038 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 7039 rc = -ENOMEM; 7040 goto exit; 7041 } 7042 7043 ctx->name = strdup(name); 7044 if (ctx->name == NULL) { 7045 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 7046 rc = -ENOMEM; 7047 goto exit; 7048 } 7049 7050 ctx->delete_done = delete_done; 7051 ctx->delete_done_ctx = delete_done_ctx; 7052 ctx->path_id = *path_id; 7053 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 7054 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 7055 if (ctx->poller == NULL) { 7056 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 7057 rc = -ENOMEM; 7058 goto exit; 7059 } 7060 7061 exit: 7062 if (rc != 0) { 7063 free_bdev_nvme_delete_ctx(ctx); 7064 } 7065 7066 return rc; 7067 } 7068 7069 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 7070 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 7071 7072 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 7073 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 7074 7075 struct discovery_entry_ctx { 7076 char name[128]; 7077 struct spdk_nvme_transport_id trid; 7078 struct spdk_nvme_ctrlr_opts drv_opts; 7079 struct spdk_nvmf_discovery_log_page_entry entry; 7080 TAILQ_ENTRY(discovery_entry_ctx) tailq; 7081 struct discovery_ctx *ctx; 7082 }; 7083 7084 struct discovery_ctx { 7085 char *name; 7086 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 7087 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 7088 void *cb_ctx; 7089 struct spdk_nvme_probe_ctx *probe_ctx; 7090 struct spdk_nvme_detach_ctx *detach_ctx; 7091 struct spdk_nvme_ctrlr *ctrlr; 7092 struct spdk_nvme_transport_id trid; 7093 struct discovery_entry_ctx *entry_ctx_in_use; 7094 struct spdk_poller *poller; 7095 struct spdk_nvme_ctrlr_opts drv_opts; 7096 struct spdk_bdev_nvme_ctrlr_opts bdev_opts; 7097 struct spdk_nvmf_discovery_log_page *log_page; 7098 TAILQ_ENTRY(discovery_ctx) tailq; 7099 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 7100 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 7101 int rc; 7102 bool wait_for_attach; 7103 uint64_t timeout_ticks; 7104 /* Denotes that the discovery service is being started. We're waiting 7105 * for the initial connection to the discovery controller to be 7106 * established and attach discovered NVM ctrlrs. 7107 */ 7108 bool initializing; 7109 /* Denotes if a discovery is currently in progress for this context. 7110 * That includes connecting to newly discovered subsystems. Used to 7111 * ensure we do not start a new discovery until an existing one is 7112 * complete. 7113 */ 7114 bool in_progress; 7115 7116 /* Denotes if another discovery is needed after the one in progress 7117 * completes. Set when we receive an AER completion while a discovery 7118 * is already in progress. 7119 */ 7120 bool pending; 7121 7122 /* Signal to the discovery context poller that it should stop the 7123 * discovery service, including detaching from the current discovery 7124 * controller. 7125 */ 7126 bool stop; 7127 7128 struct spdk_thread *calling_thread; 7129 uint32_t index; 7130 uint32_t attach_in_progress; 7131 char *hostnqn; 7132 7133 /* Denotes if the discovery service was started by the mdns discovery. 
7134 */ 7135 bool from_mdns_discovery_service; 7136 }; 7137 7138 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 7139 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 7140 7141 static void get_discovery_log_page(struct discovery_ctx *ctx); 7142 7143 static void 7144 free_discovery_ctx(struct discovery_ctx *ctx) 7145 { 7146 free(ctx->log_page); 7147 free(ctx->hostnqn); 7148 free(ctx->name); 7149 free(ctx); 7150 } 7151 7152 static void 7153 discovery_complete(struct discovery_ctx *ctx) 7154 { 7155 ctx->initializing = false; 7156 ctx->in_progress = false; 7157 if (ctx->pending) { 7158 ctx->pending = false; 7159 get_discovery_log_page(ctx); 7160 } 7161 } 7162 7163 static void 7164 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 7165 struct spdk_nvmf_discovery_log_page_entry *entry) 7166 { 7167 char *space; 7168 7169 trid->trtype = entry->trtype; 7170 trid->adrfam = entry->adrfam; 7171 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 7172 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 7173 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 7174 * before call to this function trid->subnqn is zeroed out, we need 7175 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 7176 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 7177 */ 7178 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 7179 7180 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 7181 * But the log page entries typically pad them with spaces, not zeroes. 7182 * So add a NULL terminator to each of these fields at the appropriate 7183 * location. 7184 */ 7185 space = strchr(trid->traddr, ' '); 7186 if (space) { 7187 *space = 0; 7188 } 7189 space = strchr(trid->trsvcid, ' '); 7190 if (space) { 7191 *space = 0; 7192 } 7193 space = strchr(trid->subnqn, ' '); 7194 if (space) { 7195 *space = 0; 7196 } 7197 } 7198 7199 static void 7200 _stop_discovery(void *_ctx) 7201 { 7202 struct discovery_ctx *ctx = _ctx; 7203 7204 if (ctx->attach_in_progress > 0) { 7205 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 7206 return; 7207 } 7208 7209 ctx->stop = true; 7210 7211 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 7212 struct discovery_entry_ctx *entry_ctx; 7213 struct nvme_path_id path = {}; 7214 7215 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 7216 path.trid = entry_ctx->trid; 7217 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7218 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7219 free(entry_ctx); 7220 } 7221 7222 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 7223 struct discovery_entry_ctx *entry_ctx; 7224 7225 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7226 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7227 free(entry_ctx); 7228 } 7229 7230 free(ctx->entry_ctx_in_use); 7231 ctx->entry_ctx_in_use = NULL; 7232 } 7233 7234 static void 7235 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7236 { 7237 ctx->stop_cb_fn = cb_fn; 7238 ctx->cb_ctx = cb_ctx; 7239 7240 if (ctx->attach_in_progress > 0) { 7241 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 7242 ctx->attach_in_progress); 7243 } 7244 7245 _stop_discovery(ctx); 7246 } 7247 7248 static void 7249 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 7250 { 7251 struct discovery_ctx *d_ctx; 7252 struct nvme_path_id *path_id; 7253 struct spdk_nvme_transport_id 
trid = {}; 7254 struct discovery_entry_ctx *entry_ctx, *tmp; 7255 7256 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 7257 7258 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7259 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 7260 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 7261 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 7262 continue; 7263 } 7264 7265 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 7266 free(entry_ctx); 7267 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 7268 trid.subnqn, trid.traddr, trid.trsvcid); 7269 7270 /* Fail discovery ctrlr to force reattach attempt */ 7271 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 7272 } 7273 } 7274 } 7275 7276 static void 7277 discovery_remove_controllers(struct discovery_ctx *ctx) 7278 { 7279 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 7280 struct discovery_entry_ctx *entry_ctx, *tmp; 7281 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7282 struct spdk_nvme_transport_id old_trid = {}; 7283 uint64_t numrec, i; 7284 bool found; 7285 7286 numrec = from_le64(&log_page->numrec); 7287 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 7288 found = false; 7289 old_entry = &entry_ctx->entry; 7290 build_trid_from_log_page_entry(&old_trid, old_entry); 7291 for (i = 0; i < numrec; i++) { 7292 new_entry = &log_page->entries[i]; 7293 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 7294 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 7295 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7296 found = true; 7297 break; 7298 } 7299 } 7300 if (!found) { 7301 struct nvme_path_id path = {}; 7302 7303 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 7304 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7305 7306 path.trid = entry_ctx->trid; 7307 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7308 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7309 free(entry_ctx); 7310 } 7311 } 7312 free(log_page); 7313 ctx->log_page = NULL; 7314 discovery_complete(ctx); 7315 } 7316 7317 static void 7318 complete_discovery_start(struct discovery_ctx *ctx, int status) 7319 { 7320 ctx->timeout_ticks = 0; 7321 ctx->rc = status; 7322 if (ctx->start_cb_fn) { 7323 ctx->start_cb_fn(ctx->cb_ctx, status); 7324 ctx->start_cb_fn = NULL; 7325 ctx->cb_ctx = NULL; 7326 } 7327 } 7328 7329 static void 7330 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 7331 { 7332 struct discovery_entry_ctx *entry_ctx = cb_ctx; 7333 struct discovery_ctx *ctx = entry_ctx->ctx; 7334 7335 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 7336 ctx->attach_in_progress--; 7337 if (ctx->attach_in_progress == 0) { 7338 complete_discovery_start(ctx, ctx->rc); 7339 if (ctx->initializing && ctx->rc != 0) { 7340 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 7341 stop_discovery(ctx, NULL, ctx->cb_ctx); 7342 } else { 7343 discovery_remove_controllers(ctx); 7344 } 7345 } 7346 } 7347 7348 static struct discovery_entry_ctx * 7349 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 7350 { 7351 struct discovery_entry_ctx *new_ctx; 7352 7353 new_ctx = calloc(1, sizeof(*new_ctx)); 7354 if (new_ctx == NULL) { 7355 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7356 return NULL; 7357 } 7358 7359 new_ctx->ctx = ctx; 7360 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 7361 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
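	/* Connections made from this entry use the same hostnqn as the discovery
	 * service itself.
	 */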
7362 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7363 return new_ctx; 7364 } 7365 7366 static void 7367 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 7368 struct spdk_nvmf_discovery_log_page *log_page) 7369 { 7370 struct discovery_ctx *ctx = cb_arg; 7371 struct discovery_entry_ctx *entry_ctx, *tmp; 7372 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7373 uint64_t numrec, i; 7374 bool found; 7375 7376 if (rc || spdk_nvme_cpl_is_error(cpl)) { 7377 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7378 return; 7379 } 7380 7381 ctx->log_page = log_page; 7382 assert(ctx->attach_in_progress == 0); 7383 numrec = from_le64(&log_page->numrec); 7384 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 7385 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7386 free(entry_ctx); 7387 } 7388 for (i = 0; i < numrec; i++) { 7389 found = false; 7390 new_entry = &log_page->entries[i]; 7391 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 7392 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 7393 struct discovery_entry_ctx *new_ctx; 7394 struct spdk_nvme_transport_id trid = {}; 7395 7396 build_trid_from_log_page_entry(&trid, new_entry); 7397 new_ctx = create_discovery_entry_ctx(ctx, &trid); 7398 if (new_ctx == NULL) { 7399 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7400 break; 7401 } 7402 7403 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 7404 continue; 7405 } 7406 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 7407 old_entry = &entry_ctx->entry; 7408 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 7409 found = true; 7410 break; 7411 } 7412 } 7413 if (!found) { 7414 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 7415 struct discovery_ctx *d_ctx; 7416 7417 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7418 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 7419 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 7420 sizeof(new_entry->subnqn))) { 7421 break; 7422 } 7423 } 7424 if (subnqn_ctx) { 7425 break; 7426 } 7427 } 7428 7429 new_ctx = calloc(1, sizeof(*new_ctx)); 7430 if (new_ctx == NULL) { 7431 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7432 break; 7433 } 7434 7435 new_ctx->ctx = ctx; 7436 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 7437 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 7438 if (subnqn_ctx) { 7439 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 7440 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 7441 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7442 new_ctx->name); 7443 } else { 7444 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 7445 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 7446 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7447 new_ctx->name); 7448 } 7449 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 7450 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7451 rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 7452 discovery_attach_controller_done, new_ctx, 7453 &new_ctx->drv_opts, &ctx->bdev_opts); 7454 if (rc == 0) { 7455 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 7456 ctx->attach_in_progress++; 7457 } else { 7458 DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 7459 } 7460 } 7461 } 7462 7463 if (ctx->attach_in_progress == 0) { 7464 discovery_remove_controllers(ctx); 7465 } 7466 } 7467 7468 static void 7469 get_discovery_log_page(struct discovery_ctx *ctx) 7470 { 7471 int rc; 7472 7473 assert(ctx->in_progress == false); 7474 ctx->in_progress = true; 7475 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 7476 if (rc != 0) { 7477 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7478 } 7479 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 7480 } 7481 7482 static void 7483 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 7484 { 7485 struct discovery_ctx *ctx = arg; 7486 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 7487 7488 if (spdk_nvme_cpl_is_error(cpl)) { 7489 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 7490 return; 7491 } 7492 7493 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 7494 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 7495 return; 7496 } 7497 7498 DISCOVERY_INFOLOG(ctx, "got aer\n"); 7499 if (ctx->in_progress) { 7500 ctx->pending = true; 7501 return; 7502 } 7503 7504 get_discovery_log_page(ctx); 7505 } 7506 7507 static void 7508 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 7509 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 7510 { 7511 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 7512 struct discovery_ctx *ctx; 7513 7514 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 7515 7516 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 7517 ctx->probe_ctx = NULL; 7518 ctx->ctrlr = ctrlr; 7519 7520 if (ctx->rc != 0) { 7521 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 7522 ctx->rc); 7523 return; 7524 } 7525 7526 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 7527 } 7528 7529 static int 7530 discovery_poller(void *arg) 7531 { 7532 struct discovery_ctx *ctx = arg; 7533 struct spdk_nvme_transport_id *trid; 7534 int rc; 7535 7536 if (ctx->detach_ctx) { 7537 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7538 if (rc != -EAGAIN) { 7539 ctx->detach_ctx = NULL; 7540 ctx->ctrlr = NULL; 7541 } 7542 } else if (ctx->stop) { 7543 if (ctx->ctrlr != NULL) { 7544 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7545 if (rc == 0) { 7546 return SPDK_POLLER_BUSY; 7547 } 7548 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7549 } 7550 spdk_poller_unregister(&ctx->poller); 7551 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7552 assert(ctx->start_cb_fn == NULL); 7553 if (ctx->stop_cb_fn != NULL) { 7554 ctx->stop_cb_fn(ctx->cb_ctx); 7555 } 7556 free_discovery_ctx(ctx); 7557 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7558 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7559 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7560 assert(ctx->initializing); 7561 spdk_poller_unregister(&ctx->poller); 7562 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7563 complete_discovery_start(ctx, -ETIMEDOUT); 7564 stop_discovery(ctx, NULL, NULL); 7565 free_discovery_ctx(ctx); 7566 return SPDK_POLLER_BUSY; 7567 } 7568 7569 assert(ctx->entry_ctx_in_use == NULL); 7570 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7571 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7572 trid = &ctx->entry_ctx_in_use->trid; 7573 7574 /* All controllers must be configured explicitely either for multipath or failover. 
7575 * While discovery use multipath mode, we need to set this in bdev options as well. 7576 */ 7577 ctx->bdev_opts.multipath = true; 7578 7579 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7580 if (ctx->probe_ctx) { 7581 spdk_poller_unregister(&ctx->poller); 7582 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7583 } else { 7584 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7585 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7586 ctx->entry_ctx_in_use = NULL; 7587 } 7588 } else if (ctx->probe_ctx) { 7589 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7590 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7591 complete_discovery_start(ctx, -ETIMEDOUT); 7592 return SPDK_POLLER_BUSY; 7593 } 7594 7595 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7596 if (rc != -EAGAIN) { 7597 if (ctx->rc != 0) { 7598 assert(ctx->initializing); 7599 stop_discovery(ctx, NULL, ctx->cb_ctx); 7600 } else { 7601 assert(rc == 0); 7602 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7603 ctx->rc = rc; 7604 get_discovery_log_page(ctx); 7605 } 7606 } 7607 } else { 7608 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7609 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7610 complete_discovery_start(ctx, -ETIMEDOUT); 7611 /* We need to wait until all NVM ctrlrs are attached before we stop the 7612 * discovery service to make sure we don't detach a ctrlr that is still 7613 * being attached. 7614 */ 7615 if (ctx->attach_in_progress == 0) { 7616 stop_discovery(ctx, NULL, ctx->cb_ctx); 7617 return SPDK_POLLER_BUSY; 7618 } 7619 } 7620 7621 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7622 if (rc < 0) { 7623 spdk_poller_unregister(&ctx->poller); 7624 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7625 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7626 ctx->entry_ctx_in_use = NULL; 7627 7628 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7629 if (rc != 0) { 7630 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7631 ctx->ctrlr = NULL; 7632 } 7633 } 7634 } 7635 7636 return SPDK_POLLER_BUSY; 7637 } 7638 7639 static void 7640 start_discovery_poller(void *arg) 7641 { 7642 struct discovery_ctx *ctx = arg; 7643 7644 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7645 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7646 } 7647 7648 int 7649 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7650 const char *base_name, 7651 struct spdk_nvme_ctrlr_opts *drv_opts, 7652 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 7653 uint64_t attach_timeout, 7654 bool from_mdns, 7655 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7656 { 7657 struct discovery_ctx *ctx; 7658 struct discovery_entry_ctx *discovery_entry_ctx; 7659 7660 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7661 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7662 if (strcmp(ctx->name, base_name) == 0) { 7663 return -EEXIST; 7664 } 7665 7666 if (ctx->entry_ctx_in_use != NULL) { 7667 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7668 return -EEXIST; 7669 } 7670 } 7671 7672 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7673 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7674 return -EEXIST; 7675 } 7676 } 7677 } 7678 7679 ctx = calloc(1, 
sizeof(*ctx)); 7680 if (ctx == NULL) { 7681 return -ENOMEM; 7682 } 7683 7684 ctx->name = strdup(base_name); 7685 if (ctx->name == NULL) { 7686 free_discovery_ctx(ctx); 7687 return -ENOMEM; 7688 } 7689 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 7690 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7691 ctx->from_mdns_discovery_service = from_mdns; 7692 ctx->bdev_opts.from_discovery_service = true; 7693 ctx->calling_thread = spdk_get_thread(); 7694 ctx->start_cb_fn = cb_fn; 7695 ctx->cb_ctx = cb_ctx; 7696 ctx->initializing = true; 7697 if (ctx->start_cb_fn) { 7698 /* We can use this when dumping json to denote if this RPC parameter 7699 * was specified or not. 7700 */ 7701 ctx->wait_for_attach = true; 7702 } 7703 if (attach_timeout != 0) { 7704 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7705 spdk_get_ticks_hz() / 1000ull; 7706 } 7707 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7708 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7709 memcpy(&ctx->trid, trid, sizeof(*trid)); 7710 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7711 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7712 if (ctx->hostnqn == NULL) { 7713 free_discovery_ctx(ctx); 7714 return -ENOMEM; 7715 } 7716 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7717 if (discovery_entry_ctx == NULL) { 7718 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7719 free_discovery_ctx(ctx); 7720 return -ENOMEM; 7721 } 7722 7723 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7724 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7725 return 0; 7726 } 7727 7728 int 7729 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7730 { 7731 struct discovery_ctx *ctx; 7732 7733 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7734 if (strcmp(name, ctx->name) == 0) { 7735 if (ctx->stop) { 7736 return -EALREADY; 7737 } 7738 /* If we're still starting the discovery service and ->rc is non-zero, we're 7739 * going to stop it as soon as we can 7740 */ 7741 if (ctx->initializing && ctx->rc != 0) { 7742 return -EALREADY; 7743 } 7744 stop_discovery(ctx, cb_fn, cb_ctx); 7745 return 0; 7746 } 7747 } 7748 7749 return -ENOENT; 7750 } 7751 7752 static int 7753 bdev_nvme_library_init(void) 7754 { 7755 g_bdev_nvme_init_thread = spdk_get_thread(); 7756 7757 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7758 bdev_nvme_destroy_poll_group_cb, 7759 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7760 7761 return 0; 7762 } 7763 7764 static void 7765 bdev_nvme_fini_destruct_ctrlrs(void) 7766 { 7767 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7768 struct nvme_ctrlr *nvme_ctrlr; 7769 7770 pthread_mutex_lock(&g_bdev_nvme_mutex); 7771 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7772 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7773 pthread_mutex_lock(&nvme_ctrlr->mutex); 7774 if (nvme_ctrlr->destruct) { 7775 /* This controller's destruction was already started 7776 * before the application started shutting down 7777 */ 7778 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7779 continue; 7780 } 7781 nvme_ctrlr->destruct = true; 7782 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7783 7784 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7785 nvme_ctrlr); 7786 } 7787 } 7788 7789 g_bdev_nvme_module_finish = true; 7790 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7791 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7792 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7793 
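/* No nvme_ctrlr is left to destruct, so the module finish can complete immediately. */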
spdk_bdev_module_fini_done(); 7794 return; 7795 } 7796 7797 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7798 } 7799 7800 static void 7801 check_discovery_fini(void *arg) 7802 { 7803 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7804 bdev_nvme_fini_destruct_ctrlrs(); 7805 } 7806 } 7807 7808 static void 7809 bdev_nvme_library_fini(void) 7810 { 7811 struct nvme_probe_skip_entry *entry, *entry_tmp; 7812 struct discovery_ctx *ctx; 7813 7814 spdk_poller_unregister(&g_hotplug_poller); 7815 free(g_hotplug_probe_ctx); 7816 g_hotplug_probe_ctx = NULL; 7817 7818 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7819 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7820 free(entry); 7821 } 7822 7823 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7824 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7825 bdev_nvme_fini_destruct_ctrlrs(); 7826 } else { 7827 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7828 stop_discovery(ctx, check_discovery_fini, NULL); 7829 } 7830 } 7831 } 7832 7833 static void 7834 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7835 { 7836 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7837 struct spdk_bdev *bdev = bdev_io->bdev; 7838 struct spdk_dif_ctx dif_ctx; 7839 struct spdk_dif_error err_blk = {}; 7840 int rc; 7841 struct spdk_dif_ctx_init_ext_opts dif_opts; 7842 7843 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7844 dif_opts.dif_pi_format = bdev->dif_pi_format; 7845 rc = spdk_dif_ctx_init(&dif_ctx, 7846 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7847 bdev->dif_is_head_of_md, bdev->dif_type, 7848 bdev_io->u.bdev.dif_check_flags, 7849 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7850 if (rc != 0) { 7851 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7852 return; 7853 } 7854 7855 if (bdev->md_interleave) { 7856 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7857 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7858 } else { 7859 struct iovec md_iov = { 7860 .iov_base = bdev_io->u.bdev.md_buf, 7861 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7862 }; 7863 7864 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7865 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7866 } 7867 7868 if (rc != 0) { 7869 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7870 err_blk.err_type, err_blk.err_offset); 7871 } else { 7872 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7873 } 7874 } 7875 7876 static void 7877 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7878 { 7879 struct nvme_bdev_io *bio = ref; 7880 7881 if (spdk_nvme_cpl_is_success(cpl)) { 7882 /* Run PI verification for read data buffer. */ 7883 bdev_nvme_verify_pi_error(bio); 7884 } 7885 7886 /* Return original completion status */ 7887 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7888 } 7889 7890 static void 7891 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7892 { 7893 struct nvme_bdev_io *bio = ref; 7894 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7895 int ret; 7896 7897 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7898 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7899 cpl->status.sct, cpl->status.sc); 7900 7901 /* Save completion status to use after verifying PI error. */ 7902 bio->cpl = *cpl; 7903 7904 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7905 /* Read without PI checking to verify PI error. 
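 * The data is re-read with PI checks disabled and then verified in software by
 * bdev_nvme_verify_pi_error() in the no-PI read completion callback.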
*/ 7906 ret = bdev_nvme_no_pi_readv(bio, 7907 bdev_io->u.bdev.iovs, 7908 bdev_io->u.bdev.iovcnt, 7909 bdev_io->u.bdev.md_buf, 7910 bdev_io->u.bdev.num_blocks, 7911 bdev_io->u.bdev.offset_blocks); 7912 if (ret == 0) { 7913 return; 7914 } 7915 } 7916 } 7917 7918 bdev_nvme_io_complete_nvme_status(bio, cpl); 7919 } 7920 7921 static void 7922 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7923 { 7924 struct nvme_bdev_io *bio = ref; 7925 7926 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7927 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7928 cpl->status.sct, cpl->status.sc); 7929 /* Run PI verification for write data buffer if PI error is detected. */ 7930 bdev_nvme_verify_pi_error(bio); 7931 } 7932 7933 bdev_nvme_io_complete_nvme_status(bio, cpl); 7934 } 7935 7936 static void 7937 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7938 { 7939 struct nvme_bdev_io *bio = ref; 7940 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7941 7942 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7943 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7944 */ 7945 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7946 7947 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7948 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7949 cpl->status.sct, cpl->status.sc); 7950 /* Run PI verification for zone append data buffer if PI error is detected. */ 7951 bdev_nvme_verify_pi_error(bio); 7952 } 7953 7954 bdev_nvme_io_complete_nvme_status(bio, cpl); 7955 } 7956 7957 static void 7958 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7959 { 7960 struct nvme_bdev_io *bio = ref; 7961 7962 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7963 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7964 cpl->status.sct, cpl->status.sc); 7965 /* Run PI verification for compare data buffer if PI error is detected. */ 7966 bdev_nvme_verify_pi_error(bio); 7967 } 7968 7969 bdev_nvme_io_complete_nvme_status(bio, cpl); 7970 } 7971 7972 static void 7973 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7974 { 7975 struct nvme_bdev_io *bio = ref; 7976 7977 /* Compare operation completion */ 7978 if (!bio->first_fused_completed) { 7979 /* Save compare result for write callback */ 7980 bio->cpl = *cpl; 7981 bio->first_fused_completed = true; 7982 return; 7983 } 7984 7985 /* Write operation completion */ 7986 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7987 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7988 * complete the IO with the compare operation's status. 
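 * A successful write completion after a failed compare should not happen for a
 * fused compare-and-write, which is why it is logged below.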
7989 */ 7990 if (!spdk_nvme_cpl_is_error(cpl)) { 7991 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7992 } 7993 7994 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7995 } else { 7996 bdev_nvme_io_complete_nvme_status(bio, cpl); 7997 } 7998 } 7999 8000 static void 8001 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 8002 { 8003 struct nvme_bdev_io *bio = ref; 8004 8005 bdev_nvme_io_complete_nvme_status(bio, cpl); 8006 } 8007 8008 static int 8009 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 8010 { 8011 switch (desc->zt) { 8012 case SPDK_NVME_ZONE_TYPE_SEQWR: 8013 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 8014 break; 8015 default: 8016 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 8017 return -EIO; 8018 } 8019 8020 switch (desc->zs) { 8021 case SPDK_NVME_ZONE_STATE_EMPTY: 8022 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 8023 break; 8024 case SPDK_NVME_ZONE_STATE_IOPEN: 8025 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 8026 break; 8027 case SPDK_NVME_ZONE_STATE_EOPEN: 8028 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 8029 break; 8030 case SPDK_NVME_ZONE_STATE_CLOSED: 8031 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 8032 break; 8033 case SPDK_NVME_ZONE_STATE_RONLY: 8034 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 8035 break; 8036 case SPDK_NVME_ZONE_STATE_FULL: 8037 info->state = SPDK_BDEV_ZONE_STATE_FULL; 8038 break; 8039 case SPDK_NVME_ZONE_STATE_OFFLINE: 8040 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 8041 break; 8042 default: 8043 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 8044 return -EIO; 8045 } 8046 8047 info->zone_id = desc->zslba; 8048 info->write_pointer = desc->wp; 8049 info->capacity = desc->zcap; 8050 8051 return 0; 8052 } 8053 8054 static void 8055 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 8056 { 8057 struct nvme_bdev_io *bio = ref; 8058 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8059 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 8060 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 8061 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 8062 uint64_t max_zones_per_buf, i; 8063 uint32_t zone_report_bufsize; 8064 struct spdk_nvme_ns *ns; 8065 struct spdk_nvme_qpair *qpair; 8066 int ret; 8067 8068 if (spdk_nvme_cpl_is_error(cpl)) { 8069 goto out_complete_io_nvme_cpl; 8070 } 8071 8072 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 8073 ret = -ENXIO; 8074 goto out_complete_io_ret; 8075 } 8076 8077 ns = bio->io_path->nvme_ns->ns; 8078 qpair = bio->io_path->qpair->qpair; 8079 8080 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8081 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 8082 sizeof(bio->zone_report_buf->descs[0]); 8083 8084 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 8085 ret = -EINVAL; 8086 goto out_complete_io_ret; 8087 } 8088 8089 if (!bio->zone_report_buf->nr_zones) { 8090 ret = -EINVAL; 8091 goto out_complete_io_ret; 8092 } 8093 8094 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 8095 ret = fill_zone_from_report(&info[bio->handled_zones], 8096 &bio->zone_report_buf->descs[i]); 8097 if (ret) { 8098 goto out_complete_io_ret; 8099 } 8100 bio->handled_zones++; 8101 } 8102 8103 if (bio->handled_zones < zones_to_copy) { 8104 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8105 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 8106 
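/* More zones were requested than fit in one report: reuse the buffer and issue another
 * Report Zones command starting at the first zone that has not been copied yet.
 */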
8107 memset(bio->zone_report_buf, 0, zone_report_bufsize); 8108 ret = spdk_nvme_zns_report_zones(ns, qpair, 8109 bio->zone_report_buf, zone_report_bufsize, 8110 slba, SPDK_NVME_ZRA_LIST_ALL, true, 8111 bdev_nvme_get_zone_info_done, bio); 8112 if (!ret) { 8113 return; 8114 } else { 8115 goto out_complete_io_ret; 8116 } 8117 } 8118 8119 out_complete_io_nvme_cpl: 8120 free(bio->zone_report_buf); 8121 bio->zone_report_buf = NULL; 8122 bdev_nvme_io_complete_nvme_status(bio, cpl); 8123 return; 8124 8125 out_complete_io_ret: 8126 free(bio->zone_report_buf); 8127 bio->zone_report_buf = NULL; 8128 bdev_nvme_io_complete(bio, ret); 8129 } 8130 8131 static void 8132 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 8133 { 8134 struct nvme_bdev_io *bio = ref; 8135 8136 bdev_nvme_io_complete_nvme_status(bio, cpl); 8137 } 8138 8139 static void 8140 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 8141 { 8142 struct nvme_bdev_io *bio = ctx; 8143 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8144 const struct spdk_nvme_cpl *cpl = &bio->cpl; 8145 8146 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 8147 8148 __bdev_nvme_io_complete(bdev_io, 0, cpl); 8149 } 8150 8151 static void 8152 bdev_nvme_abort_complete(void *ctx) 8153 { 8154 struct nvme_bdev_io *bio = ctx; 8155 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8156 8157 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 8158 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 8159 } else { 8160 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 8161 } 8162 } 8163 8164 static void 8165 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 8166 { 8167 struct nvme_bdev_io *bio = ref; 8168 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8169 8170 bio->cpl = *cpl; 8171 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 8172 } 8173 8174 static void 8175 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 8176 { 8177 struct nvme_bdev_io *bio = ref; 8178 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8179 8180 bio->cpl = *cpl; 8181 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8182 bdev_nvme_admin_passthru_complete_nvme_status, bio); 8183 } 8184 8185 static void 8186 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 8187 { 8188 struct nvme_bdev_io *bio = ref; 8189 struct iovec *iov; 8190 8191 bio->iov_offset = sgl_offset; 8192 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 8193 iov = &bio->iovs[bio->iovpos]; 8194 if (bio->iov_offset < iov->iov_len) { 8195 break; 8196 } 8197 8198 bio->iov_offset -= iov->iov_len; 8199 } 8200 } 8201 8202 static int 8203 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 8204 { 8205 struct nvme_bdev_io *bio = ref; 8206 struct iovec *iov; 8207 8208 assert(bio->iovpos < bio->iovcnt); 8209 8210 iov = &bio->iovs[bio->iovpos]; 8211 8212 *address = iov->iov_base; 8213 *length = iov->iov_len; 8214 8215 if (bio->iov_offset) { 8216 assert(bio->iov_offset <= iov->iov_len); 8217 *address += bio->iov_offset; 8218 *length -= bio->iov_offset; 8219 } 8220 8221 bio->iov_offset += *length; 8222 if (bio->iov_offset == iov->iov_len) { 8223 bio->iovpos++; 8224 bio->iov_offset = 0; 8225 } 8226 8227 return 0; 8228 } 8229 8230 static void 8231 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 8232 { 8233 struct nvme_bdev_io *bio = ref; 8234 struct iovec *iov; 8235 8236 bio->fused_iov_offset = sgl_offset; 
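/* Walk the fused iovec array until the element containing sgl_offset is found,
 * leaving fused_iov_offset as the remaining offset within that element.
 */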
8237 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 8238 iov = &bio->fused_iovs[bio->fused_iovpos]; 8239 if (bio->fused_iov_offset < iov->iov_len) { 8240 break; 8241 } 8242 8243 bio->fused_iov_offset -= iov->iov_len; 8244 } 8245 } 8246 8247 static int 8248 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 8249 { 8250 struct nvme_bdev_io *bio = ref; 8251 struct iovec *iov; 8252 8253 assert(bio->fused_iovpos < bio->fused_iovcnt); 8254 8255 iov = &bio->fused_iovs[bio->fused_iovpos]; 8256 8257 *address = iov->iov_base; 8258 *length = iov->iov_len; 8259 8260 if (bio->fused_iov_offset) { 8261 assert(bio->fused_iov_offset <= iov->iov_len); 8262 *address += bio->fused_iov_offset; 8263 *length -= bio->fused_iov_offset; 8264 } 8265 8266 bio->fused_iov_offset += *length; 8267 if (bio->fused_iov_offset == iov->iov_len) { 8268 bio->fused_iovpos++; 8269 bio->fused_iov_offset = 0; 8270 } 8271 8272 return 0; 8273 } 8274 8275 static int 8276 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8277 void *md, uint64_t lba_count, uint64_t lba) 8278 { 8279 int rc; 8280 8281 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 8282 lba_count, lba); 8283 8284 bio->iovs = iov; 8285 bio->iovcnt = iovcnt; 8286 bio->iovpos = 0; 8287 bio->iov_offset = 0; 8288 8289 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 8290 bio->io_path->qpair->qpair, 8291 lba, lba_count, 8292 bdev_nvme_no_pi_readv_done, bio, 0, 8293 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8294 md, 0, 0); 8295 8296 if (rc != 0 && rc != -ENOMEM) { 8297 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 8298 } 8299 return rc; 8300 } 8301 8302 static int 8303 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8304 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8305 struct spdk_memory_domain *domain, void *domain_ctx, 8306 struct spdk_accel_sequence *seq) 8307 { 8308 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8309 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8310 int rc; 8311 8312 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8313 lba_count, lba); 8314 8315 bio->iovs = iov; 8316 bio->iovcnt = iovcnt; 8317 bio->iovpos = 0; 8318 bio->iov_offset = 0; 8319 8320 if (domain != NULL || seq != NULL) { 8321 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8322 bio->ext_opts.memory_domain = domain; 8323 bio->ext_opts.memory_domain_ctx = domain_ctx; 8324 bio->ext_opts.io_flags = flags; 8325 bio->ext_opts.metadata = md; 8326 bio->ext_opts.accel_sequence = seq; 8327 8328 if (iovcnt == 1) { 8329 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 8330 bio, &bio->ext_opts); 8331 } else { 8332 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 8333 bdev_nvme_readv_done, bio, 8334 bdev_nvme_queued_reset_sgl, 8335 bdev_nvme_queued_next_sge, 8336 &bio->ext_opts); 8337 } 8338 } else if (iovcnt == 1) { 8339 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 8340 md, lba, lba_count, bdev_nvme_readv_done, 8341 bio, flags, 0, 0); 8342 } else { 8343 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 8344 bdev_nvme_readv_done, bio, flags, 8345 bdev_nvme_queued_reset_sgl, 8346 bdev_nvme_queued_next_sge, md, 0, 0); 8347 } 8348 8349 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8350 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 8351 } 8352 
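/* -ENOMEM is deliberately not logged; the caller is expected to handle it,
 * typically by completing the I/O with NOMEM status so it is retried later.
 */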
return rc; 8353 } 8354 8355 static int 8356 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8357 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8358 struct spdk_memory_domain *domain, void *domain_ctx, 8359 struct spdk_accel_sequence *seq, 8360 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 8361 { 8362 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8363 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8364 int rc; 8365 8366 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8367 lba_count, lba); 8368 8369 bio->iovs = iov; 8370 bio->iovcnt = iovcnt; 8371 bio->iovpos = 0; 8372 bio->iov_offset = 0; 8373 8374 if (domain != NULL || seq != NULL) { 8375 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8376 bio->ext_opts.memory_domain = domain; 8377 bio->ext_opts.memory_domain_ctx = domain_ctx; 8378 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 8379 bio->ext_opts.cdw13 = cdw13.raw; 8380 bio->ext_opts.metadata = md; 8381 bio->ext_opts.accel_sequence = seq; 8382 8383 if (iovcnt == 1) { 8384 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 8385 bio, &bio->ext_opts); 8386 } else { 8387 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 8388 bdev_nvme_writev_done, bio, 8389 bdev_nvme_queued_reset_sgl, 8390 bdev_nvme_queued_next_sge, 8391 &bio->ext_opts); 8392 } 8393 } else if (iovcnt == 1) { 8394 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 8395 md, lba, lba_count, bdev_nvme_writev_done, 8396 bio, flags, 0, 0); 8397 } else { 8398 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8399 bdev_nvme_writev_done, bio, flags, 8400 bdev_nvme_queued_reset_sgl, 8401 bdev_nvme_queued_next_sge, md, 0, 0); 8402 } 8403 8404 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8405 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 8406 } 8407 return rc; 8408 } 8409 8410 static int 8411 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8412 void *md, uint64_t lba_count, uint64_t zslba, 8413 uint32_t flags) 8414 { 8415 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8416 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8417 int rc; 8418 8419 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 8420 lba_count, zslba); 8421 8422 bio->iovs = iov; 8423 bio->iovcnt = iovcnt; 8424 bio->iovpos = 0; 8425 bio->iov_offset = 0; 8426 8427 if (iovcnt == 1) { 8428 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 8429 lba_count, 8430 bdev_nvme_zone_appendv_done, bio, 8431 flags, 8432 0, 0); 8433 } else { 8434 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 8435 bdev_nvme_zone_appendv_done, bio, flags, 8436 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8437 md, 0, 0); 8438 } 8439 8440 if (rc != 0 && rc != -ENOMEM) { 8441 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 8442 } 8443 return rc; 8444 } 8445 8446 static int 8447 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8448 void *md, uint64_t lba_count, uint64_t lba, 8449 uint32_t flags) 8450 { 8451 int rc; 8452 8453 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8454 lba_count, lba); 8455 8456 bio->iovs = iov; 8457 bio->iovcnt = iovcnt; 8458 bio->iovpos = 0; 8459 bio->iov_offset = 0; 8460 8461 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 8462 bio->io_path->qpair->qpair, 8463 lba, lba_count, 8464 bdev_nvme_comparev_done, bio, flags, 8465 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8466 md, 0, 0); 8467 8468 if (rc != 0 && rc != -ENOMEM) { 8469 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 8470 } 8471 return rc; 8472 } 8473 8474 static int 8475 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 8476 struct iovec *write_iov, int write_iovcnt, 8477 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 8478 { 8479 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8480 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8481 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8482 int rc; 8483 8484 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8485 lba_count, lba); 8486 8487 bio->iovs = cmp_iov; 8488 bio->iovcnt = cmp_iovcnt; 8489 bio->iovpos = 0; 8490 bio->iov_offset = 0; 8491 bio->fused_iovs = write_iov; 8492 bio->fused_iovcnt = write_iovcnt; 8493 bio->fused_iovpos = 0; 8494 bio->fused_iov_offset = 0; 8495 8496 if (bdev_io->num_retries == 0) { 8497 bio->first_fused_submitted = false; 8498 bio->first_fused_completed = false; 8499 } 8500 8501 if (!bio->first_fused_submitted) { 8502 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8503 memset(&bio->cpl, 0, sizeof(bio->cpl)); 8504 8505 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 8506 bdev_nvme_comparev_and_writev_done, bio, flags, 8507 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 8508 if (rc == 0) { 8509 bio->first_fused_submitted = true; 8510 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8511 } else { 8512 if (rc != -ENOMEM) { 8513 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 8514 } 8515 return rc; 8516 } 8517 } 8518 8519 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 8520 8521 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8522 bdev_nvme_comparev_and_writev_done, bio, flags, 8523 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 8524 if (rc != 0 && rc != -ENOMEM) { 8525 SPDK_ERRLOG("write failed: rc = %d\n", rc); 8526 rc = 0; 8527 } 8528 8529 return rc; 8530 } 8531 8532 static int 8533 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8534 { 8535 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 8536 struct spdk_nvme_dsm_range *range; 8537 uint64_t offset, remaining; 8538 uint64_t num_ranges_u64; 8539 uint16_t num_ranges; 8540 int rc; 8541 8542 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8543 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8544 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8545 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8546 return -EINVAL; 8547 } 8548 num_ranges = (uint16_t)num_ranges_u64; 8549 8550 offset = offset_blocks; 8551 remaining = num_blocks; 8552 range = &dsm_ranges[0]; 8553 8554 /* Fill max-size ranges until the remaining blocks fit into one range */ 8555 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8556 range->attributes.raw = 0; 8557 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8558 range->starting_lba = offset; 8559 8560 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8561 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8562 range++; 8563 } 8564 8565 /* Final range describes the remaining 
blocks */ 8566 range->attributes.raw = 0; 8567 range->length = remaining; 8568 range->starting_lba = offset; 8569 8570 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8571 bio->io_path->qpair->qpair, 8572 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8573 dsm_ranges, num_ranges, 8574 bdev_nvme_queued_done, bio); 8575 8576 return rc; 8577 } 8578 8579 static int 8580 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8581 { 8582 if (num_blocks > UINT16_MAX + 1) { 8583 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8584 return -EINVAL; 8585 } 8586 8587 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8588 bio->io_path->qpair->qpair, 8589 offset_blocks, num_blocks, 8590 bdev_nvme_queued_done, bio, 8591 0); 8592 } 8593 8594 static int 8595 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8596 struct spdk_bdev_zone_info *info) 8597 { 8598 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8599 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8600 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8601 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8602 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8603 8604 if (zone_id % zone_size != 0) { 8605 return -EINVAL; 8606 } 8607 8608 if (num_zones > total_zones || !num_zones) { 8609 return -EINVAL; 8610 } 8611 8612 assert(!bio->zone_report_buf); 8613 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8614 if (!bio->zone_report_buf) { 8615 return -ENOMEM; 8616 } 8617 8618 bio->handled_zones = 0; 8619 8620 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8621 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8622 bdev_nvme_get_zone_info_done, bio); 8623 } 8624 8625 static int 8626 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8627 enum spdk_bdev_zone_action action) 8628 { 8629 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8630 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8631 8632 switch (action) { 8633 case SPDK_BDEV_ZONE_CLOSE: 8634 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8635 bdev_nvme_zone_management_done, bio); 8636 case SPDK_BDEV_ZONE_FINISH: 8637 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8638 bdev_nvme_zone_management_done, bio); 8639 case SPDK_BDEV_ZONE_OPEN: 8640 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8641 bdev_nvme_zone_management_done, bio); 8642 case SPDK_BDEV_ZONE_RESET: 8643 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8644 bdev_nvme_zone_management_done, bio); 8645 case SPDK_BDEV_ZONE_OFFLINE: 8646 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8647 bdev_nvme_zone_management_done, bio); 8648 default: 8649 return -EINVAL; 8650 } 8651 } 8652 8653 static void 8654 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8655 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8656 { 8657 struct nvme_io_path *io_path; 8658 struct nvme_ctrlr *nvme_ctrlr; 8659 uint32_t max_xfer_size; 8660 int rc = -ENXIO; 8661 8662 /* Choose the first ctrlr which is not failed. */ 8663 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8664 nvme_ctrlr = io_path->qpair->ctrlr; 8665 8666 /* We should skip any unavailable nvme_ctrlr rather than checking 8667 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8668 */ 8669 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8670 continue; 8671 } 8672 8673 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8674 8675 if (nbytes > max_xfer_size) { 8676 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8677 rc = -EINVAL; 8678 goto err; 8679 } 8680 8681 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8682 bdev_nvme_admin_passthru_done, bio); 8683 if (rc == 0) { 8684 return; 8685 } 8686 } 8687 8688 err: 8689 bdev_nvme_admin_complete(bio, rc); 8690 } 8691 8692 static int 8693 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8694 void *buf, size_t nbytes) 8695 { 8696 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8697 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8698 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8699 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8700 8701 if (nbytes > max_xfer_size) { 8702 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8703 return -EINVAL; 8704 } 8705 8706 /* 8707 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8708 * so fill it out automatically. 8709 */ 8710 cmd->nsid = spdk_nvme_ns_get_id(ns); 8711 8712 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8713 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8714 } 8715 8716 static int 8717 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8718 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8719 { 8720 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8721 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8722 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8723 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8724 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8725 8726 if (nbytes > max_xfer_size) { 8727 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8728 return -EINVAL; 8729 } 8730 8731 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8732 SPDK_ERRLOG("invalid meta data buffer size\n"); 8733 return -EINVAL; 8734 } 8735 8736 /* 8737 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8738 * so fill it out automatically. 
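 * Any nsid supplied by the caller is overwritten with this bdev's namespace ID.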
8739 */ 8740 cmd->nsid = spdk_nvme_ns_get_id(ns); 8741 8742 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8743 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8744 } 8745 8746 static int 8747 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8748 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8749 size_t nbytes, void *md_buf, size_t md_len) 8750 { 8751 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8752 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8753 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8754 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8755 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8756 8757 bio->iovs = iov; 8758 bio->iovcnt = iovcnt; 8759 bio->iovpos = 0; 8760 bio->iov_offset = 0; 8761 8762 if (nbytes > max_xfer_size) { 8763 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8764 return -EINVAL; 8765 } 8766 8767 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8768 SPDK_ERRLOG("invalid meta data buffer size\n"); 8769 return -EINVAL; 8770 } 8771 8772 /* 8773 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8774 * require a nsid, so fill it out automatically. 8775 */ 8776 cmd->nsid = spdk_nvme_ns_get_id(ns); 8777 8778 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8779 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8780 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8781 } 8782 8783 static void 8784 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8785 struct nvme_bdev_io *bio_to_abort) 8786 { 8787 struct nvme_io_path *io_path; 8788 int rc = 0; 8789 8790 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8791 if (rc == 0) { 8792 bdev_nvme_admin_complete(bio, 0); 8793 return; 8794 } 8795 8796 io_path = bio_to_abort->io_path; 8797 if (io_path != NULL) { 8798 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8799 io_path->qpair->qpair, 8800 bio_to_abort, 8801 bdev_nvme_abort_done, bio); 8802 } else { 8803 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8804 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8805 NULL, 8806 bio_to_abort, 8807 bdev_nvme_abort_done, bio); 8808 8809 if (rc != -ENOENT) { 8810 break; 8811 } 8812 } 8813 } 8814 8815 if (rc != 0) { 8816 /* If no command was found or there was any error, complete the abort 8817 * request with failure. 
8818 */ 8819 bdev_nvme_admin_complete(bio, rc); 8820 } 8821 } 8822 8823 static int 8824 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8825 uint64_t num_blocks) 8826 { 8827 struct spdk_nvme_scc_source_range range = { 8828 .slba = src_offset_blocks, 8829 .nlb = num_blocks - 1 8830 }; 8831 8832 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8833 bio->io_path->qpair->qpair, 8834 &range, 1, dst_offset_blocks, 8835 bdev_nvme_queued_done, bio); 8836 } 8837 8838 static void 8839 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8840 { 8841 const char *action; 8842 uint32_t i; 8843 8844 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8845 action = "reset"; 8846 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8847 action = "abort"; 8848 } else { 8849 action = "none"; 8850 } 8851 8852 spdk_json_write_object_begin(w); 8853 8854 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8855 8856 spdk_json_write_named_object_begin(w, "params"); 8857 spdk_json_write_named_string(w, "action_on_timeout", action); 8858 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8859 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8860 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8861 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8862 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8863 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8864 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8865 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8866 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8867 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8868 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8869 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8870 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8871 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8872 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8873 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8874 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8875 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8876 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8877 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8878 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8879 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8880 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8881 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8882 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8883 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8884 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8885 for (i = 0; i < 32; ++i) { 8886 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8887 
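/* Write the name of each digest whose bit is enabled in the mask. */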
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8888 } 8889 } 8890 spdk_json_write_array_end(w); 8891 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8892 for (i = 0; i < 32; ++i) { 8893 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8894 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8895 } 8896 } 8897 8898 spdk_json_write_array_end(w); 8899 spdk_json_write_named_bool(w, "rdma_umr_per_io", g_opts.rdma_umr_per_io); 8900 spdk_json_write_object_end(w); 8901 8902 spdk_json_write_object_end(w); 8903 } 8904 8905 static void 8906 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8907 { 8908 struct spdk_nvme_transport_id trid; 8909 8910 spdk_json_write_object_begin(w); 8911 8912 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8913 8914 spdk_json_write_named_object_begin(w, "params"); 8915 spdk_json_write_named_string(w, "name", ctx->name); 8916 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8917 8918 trid = ctx->trid; 8919 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8920 nvme_bdev_dump_trid_json(&trid, w); 8921 8922 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8923 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8924 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8925 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8926 ctx->bdev_opts.fast_io_fail_timeout_sec); 8927 spdk_json_write_object_end(w); 8928 8929 spdk_json_write_object_end(w); 8930 } 8931 8932 #ifdef SPDK_CONFIG_NVME_CUSE 8933 static void 8934 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8935 struct nvme_ctrlr *nvme_ctrlr) 8936 { 8937 size_t cuse_name_size = 128; 8938 char cuse_name[cuse_name_size]; 8939 8940 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8941 cuse_name, &cuse_name_size) != 0) { 8942 return; 8943 } 8944 8945 spdk_json_write_object_begin(w); 8946 8947 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8948 8949 spdk_json_write_named_object_begin(w, "params"); 8950 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8951 spdk_json_write_object_end(w); 8952 8953 spdk_json_write_object_end(w); 8954 } 8955 #endif 8956 8957 static void 8958 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8959 struct nvme_ctrlr *nvme_ctrlr, 8960 struct nvme_path_id *path_id) 8961 { 8962 struct spdk_nvme_transport_id *trid; 8963 const struct spdk_nvme_ctrlr_opts *opts; 8964 8965 if (nvme_ctrlr->opts.from_discovery_service) { 8966 /* Do not emit an RPC for this - it will be implicitly 8967 * covered by a separate bdev_nvme_start_discovery or 8968 * bdev_nvme_start_mdns_discovery RPC. 
8969 */ 8970 return; 8971 } 8972 8973 trid = &path_id->trid; 8974 8975 spdk_json_write_object_begin(w); 8976 8977 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8978 8979 spdk_json_write_named_object_begin(w, "params"); 8980 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8981 nvme_bdev_dump_trid_json(trid, w); 8982 spdk_json_write_named_bool(w, "prchk_reftag", 8983 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8984 spdk_json_write_named_bool(w, "prchk_guard", 8985 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8986 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8987 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8988 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8989 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8990 if (nvme_ctrlr->psk != NULL) { 8991 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8992 } 8993 if (nvme_ctrlr->dhchap_key != NULL) { 8994 spdk_json_write_named_string(w, "dhchap_key", 8995 spdk_key_get_name(nvme_ctrlr->dhchap_key)); 8996 } 8997 if (nvme_ctrlr->dhchap_ctrlr_key != NULL) { 8998 spdk_json_write_named_string(w, "dhchap_ctrlr_key", 8999 spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key)); 9000 } 9001 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 9002 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 9003 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 9004 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 9005 if (opts->src_addr[0] != '\0') { 9006 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 9007 } 9008 if (opts->src_svcid[0] != '\0') { 9009 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 9010 } 9011 9012 if (nvme_ctrlr->opts.multipath) { 9013 spdk_json_write_named_string(w, "multipath", "multipath"); 9014 } 9015 spdk_json_write_object_end(w); 9016 9017 spdk_json_write_object_end(w); 9018 } 9019 9020 static void 9021 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 9022 { 9023 spdk_json_write_object_begin(w); 9024 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 9025 9026 spdk_json_write_named_object_begin(w, "params"); 9027 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 9028 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 9029 spdk_json_write_object_end(w); 9030 9031 spdk_json_write_object_end(w); 9032 } 9033 9034 static int 9035 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 9036 { 9037 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9038 struct nvme_ctrlr *nvme_ctrlr; 9039 struct discovery_ctx *ctx; 9040 struct nvme_path_id *path_id; 9041 9042 bdev_nvme_opts_config_json(w); 9043 9044 pthread_mutex_lock(&g_bdev_nvme_mutex); 9045 9046 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 9047 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 9048 path_id = nvme_ctrlr->active_path_id; 9049 assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 9050 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 9051 9052 path_id = TAILQ_NEXT(path_id, link); 9053 while (path_id != NULL) { 9054 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 9055 path_id = TAILQ_NEXT(path_id, link); 9056 } 9057 9058 #ifdef SPDK_CONFIG_NVME_CUSE 9059 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 9060 #endif 9061 } 9062 } 9063 9064 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9065 if (!ctx->from_mdns_discovery_service) { 
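/* Contexts created by mDNS discovery are skipped here; they are dumped below by
 * bdev_nvme_mdns_discovery_config_json().
 */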
9066 bdev_nvme_discovery_config_json(w, ctx); 9067 } 9068 } 9069 9070 bdev_nvme_mdns_discovery_config_json(w); 9071 9072 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 9073 * before enabling hotplug poller. 9074 */ 9075 bdev_nvme_hotplug_config_json(w); 9076 9077 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9078 return 0; 9079 } 9080 9081 struct spdk_nvme_ctrlr * 9082 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 9083 { 9084 struct nvme_bdev *nbdev; 9085 struct nvme_ns *nvme_ns; 9086 9087 if (!bdev || bdev->module != &nvme_if) { 9088 return NULL; 9089 } 9090 9091 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 9092 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 9093 assert(nvme_ns != NULL); 9094 9095 return nvme_ns->ctrlr->ctrlr; 9096 } 9097 9098 static bool 9099 nvme_io_path_is_current(struct nvme_io_path *io_path) 9100 { 9101 const struct nvme_bdev_channel *nbdev_ch; 9102 bool current; 9103 9104 if (!nvme_io_path_is_available(io_path)) { 9105 return false; 9106 } 9107 9108 nbdev_ch = io_path->nbdev_ch; 9109 if (nbdev_ch == NULL) { 9110 current = false; 9111 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 9112 struct nvme_io_path *optimized_io_path = NULL; 9113 9114 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 9115 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 9116 break; 9117 } 9118 } 9119 9120 /* A non-optimized path is only current if there are no optimized paths. */ 9121 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 9122 (optimized_io_path == NULL); 9123 } else { 9124 if (nbdev_ch->current_io_path) { 9125 current = (io_path == nbdev_ch->current_io_path); 9126 } else { 9127 struct nvme_io_path *first_path; 9128 9129 /* We arrived here as there are no optimized paths for active-passive 9130 * mode. Check if this io_path is the first one available on the list. 9131 */ 9132 current = false; 9133 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 9134 if (nvme_io_path_is_available(first_path)) { 9135 current = (io_path == first_path); 9136 break; 9137 } 9138 } 9139 } 9140 } 9141 9142 return current; 9143 } 9144 9145 static struct nvme_ctrlr * 9146 bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev) 9147 { 9148 struct nvme_ctrlr *next; 9149 9150 /* Must be called under g_bdev_nvme_mutex */ 9151 next = prev != NULL ? 
TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 9152 while (next != NULL) { 9153 /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */ 9154 pthread_mutex_lock(&next->mutex); 9155 if (next->ref > 0) { 9156 next->ref++; 9157 pthread_mutex_unlock(&next->mutex); 9158 return next; 9159 } 9160 9161 pthread_mutex_unlock(&next->mutex); 9162 next = TAILQ_NEXT(next, tailq); 9163 } 9164 9165 return NULL; 9166 } 9167 9168 struct bdev_nvme_set_keys_ctx { 9169 struct nvme_ctrlr *nctrlr; 9170 struct spdk_key *dhchap_key; 9171 struct spdk_key *dhchap_ctrlr_key; 9172 struct spdk_thread *thread; 9173 bdev_nvme_set_keys_cb cb_fn; 9174 void *cb_ctx; 9175 int status; 9176 }; 9177 9178 static void 9179 bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx) 9180 { 9181 if (ctx == NULL) { 9182 return; 9183 } 9184 9185 spdk_keyring_put_key(ctx->dhchap_key); 9186 spdk_keyring_put_key(ctx->dhchap_ctrlr_key); 9187 free(ctx); 9188 } 9189 9190 static void 9191 _bdev_nvme_set_keys_done(void *_ctx) 9192 { 9193 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9194 9195 ctx->cb_fn(ctx->cb_ctx, ctx->status); 9196 9197 if (ctx->nctrlr != NULL) { 9198 nvme_ctrlr_put_ref(ctx->nctrlr); 9199 } 9200 bdev_nvme_free_set_keys_ctx(ctx); 9201 } 9202 9203 static void 9204 bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status) 9205 { 9206 ctx->status = status; 9207 spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx); 9208 } 9209 9210 static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx); 9211 9212 static void 9213 bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx) 9214 { 9215 struct nvme_ctrlr *next; 9216 9217 pthread_mutex_lock(&g_bdev_nvme_mutex); 9218 next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr); 9219 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9220 9221 nvme_ctrlr_put_ref(ctx->nctrlr); 9222 ctx->nctrlr = next; 9223 9224 if (next == NULL) { 9225 bdev_nvme_set_keys_done(ctx, 0); 9226 } else { 9227 bdev_nvme_authenticate_ctrlr(ctx); 9228 } 9229 } 9230 9231 static void 9232 bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status) 9233 { 9234 struct bdev_nvme_set_keys_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 9235 9236 if (status != 0) { 9237 bdev_nvme_set_keys_done(ctx, status); 9238 return; 9239 } 9240 bdev_nvme_authenticate_ctrlr_continue(ctx); 9241 } 9242 9243 static void 9244 bdev_nvme_authenticate_qpair_done(void *ctx, int status) 9245 { 9246 spdk_for_each_channel_continue(ctx, status); 9247 } 9248 9249 static void 9250 bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i) 9251 { 9252 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9253 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 9254 struct nvme_qpair *qpair = ctrlr_ch->qpair; 9255 int rc; 9256 9257 if (!nvme_qpair_is_connected(qpair)) { 9258 spdk_for_each_channel_continue(i, 0); 9259 return; 9260 } 9261 9262 rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i); 9263 if (rc != 0) { 9264 spdk_for_each_channel_continue(i, rc); 9265 } 9266 } 9267 9268 static void 9269 bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status) 9270 { 9271 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9272 9273 if (status != 0) { 9274 bdev_nvme_set_keys_done(ctx, status); 9275 return; 9276 } 9277 9278 spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx, 9279 bdev_nvme_authenticate_qpairs_done); 9280 } 9281 9282 static void 9283 bdev_nvme_authenticate_ctrlr(struct 
bdev_nvme_set_keys_ctx *ctx) 9284 { 9285 struct spdk_nvme_ctrlr_key_opts opts = {}; 9286 struct nvme_ctrlr *nctrlr = ctx->nctrlr; 9287 int rc; 9288 9289 opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key); 9290 opts.dhchap_key = ctx->dhchap_key; 9291 opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key; 9292 rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts); 9293 if (rc != 0) { 9294 bdev_nvme_set_keys_done(ctx, rc); 9295 return; 9296 } 9297 9298 if (ctx->dhchap_key != NULL) { 9299 rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr, 9300 bdev_nvme_authenticate_ctrlr_done, ctx); 9301 if (rc != 0) { 9302 bdev_nvme_set_keys_done(ctx, rc); 9303 } 9304 } else { 9305 bdev_nvme_authenticate_ctrlr_continue(ctx); 9306 } 9307 } 9308 9309 int 9310 bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key, 9311 bdev_nvme_set_keys_cb cb_fn, void *cb_ctx) 9312 { 9313 struct bdev_nvme_set_keys_ctx *ctx; 9314 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9315 struct nvme_ctrlr *nctrlr; 9316 9317 ctx = calloc(1, sizeof(*ctx)); 9318 if (ctx == NULL) { 9319 return -ENOMEM; 9320 } 9321 9322 if (dhchap_key != NULL) { 9323 ctx->dhchap_key = spdk_keyring_get_key(dhchap_key); 9324 if (ctx->dhchap_key == NULL) { 9325 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name); 9326 bdev_nvme_free_set_keys_ctx(ctx); 9327 return -ENOKEY; 9328 } 9329 } 9330 if (dhchap_ctrlr_key != NULL) { 9331 ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key); 9332 if (ctx->dhchap_ctrlr_key == NULL) { 9333 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name); 9334 bdev_nvme_free_set_keys_ctx(ctx); 9335 return -ENOKEY; 9336 } 9337 } 9338 9339 pthread_mutex_lock(&g_bdev_nvme_mutex); 9340 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 9341 if (nbdev_ctrlr == NULL) { 9342 SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name); 9343 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9344 bdev_nvme_free_set_keys_ctx(ctx); 9345 return -ENODEV; 9346 } 9347 nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL); 9348 if (nctrlr == NULL) { 9349 SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name); 9350 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9351 bdev_nvme_free_set_keys_ctx(ctx); 9352 return -ENODEV; 9353 } 9354 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9355 9356 ctx->nctrlr = nctrlr; 9357 ctx->cb_fn = cb_fn; 9358 ctx->cb_ctx = cb_ctx; 9359 ctx->thread = spdk_get_thread(); 9360 9361 bdev_nvme_authenticate_ctrlr(ctx); 9362 9363 return 0; 9364 } 9365 9366 void 9367 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 9368 { 9369 struct nvme_ns *nvme_ns = io_path->nvme_ns; 9370 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 9371 const struct spdk_nvme_ctrlr_data *cdata; 9372 const struct spdk_nvme_transport_id *trid; 9373 const char *adrfam_str; 9374 9375 spdk_json_write_object_begin(w); 9376 9377 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 9378 9379 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 9380 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 9381 9382 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 9383 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 9384 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 9385 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 9386 9387 spdk_json_write_named_object_begin(w, "transport"); 9388 spdk_json_write_named_string(w, "trtype", trid->trstring); 9389 
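/* traddr is always reported; trsvcid and adrfam are added only when they are set
 * for this transport.
 */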
spdk_json_write_named_string(w, "traddr", trid->traddr); 9390 if (trid->trsvcid[0] != '\0') { 9391 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 9392 } 9393 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 9394 if (adrfam_str) { 9395 spdk_json_write_named_string(w, "adrfam", adrfam_str); 9396 } 9397 spdk_json_write_object_end(w); 9398 9399 spdk_json_write_object_end(w); 9400 } 9401 9402 void 9403 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 9404 { 9405 struct discovery_ctx *ctx; 9406 struct discovery_entry_ctx *entry_ctx; 9407 9408 spdk_json_write_array_begin(w); 9409 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9410 spdk_json_write_object_begin(w); 9411 spdk_json_write_named_string(w, "name", ctx->name); 9412 9413 spdk_json_write_named_object_begin(w, "trid"); 9414 nvme_bdev_dump_trid_json(&ctx->trid, w); 9415 spdk_json_write_object_end(w); 9416 9417 spdk_json_write_named_array_begin(w, "referrals"); 9418 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 9419 spdk_json_write_object_begin(w); 9420 spdk_json_write_named_object_begin(w, "trid"); 9421 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 9422 spdk_json_write_object_end(w); 9423 spdk_json_write_object_end(w); 9424 } 9425 spdk_json_write_array_end(w); 9426 9427 spdk_json_write_object_end(w); 9428 } 9429 spdk_json_write_array_end(w); 9430 } 9431 9432 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 9433 9434 static void 9435 bdev_nvme_trace(void) 9436 { 9437 struct spdk_trace_tpoint_opts opts[] = { 9438 { 9439 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 9440 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 9441 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9442 }, 9443 { 9444 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 9445 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 9446 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9447 } 9448 }; 9449 9450 9451 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 9452 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9453 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9454 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9455 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9456 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9457 } 9458 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 9459