1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved. 6 */ 7 8 #include "spdk/stdinc.h" 9 10 #include "bdev_nvme.h" 11 12 #include "spdk/accel.h" 13 #include "spdk/config.h" 14 #include "spdk/endian.h" 15 #include "spdk/bdev.h" 16 #include "spdk/json.h" 17 #include "spdk/keyring.h" 18 #include "spdk/likely.h" 19 #include "spdk/nvme.h" 20 #include "spdk/nvme_ocssd.h" 21 #include "spdk/nvme_zns.h" 22 #include "spdk/opal.h" 23 #include "spdk/thread.h" 24 #include "spdk/trace.h" 25 #include "spdk/string.h" 26 #include "spdk/util.h" 27 #include "spdk/uuid.h" 28 29 #include "spdk/bdev_module.h" 30 #include "spdk/log.h" 31 32 #include "spdk_internal/usdt.h" 33 #include "spdk_internal/trace_defs.h" 34 35 #define CTRLR_STRING(nvme_ctrlr) \ 36 (spdk_nvme_trtype_is_fabrics(nvme_ctrlr->active_path_id->trid.trtype) ? \ 37 nvme_ctrlr->active_path_id->trid.subnqn : nvme_ctrlr->active_path_id->trid.traddr) 38 39 #define CTRLR_ID(nvme_ctrlr) (spdk_nvme_ctrlr_get_id(nvme_ctrlr->ctrlr)) 40 41 #define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \ 42 SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 43 44 #define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \ 45 SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 46 47 #define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \ 48 SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 49 50 #define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \ 51 SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 52 53 #ifdef DEBUG 54 #define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \ 55 SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__); 56 #else 57 #define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0) 58 #endif 59 60 #define BDEV_STRING(nbdev) (nbdev->disk.name) 61 62 #define NVME_BDEV_ERRLOG(nbdev, format, ...) \ 63 SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 64 65 #define NVME_BDEV_WARNLOG(nbdev, format, ...) \ 66 SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 67 68 #define NVME_BDEV_NOTICELOG(nbdev, format, ...) \ 69 SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 70 71 #define NVME_BDEV_INFOLOG(nbdev, format, ...) \ 72 SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__); 73 74 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 75 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 76 77 #define NSID_STR_LEN 10 78 79 #define SPDK_CONTROLLER_NAME_MAX 512 80 81 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 82 83 struct nvme_bdev_io { 84 /** array of iovecs to transfer. */ 85 struct iovec *iovs; 86 87 /** Number of iovecs in iovs array. */ 88 int iovcnt; 89 90 /** Current iovec position. */ 91 int iovpos; 92 93 /** Offset in current iovec. */ 94 uint32_t iov_offset; 95 96 /** Offset in current iovec. */ 97 uint32_t fused_iov_offset; 98 99 /** array of iovecs to transfer. */ 100 struct iovec *fused_iovs; 101 102 /** Number of iovecs in iovs array. */ 103 int fused_iovcnt; 104 105 /** Current iovec position. 
*/ 106 int fused_iovpos; 107 108 /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path 109 * being reset in a reset I/O. 110 */ 111 struct nvme_io_path *io_path; 112 113 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 114 struct spdk_nvme_cpl cpl; 115 116 /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */ 117 struct spdk_nvme_ns_cmd_ext_io_opts ext_opts; 118 119 /** Keeps track if first of fused commands was submitted */ 120 bool first_fused_submitted; 121 122 /** Keeps track if first of fused commands was completed */ 123 bool first_fused_completed; 124 125 /* How many times the current I/O was retried. */ 126 int32_t retry_count; 127 128 /** Expiration value in ticks to retry the current I/O. */ 129 uint64_t retry_ticks; 130 131 /** Temporary pointer to zone report buffer */ 132 struct spdk_nvme_zns_zone_report *zone_report_buf; 133 134 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 135 uint64_t handled_zones; 136 137 /* Current tsc at submit time. */ 138 uint64_t submit_tsc; 139 140 /* Used to put nvme_bdev_io into the list */ 141 TAILQ_ENTRY(nvme_bdev_io) retry_link; 142 }; 143 144 struct nvme_probe_skip_entry { 145 struct spdk_nvme_transport_id trid; 146 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 147 }; 148 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 149 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 150 g_skipped_nvme_ctrlrs); 151 152 #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \ 153 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \ 154 SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512)) 155 156 #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \ 157 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \ 158 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \ 159 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \ 160 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \ 161 SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192)) 162 163 static struct spdk_bdev_nvme_opts g_opts = { 164 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 165 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 166 .timeout_us = 0, 167 .timeout_admin_us = 0, 168 .transport_retry_count = 4, 169 .arbitration_burst = 0, 170 .low_priority_weight = 0, 171 .medium_priority_weight = 0, 172 .high_priority_weight = 0, 173 .io_queue_requests = 0, 174 .nvme_adminq_poll_period_us = 10000ULL, 175 .nvme_ioq_poll_period_us = 0, 176 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 177 .bdev_retry_count = 3, 178 .ctrlr_loss_timeout_sec = 0, 179 .reconnect_delay_sec = 0, 180 .fast_io_fail_timeout_sec = 0, 181 .transport_ack_timeout = 0, 182 .disable_auto_failback = false, 183 .generate_uuids = false, 184 .transport_tos = 0, 185 .nvme_error_stat = false, 186 .io_path_stat = false, 187 .allow_accel_sequence = false, 188 .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS, 189 .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS, 190 }; 191 192 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 193 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 194 195 static int g_hot_insert_nvme_controller_index = 0; 196 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 197 static bool g_nvme_hotplug_enabled = false; 198 struct spdk_thread *g_bdev_nvme_init_thread; 199 static struct spdk_poller *g_hotplug_poller; 200 static struct 
spdk_poller *g_hotplug_probe_poller; 201 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 202 203 static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 204 struct nvme_async_probe_ctx *ctx); 205 static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 206 struct nvme_async_probe_ctx *ctx); 207 static int bdev_nvme_library_init(void); 208 static void bdev_nvme_library_fini(void); 209 static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, 210 struct spdk_bdev_io *bdev_io); 211 static void bdev_nvme_submit_request(struct spdk_io_channel *ch, 212 struct spdk_bdev_io *bdev_io); 213 static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 214 void *md, uint64_t lba_count, uint64_t lba, 215 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 216 struct spdk_accel_sequence *seq); 217 static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 218 void *md, uint64_t lba_count, uint64_t lba); 219 static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 220 void *md, uint64_t lba_count, uint64_t lba, 221 uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx, 222 struct spdk_accel_sequence *seq, 223 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13); 224 static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 225 void *md, uint64_t lba_count, 226 uint64_t zslba, uint32_t flags); 227 static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 228 void *md, uint64_t lba_count, uint64_t lba, 229 uint32_t flags); 230 static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, 231 struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 232 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 233 uint32_t flags); 234 static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, 235 uint32_t num_zones, struct spdk_bdev_zone_info *info); 236 static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 237 enum spdk_bdev_zone_action action); 238 static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, 239 struct nvme_bdev_io *bio, 240 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 241 static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 242 void *buf, size_t nbytes); 243 static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 244 void *buf, size_t nbytes, void *md_buf, size_t md_len); 245 static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 246 struct iovec *iov, int iovcnt, size_t nbytes, 247 void *md_buf, size_t md_len); 248 static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, 249 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 250 static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio); 251 static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 252 static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 253 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 254 static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr); 255 256 static struct nvme_ns *nvme_ns_alloc(void); 257 static void nvme_ns_free(struct nvme_ns *ns); 258 259 static int 260 nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2) 261 { 262 return ns1->id < ns2->id ? 
-1 : ns1->id > ns2->id; 263 } 264 265 RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp); 266 267 struct spdk_nvme_qpair * 268 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 269 { 270 struct nvme_ctrlr_channel *ctrlr_ch; 271 272 assert(ctrlr_io_ch != NULL); 273 274 ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 275 276 return ctrlr_ch->qpair->qpair; 277 } 278 279 static int 280 bdev_nvme_get_ctx_size(void) 281 { 282 return sizeof(struct nvme_bdev_io); 283 } 284 285 static struct spdk_bdev_module nvme_if = { 286 .name = "nvme", 287 .async_fini = true, 288 .module_init = bdev_nvme_library_init, 289 .module_fini = bdev_nvme_library_fini, 290 .config_json = bdev_nvme_config_json, 291 .get_ctx_size = bdev_nvme_get_ctx_size, 292 293 }; 294 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 295 296 struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs); 297 pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; 298 bool g_bdev_nvme_module_finish; 299 300 struct nvme_bdev_ctrlr * 301 nvme_bdev_ctrlr_get_by_name(const char *name) 302 { 303 struct nvme_bdev_ctrlr *nbdev_ctrlr; 304 305 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 306 if (strcmp(name, nbdev_ctrlr->name) == 0) { 307 break; 308 } 309 } 310 311 return nbdev_ctrlr; 312 } 313 314 static struct nvme_ctrlr * 315 nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr, 316 const struct spdk_nvme_transport_id *trid, const char *hostnqn) 317 { 318 const struct spdk_nvme_ctrlr_opts *opts; 319 struct nvme_ctrlr *nvme_ctrlr; 320 321 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 322 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 323 if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 && 324 strcmp(hostnqn, opts->hostnqn) == 0) { 325 break; 326 } 327 } 328 329 return nvme_ctrlr; 330 } 331 332 struct nvme_ctrlr * 333 nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr, 334 uint16_t cntlid) 335 { 336 struct nvme_ctrlr *nvme_ctrlr; 337 const struct spdk_nvme_ctrlr_data *cdata; 338 339 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 340 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 341 if (cdata->cntlid == cntlid) { 342 break; 343 } 344 } 345 346 return nvme_ctrlr; 347 } 348 349 static struct nvme_bdev * 350 nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid) 351 { 352 struct nvme_bdev *nbdev; 353 354 pthread_mutex_lock(&g_bdev_nvme_mutex); 355 TAILQ_FOREACH(nbdev, &nbdev_ctrlr->bdevs, tailq) { 356 if (nbdev->nsid == nsid) { 357 break; 358 } 359 } 360 pthread_mutex_unlock(&g_bdev_nvme_mutex); 361 362 return nbdev; 363 } 364 365 struct nvme_ns * 366 nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) 367 { 368 struct nvme_ns ns; 369 370 assert(nsid > 0); 371 372 ns.id = nsid; 373 return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns); 374 } 375 376 struct nvme_ns * 377 nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr) 378 { 379 return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces); 380 } 381 382 struct nvme_ns * 383 nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns) 384 { 385 if (ns == NULL) { 386 return NULL; 387 } 388 389 return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 390 } 391 392 static struct nvme_ctrlr * 393 nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn) 394 { 395 struct nvme_bdev_ctrlr *nbdev_ctrlr; 396 struct nvme_ctrlr *nvme_ctrlr = NULL; 397 398 
pthread_mutex_lock(&g_bdev_nvme_mutex); 399 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 400 nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn); 401 if (nvme_ctrlr != NULL) { 402 break; 403 } 404 } 405 pthread_mutex_unlock(&g_bdev_nvme_mutex); 406 407 return nvme_ctrlr; 408 } 409 410 struct nvme_ctrlr * 411 nvme_ctrlr_get_by_name(const char *name) 412 { 413 struct nvme_bdev_ctrlr *nbdev_ctrlr; 414 struct nvme_ctrlr *nvme_ctrlr = NULL; 415 416 if (name == NULL) { 417 return NULL; 418 } 419 420 pthread_mutex_lock(&g_bdev_nvme_mutex); 421 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 422 if (nbdev_ctrlr != NULL) { 423 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 424 } 425 pthread_mutex_unlock(&g_bdev_nvme_mutex); 426 427 return nvme_ctrlr; 428 } 429 430 void 431 nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx) 432 { 433 struct nvme_bdev_ctrlr *nbdev_ctrlr; 434 435 pthread_mutex_lock(&g_bdev_nvme_mutex); 436 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 437 fn(nbdev_ctrlr, ctx); 438 } 439 pthread_mutex_unlock(&g_bdev_nvme_mutex); 440 } 441 442 struct nvme_ctrlr_channel_iter { 443 nvme_ctrlr_for_each_channel_msg fn; 444 nvme_ctrlr_for_each_channel_done cpl; 445 struct spdk_io_channel_iter *i; 446 void *ctx; 447 }; 448 449 void 450 nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status) 451 { 452 spdk_for_each_channel_continue(iter->i, status); 453 } 454 455 static void 456 nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i) 457 { 458 struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 459 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 460 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 461 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 462 463 iter->i = i; 464 iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx); 465 } 466 467 static void 468 nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 469 { 470 struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 471 struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i); 472 473 iter->i = i; 474 iter->cpl(nvme_ctrlr, iter->ctx, status); 475 476 free(iter); 477 } 478 479 void 480 nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr, 481 nvme_ctrlr_for_each_channel_msg fn, void *ctx, 482 nvme_ctrlr_for_each_channel_done cpl) 483 { 484 struct nvme_ctrlr_channel_iter *iter; 485 486 assert(nvme_ctrlr != NULL && fn != NULL); 487 488 iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter)); 489 if (iter == NULL) { 490 SPDK_ERRLOG("Unable to allocate iterator\n"); 491 assert(false); 492 return; 493 } 494 495 iter->fn = fn; 496 iter->cpl = cpl; 497 iter->ctx = ctx; 498 499 spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg, 500 iter, nvme_ctrlr_each_channel_cpl); 501 } 502 503 struct nvme_bdev_channel_iter { 504 nvme_bdev_for_each_channel_msg fn; 505 nvme_bdev_for_each_channel_done cpl; 506 struct spdk_io_channel_iter *i; 507 void *ctx; 508 }; 509 510 void 511 nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status) 512 { 513 spdk_for_each_channel_continue(iter->i, status); 514 } 515 516 static void 517 nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i) 518 { 519 struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 520 struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); 521 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 522 struct 
nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 523 524 iter->i = i; 525 iter->fn(iter, nbdev, nbdev_ch, iter->ctx); 526 } 527 528 static void 529 nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 530 { 531 struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 532 struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i); 533 534 iter->i = i; 535 iter->cpl(nbdev, iter->ctx, status); 536 537 free(iter); 538 } 539 540 void 541 nvme_bdev_for_each_channel(struct nvme_bdev *nbdev, 542 nvme_bdev_for_each_channel_msg fn, void *ctx, 543 nvme_bdev_for_each_channel_done cpl) 544 { 545 struct nvme_bdev_channel_iter *iter; 546 547 assert(nbdev != NULL && fn != NULL); 548 549 iter = calloc(1, sizeof(struct nvme_bdev_channel_iter)); 550 if (iter == NULL) { 551 SPDK_ERRLOG("Unable to allocate iterator\n"); 552 assert(false); 553 return; 554 } 555 556 iter->fn = fn; 557 iter->cpl = cpl; 558 iter->ctx = ctx; 559 560 spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter, 561 nvme_bdev_each_channel_cpl); 562 } 563 564 void 565 nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) 566 { 567 const char *trtype_str; 568 const char *adrfam_str; 569 570 trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); 571 if (trtype_str) { 572 spdk_json_write_named_string(w, "trtype", trtype_str); 573 } 574 575 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 576 if (adrfam_str) { 577 spdk_json_write_named_string(w, "adrfam", adrfam_str); 578 } 579 580 if (trid->traddr[0] != '\0') { 581 spdk_json_write_named_string(w, "traddr", trid->traddr); 582 } 583 584 if (trid->trsvcid[0] != '\0') { 585 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 586 } 587 588 if (trid->subnqn[0] != '\0') { 589 spdk_json_write_named_string(w, "subnqn", trid->subnqn); 590 } 591 } 592 593 static void 594 nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr, 595 struct nvme_ctrlr *nvme_ctrlr) 596 { 597 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name); 598 pthread_mutex_lock(&g_bdev_nvme_mutex); 599 600 TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 601 if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) { 602 pthread_mutex_unlock(&g_bdev_nvme_mutex); 603 604 return; 605 } 606 TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 607 608 pthread_mutex_unlock(&g_bdev_nvme_mutex); 609 610 assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs)); 611 612 free(nbdev_ctrlr->name); 613 free(nbdev_ctrlr); 614 } 615 616 static void 617 _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 618 { 619 struct nvme_path_id *path_id, *tmp_path; 620 struct nvme_ns *ns, *tmp_ns; 621 622 free(nvme_ctrlr->copied_ana_desc); 623 spdk_free(nvme_ctrlr->ana_log_page); 624 625 if (nvme_ctrlr->opal_dev) { 626 spdk_opal_dev_destruct(nvme_ctrlr->opal_dev); 627 nvme_ctrlr->opal_dev = NULL; 628 } 629 630 if (nvme_ctrlr->nbdev_ctrlr) { 631 nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr); 632 } 633 634 RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) { 635 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns); 636 nvme_ns_free(ns); 637 } 638 639 TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) { 640 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 641 free(path_id); 642 } 643 644 pthread_mutex_destroy(&nvme_ctrlr->mutex); 645 spdk_keyring_put_key(nvme_ctrlr->psk); 646 spdk_keyring_put_key(nvme_ctrlr->dhchap_key); 647 spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key); 648 free(nvme_ctrlr); 649 
650 pthread_mutex_lock(&g_bdev_nvme_mutex); 651 if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 652 pthread_mutex_unlock(&g_bdev_nvme_mutex); 653 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 654 spdk_bdev_module_fini_done(); 655 return; 656 } 657 pthread_mutex_unlock(&g_bdev_nvme_mutex); 658 } 659 660 static int 661 nvme_detach_poller(void *arg) 662 { 663 struct nvme_ctrlr *nvme_ctrlr = arg; 664 int rc; 665 666 rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx); 667 if (rc != -EAGAIN) { 668 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 669 _nvme_ctrlr_delete(nvme_ctrlr); 670 } 671 672 return SPDK_POLLER_BUSY; 673 } 674 675 static void 676 nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr) 677 { 678 int rc; 679 680 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 681 682 if (spdk_interrupt_mode_is_enabled()) { 683 spdk_interrupt_unregister(&nvme_ctrlr->intr); 684 } 685 686 /* First, unregister the adminq poller, as the driver will poll adminq if necessary */ 687 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 688 689 /* If we got here, the reset/detach poller cannot be active */ 690 assert(nvme_ctrlr->reset_detach_poller == NULL); 691 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller, 692 nvme_ctrlr, 1000); 693 if (nvme_ctrlr->reset_detach_poller == NULL) { 694 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n"); 695 goto error; 696 } 697 698 rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx); 699 if (rc != 0) { 700 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n"); 701 goto error; 702 } 703 704 return; 705 error: 706 /* We don't have a good way to handle errors here, so just do what we can and delete the 707 * controller without detaching the underlying NVMe device. 
708 */ 709 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 710 _nvme_ctrlr_delete(nvme_ctrlr); 711 } 712 713 static void 714 nvme_ctrlr_unregister_cb(void *io_device) 715 { 716 struct nvme_ctrlr *nvme_ctrlr = io_device; 717 718 nvme_ctrlr_delete(nvme_ctrlr); 719 } 720 721 static void 722 nvme_ctrlr_unregister(void *ctx) 723 { 724 struct nvme_ctrlr *nvme_ctrlr = ctx; 725 726 spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb); 727 } 728 729 static bool 730 nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr) 731 { 732 if (!nvme_ctrlr->destruct) { 733 return false; 734 } 735 736 if (nvme_ctrlr->ref > 0) { 737 return false; 738 } 739 740 if (nvme_ctrlr->resetting) { 741 return false; 742 } 743 744 if (nvme_ctrlr->ana_log_page_updating) { 745 return false; 746 } 747 748 if (nvme_ctrlr->io_path_cache_clearing) { 749 return false; 750 } 751 752 return true; 753 } 754 755 static void 756 nvme_ctrlr_put_ref(struct nvme_ctrlr *nvme_ctrlr) 757 { 758 pthread_mutex_lock(&nvme_ctrlr->mutex); 759 SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref); 760 761 assert(nvme_ctrlr->ref > 0); 762 nvme_ctrlr->ref--; 763 764 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 765 pthread_mutex_unlock(&nvme_ctrlr->mutex); 766 return; 767 } 768 769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 770 771 spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr); 772 } 773 774 static void 775 nvme_ctrlr_get_ref(struct nvme_ctrlr *nvme_ctrlr) 776 { 777 pthread_mutex_lock(&nvme_ctrlr->mutex); 778 nvme_ctrlr->ref++; 779 pthread_mutex_unlock(&nvme_ctrlr->mutex); 780 } 781 782 static void 783 bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch) 784 { 785 nbdev_ch->current_io_path = NULL; 786 nbdev_ch->rr_counter = 0; 787 } 788 789 static struct nvme_io_path * 790 _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 791 { 792 struct nvme_io_path *io_path; 793 794 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 795 if (io_path->nvme_ns == nvme_ns) { 796 break; 797 } 798 } 799 800 return io_path; 801 } 802 803 static struct nvme_io_path * 804 nvme_io_path_alloc(void) 805 { 806 struct nvme_io_path *io_path; 807 808 io_path = calloc(1, sizeof(*io_path)); 809 if (io_path == NULL) { 810 SPDK_ERRLOG("Failed to alloc io_path.\n"); 811 return NULL; 812 } 813 814 if (g_opts.io_path_stat) { 815 io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 816 if (io_path->stat == NULL) { 817 free(io_path); 818 SPDK_ERRLOG("Failed to alloc io_path stat.\n"); 819 return NULL; 820 } 821 spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 822 } 823 824 return io_path; 825 } 826 827 static void 828 nvme_io_path_free(struct nvme_io_path *io_path) 829 { 830 free(io_path->stat); 831 free(io_path); 832 } 833 834 static int 835 _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns) 836 { 837 struct nvme_io_path *io_path; 838 struct spdk_io_channel *ch; 839 struct nvme_ctrlr_channel *ctrlr_ch; 840 struct nvme_qpair *nvme_qpair; 841 842 io_path = nvme_io_path_alloc(); 843 if (io_path == NULL) { 844 return -ENOMEM; 845 } 846 847 io_path->nvme_ns = nvme_ns; 848 849 ch = spdk_get_io_channel(nvme_ns->ctrlr); 850 if (ch == NULL) { 851 nvme_io_path_free(io_path); 852 SPDK_ERRLOG("Failed to alloc io_channel.\n"); 853 return -ENOMEM; 854 } 855 856 ctrlr_ch = spdk_io_channel_get_ctx(ch); 857 858 nvme_qpair = ctrlr_ch->qpair; 859 assert(nvme_qpair != NULL); 860 861 
io_path->qpair = nvme_qpair; 862 TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq); 863 864 io_path->nbdev_ch = nbdev_ch; 865 STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq); 866 867 bdev_nvme_clear_current_io_path(nbdev_ch); 868 869 return 0; 870 } 871 872 static void 873 bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch, 874 struct nvme_io_path *io_path) 875 { 876 struct nvme_bdev_io *bio; 877 878 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 879 if (bio->io_path == io_path) { 880 bio->io_path = NULL; 881 } 882 } 883 } 884 885 static void 886 _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path) 887 { 888 struct spdk_io_channel *ch; 889 struct nvme_qpair *nvme_qpair; 890 struct nvme_ctrlr_channel *ctrlr_ch; 891 struct nvme_bdev *nbdev; 892 893 nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch)); 894 895 /* Add the statistics to nvme_ns before this path is destroyed. */ 896 pthread_mutex_lock(&nbdev->mutex); 897 if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) { 898 spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat); 899 } 900 pthread_mutex_unlock(&nbdev->mutex); 901 902 bdev_nvme_clear_current_io_path(nbdev_ch); 903 bdev_nvme_clear_retry_io_path(nbdev_ch, io_path); 904 905 STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq); 906 io_path->nbdev_ch = NULL; 907 908 nvme_qpair = io_path->qpair; 909 assert(nvme_qpair != NULL); 910 911 ctrlr_ch = nvme_qpair->ctrlr_ch; 912 assert(ctrlr_ch != NULL); 913 914 ch = spdk_io_channel_from_ctx(ctrlr_ch); 915 spdk_put_io_channel(ch); 916 917 /* After an io_path is removed, I/Os submitted to it may complete and update statistics 918 * of the io_path. To avoid heap-use-after-free error from this case, do not free the 919 * io_path here but free the io_path when the associated qpair is freed. It is ensured 920 * that all I/Os submitted to the io_path are completed when the associated qpair is freed. 921 */ 922 } 923 924 static void 925 _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch) 926 { 927 struct nvme_io_path *io_path, *tmp_io_path; 928 929 STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) { 930 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 931 } 932 } 933 934 static int 935 bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf) 936 { 937 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 938 struct nvme_bdev *nbdev = io_device; 939 struct nvme_ns *nvme_ns; 940 int rc; 941 942 STAILQ_INIT(&nbdev_ch->io_path_list); 943 TAILQ_INIT(&nbdev_ch->retry_io_list); 944 945 pthread_mutex_lock(&nbdev->mutex); 946 947 nbdev_ch->mp_policy = nbdev->mp_policy; 948 nbdev_ch->mp_selector = nbdev->mp_selector; 949 nbdev_ch->rr_min_io = nbdev->rr_min_io; 950 951 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 952 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 953 if (rc != 0) { 954 pthread_mutex_unlock(&nbdev->mutex); 955 956 _bdev_nvme_delete_io_paths(nbdev_ch); 957 return rc; 958 } 959 } 960 pthread_mutex_unlock(&nbdev->mutex); 961 962 return 0; 963 } 964 965 /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'. 966 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'. 
967 */ 968 static inline void 969 __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status, 970 const struct spdk_nvme_cpl *cpl) 971 { 972 spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx, 973 (uintptr_t)bdev_io); 974 if (cpl) { 975 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 976 } else { 977 spdk_bdev_io_complete(bdev_io, status); 978 } 979 } 980 981 static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch); 982 983 static void 984 bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf) 985 { 986 struct nvme_bdev_channel *nbdev_ch = ctx_buf; 987 988 bdev_nvme_abort_retry_ios(nbdev_ch); 989 _bdev_nvme_delete_io_paths(nbdev_ch); 990 } 991 992 static inline bool 993 bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type) 994 { 995 switch (io_type) { 996 case SPDK_BDEV_IO_TYPE_RESET: 997 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 998 case SPDK_BDEV_IO_TYPE_ABORT: 999 return true; 1000 default: 1001 break; 1002 } 1003 1004 return false; 1005 } 1006 1007 static inline bool 1008 nvme_ns_is_active(struct nvme_ns *nvme_ns) 1009 { 1010 if (spdk_unlikely(nvme_ns->ana_state_updating)) { 1011 return false; 1012 } 1013 1014 if (spdk_unlikely(nvme_ns->ns == NULL)) { 1015 return false; 1016 } 1017 1018 return true; 1019 } 1020 1021 static inline bool 1022 nvme_ns_is_accessible(struct nvme_ns *nvme_ns) 1023 { 1024 if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) { 1025 return false; 1026 } 1027 1028 switch (nvme_ns->ana_state) { 1029 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1030 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1031 return true; 1032 default: 1033 break; 1034 } 1035 1036 return false; 1037 } 1038 1039 static inline bool 1040 nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair) 1041 { 1042 if (spdk_unlikely(nvme_qpair->qpair == NULL)) { 1043 return false; 1044 } 1045 1046 if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1047 SPDK_NVME_QPAIR_FAILURE_NONE)) { 1048 return false; 1049 } 1050 1051 if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) { 1052 return false; 1053 } 1054 1055 return true; 1056 } 1057 1058 static inline bool 1059 nvme_io_path_is_available(struct nvme_io_path *io_path) 1060 { 1061 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1062 return false; 1063 } 1064 1065 if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) { 1066 return false; 1067 } 1068 1069 return true; 1070 } 1071 1072 static inline bool 1073 nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr) 1074 { 1075 if (nvme_ctrlr->destruct) { 1076 return true; 1077 } 1078 1079 if (nvme_ctrlr->fast_io_fail_timedout) { 1080 return true; 1081 } 1082 1083 if (nvme_ctrlr->resetting) { 1084 if (nvme_ctrlr->opts.reconnect_delay_sec != 0) { 1085 return false; 1086 } else { 1087 return true; 1088 } 1089 } 1090 1091 if (nvme_ctrlr->reconnect_is_delayed) { 1092 return false; 1093 } 1094 1095 if (nvme_ctrlr->disabled) { 1096 return true; 1097 } 1098 1099 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 1100 return true; 1101 } else { 1102 return false; 1103 } 1104 } 1105 1106 static bool 1107 nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr) 1108 { 1109 if (nvme_ctrlr->destruct) { 1110 return false; 1111 } 1112 1113 if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 1114 return false; 1115 } 1116 1117 if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) { 1118 return false; 1119 } 1120 1121 if (nvme_ctrlr->disabled) { 1122 return false; 1123 } 
1124 1125 return true; 1126 } 1127 1128 /* Simulate circular linked list. */ 1129 static inline struct nvme_io_path * 1130 nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path) 1131 { 1132 struct nvme_io_path *next_path; 1133 1134 if (prev_path != NULL) { 1135 next_path = STAILQ_NEXT(prev_path, stailq); 1136 if (next_path != NULL) { 1137 return next_path; 1138 } 1139 } 1140 1141 return STAILQ_FIRST(&nbdev_ch->io_path_list); 1142 } 1143 1144 static struct nvme_io_path * 1145 _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1146 { 1147 struct nvme_io_path *io_path, *start, *non_optimized = NULL; 1148 1149 start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path); 1150 1151 io_path = start; 1152 do { 1153 if (spdk_likely(nvme_io_path_is_available(io_path))) { 1154 switch (io_path->nvme_ns->ana_state) { 1155 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1156 nbdev_ch->current_io_path = io_path; 1157 return io_path; 1158 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1159 if (non_optimized == NULL) { 1160 non_optimized = io_path; 1161 } 1162 break; 1163 default: 1164 assert(false); 1165 break; 1166 } 1167 } 1168 io_path = nvme_io_path_get_next(nbdev_ch, io_path); 1169 } while (io_path != start); 1170 1171 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 1172 /* We come here only if there is no optimized path. Cache even non_optimized 1173 * path for load balance across multiple non_optimized paths. 1174 */ 1175 nbdev_ch->current_io_path = non_optimized; 1176 } 1177 1178 return non_optimized; 1179 } 1180 1181 static struct nvme_io_path * 1182 _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch) 1183 { 1184 struct nvme_io_path *io_path; 1185 struct nvme_io_path *optimized = NULL, *non_optimized = NULL; 1186 uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX; 1187 uint32_t num_outstanding_reqs; 1188 1189 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1190 if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) { 1191 /* The device is currently resetting. 
*/ 1192 continue; 1193 } 1194 1195 if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) { 1196 continue; 1197 } 1198 1199 num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair); 1200 switch (io_path->nvme_ns->ana_state) { 1201 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1202 if (num_outstanding_reqs < opt_min_qd) { 1203 opt_min_qd = num_outstanding_reqs; 1204 optimized = io_path; 1205 } 1206 break; 1207 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1208 if (num_outstanding_reqs < non_opt_min_qd) { 1209 non_opt_min_qd = num_outstanding_reqs; 1210 non_optimized = io_path; 1211 } 1212 break; 1213 default: 1214 break; 1215 } 1216 } 1217 1218 /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */ 1219 if (optimized != NULL) { 1220 return optimized; 1221 } 1222 1223 return non_optimized; 1224 } 1225 1226 static inline struct nvme_io_path * 1227 bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch) 1228 { 1229 if (spdk_likely(nbdev_ch->current_io_path != NULL)) { 1230 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) { 1231 return nbdev_ch->current_io_path; 1232 } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1233 if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) { 1234 return nbdev_ch->current_io_path; 1235 } 1236 nbdev_ch->rr_counter = 0; 1237 } 1238 } 1239 1240 if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE || 1241 nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 1242 return _bdev_nvme_find_io_path(nbdev_ch); 1243 } else { 1244 return _bdev_nvme_find_io_path_min_qd(nbdev_ch); 1245 } 1246 } 1247 1248 /* Return true if there is any io_path whose qpair is active or ctrlr is not failed, 1249 * or false otherwise. 1250 * 1251 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace 1252 * is likely to be non-accessible now but may become accessible. 1253 * 1254 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr 1255 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed 1256 * when starting to reset it but it is set to failed when the reset failed. Hence, if 1257 * a ctrlr is unfailed, it is likely that it works fine or is resetting. 
1258 */ 1259 static bool 1260 any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch) 1261 { 1262 struct nvme_io_path *io_path; 1263 1264 if (nbdev_ch->resetting) { 1265 return false; 1266 } 1267 1268 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 1269 if (io_path->nvme_ns->ana_transition_timedout) { 1270 continue; 1271 } 1272 1273 if (nvme_qpair_is_connected(io_path->qpair) || 1274 !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) { 1275 return true; 1276 } 1277 } 1278 1279 return false; 1280 } 1281 1282 static void 1283 bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 1284 { 1285 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 1286 struct spdk_io_channel *ch; 1287 1288 if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) { 1289 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 1290 } else { 1291 ch = spdk_io_channel_from_ctx(nbdev_ch); 1292 bdev_nvme_submit_request(ch, bdev_io); 1293 } 1294 } 1295 1296 static int 1297 bdev_nvme_retry_ios(void *arg) 1298 { 1299 struct nvme_bdev_channel *nbdev_ch = arg; 1300 struct nvme_bdev_io *bio, *tmp_bio; 1301 uint64_t now, delay_us; 1302 1303 now = spdk_get_ticks(); 1304 1305 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1306 if (bio->retry_ticks > now) { 1307 break; 1308 } 1309 1310 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1311 1312 bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio)); 1313 } 1314 1315 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1316 1317 bio = TAILQ_FIRST(&nbdev_ch->retry_io_list); 1318 if (bio != NULL) { 1319 delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz(); 1320 1321 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1322 delay_us); 1323 } 1324 1325 return SPDK_POLLER_BUSY; 1326 } 1327 1328 static void 1329 bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch, 1330 struct nvme_bdev_io *bio, uint64_t delay_ms) 1331 { 1332 struct nvme_bdev_io *tmp_bio; 1333 1334 bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL; 1335 1336 TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) { 1337 if (tmp_bio->retry_ticks <= bio->retry_ticks) { 1338 TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio, 1339 retry_link); 1340 return; 1341 } 1342 } 1343 1344 /* No earlier I/Os were found. This I/O must be the new head. 
*/ 1345 TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link); 1346 1347 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1348 1349 nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch, 1350 delay_ms * 1000ULL); 1351 } 1352 1353 static void 1354 bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch) 1355 { 1356 struct nvme_bdev_io *bio, *tmp_bio; 1357 1358 TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) { 1359 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1360 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1361 } 1362 1363 spdk_poller_unregister(&nbdev_ch->retry_io_poller); 1364 } 1365 1366 static int 1367 bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch, 1368 struct nvme_bdev_io *bio_to_abort) 1369 { 1370 struct nvme_bdev_io *bio; 1371 1372 TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) { 1373 if (bio == bio_to_abort) { 1374 TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link); 1375 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL); 1376 return 0; 1377 } 1378 } 1379 1380 return -ENOENT; 1381 } 1382 1383 static void 1384 bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl) 1385 { 1386 struct nvme_bdev *nbdev; 1387 uint16_t sct, sc; 1388 1389 assert(spdk_nvme_cpl_is_error(cpl)); 1390 1391 nbdev = bdev_io->bdev->ctxt; 1392 1393 if (nbdev->err_stat == NULL) { 1394 return; 1395 } 1396 1397 sct = cpl->status.sct; 1398 sc = cpl->status.sc; 1399 1400 pthread_mutex_lock(&nbdev->mutex); 1401 1402 nbdev->err_stat->status_type[sct]++; 1403 switch (sct) { 1404 case SPDK_NVME_SCT_GENERIC: 1405 case SPDK_NVME_SCT_COMMAND_SPECIFIC: 1406 case SPDK_NVME_SCT_MEDIA_ERROR: 1407 case SPDK_NVME_SCT_PATH: 1408 nbdev->err_stat->status[sct][sc]++; 1409 break; 1410 default: 1411 break; 1412 } 1413 1414 pthread_mutex_unlock(&nbdev->mutex); 1415 } 1416 1417 static inline void 1418 bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio) 1419 { 1420 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1421 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 1422 uint32_t blocklen = bdev_io->bdev->blocklen; 1423 struct spdk_bdev_io_stat *stat; 1424 uint64_t tsc_diff; 1425 1426 if (bio->io_path->stat == NULL) { 1427 return; 1428 } 1429 1430 tsc_diff = spdk_get_ticks() - bio->submit_tsc; 1431 stat = bio->io_path->stat; 1432 1433 switch (bdev_io->type) { 1434 case SPDK_BDEV_IO_TYPE_READ: 1435 stat->bytes_read += num_blocks * blocklen; 1436 stat->num_read_ops++; 1437 stat->read_latency_ticks += tsc_diff; 1438 if (stat->max_read_latency_ticks < tsc_diff) { 1439 stat->max_read_latency_ticks = tsc_diff; 1440 } 1441 if (stat->min_read_latency_ticks > tsc_diff) { 1442 stat->min_read_latency_ticks = tsc_diff; 1443 } 1444 break; 1445 case SPDK_BDEV_IO_TYPE_WRITE: 1446 stat->bytes_written += num_blocks * blocklen; 1447 stat->num_write_ops++; 1448 stat->write_latency_ticks += tsc_diff; 1449 if (stat->max_write_latency_ticks < tsc_diff) { 1450 stat->max_write_latency_ticks = tsc_diff; 1451 } 1452 if (stat->min_write_latency_ticks > tsc_diff) { 1453 stat->min_write_latency_ticks = tsc_diff; 1454 } 1455 break; 1456 case SPDK_BDEV_IO_TYPE_UNMAP: 1457 stat->bytes_unmapped += num_blocks * blocklen; 1458 stat->num_unmap_ops++; 1459 stat->unmap_latency_ticks += tsc_diff; 1460 if (stat->max_unmap_latency_ticks < tsc_diff) { 1461 stat->max_unmap_latency_ticks = tsc_diff; 1462 } 1463 if (stat->min_unmap_latency_ticks > 
tsc_diff) { 1464 stat->min_unmap_latency_ticks = tsc_diff; 1465 } 1466 break; 1467 case SPDK_BDEV_IO_TYPE_ZCOPY: 1468 /* Track the data in the start phase only */ 1469 if (!bdev_io->u.bdev.zcopy.start) { 1470 break; 1471 } 1472 if (bdev_io->u.bdev.zcopy.populate) { 1473 stat->bytes_read += num_blocks * blocklen; 1474 stat->num_read_ops++; 1475 stat->read_latency_ticks += tsc_diff; 1476 if (stat->max_read_latency_ticks < tsc_diff) { 1477 stat->max_read_latency_ticks = tsc_diff; 1478 } 1479 if (stat->min_read_latency_ticks > tsc_diff) { 1480 stat->min_read_latency_ticks = tsc_diff; 1481 } 1482 } else { 1483 stat->bytes_written += num_blocks * blocklen; 1484 stat->num_write_ops++; 1485 stat->write_latency_ticks += tsc_diff; 1486 if (stat->max_write_latency_ticks < tsc_diff) { 1487 stat->max_write_latency_ticks = tsc_diff; 1488 } 1489 if (stat->min_write_latency_ticks > tsc_diff) { 1490 stat->min_write_latency_ticks = tsc_diff; 1491 } 1492 } 1493 break; 1494 case SPDK_BDEV_IO_TYPE_COPY: 1495 stat->bytes_copied += num_blocks * blocklen; 1496 stat->num_copy_ops++; 1497 stat->copy_latency_ticks += tsc_diff; 1498 if (stat->max_copy_latency_ticks < tsc_diff) { 1499 stat->max_copy_latency_ticks = tsc_diff; 1500 } 1501 if (stat->min_copy_latency_ticks > tsc_diff) { 1502 stat->min_copy_latency_ticks = tsc_diff; 1503 } 1504 break; 1505 default: 1506 break; 1507 } 1508 } 1509 1510 static bool 1511 bdev_nvme_check_retry_io(struct nvme_bdev_io *bio, 1512 const struct spdk_nvme_cpl *cpl, 1513 struct nvme_bdev_channel *nbdev_ch, 1514 uint64_t *_delay_ms) 1515 { 1516 struct nvme_io_path *io_path = bio->io_path; 1517 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 1518 const struct spdk_nvme_ctrlr_data *cdata; 1519 1520 if (spdk_nvme_cpl_is_path_error(cpl) || 1521 spdk_nvme_cpl_is_aborted_sq_deletion(cpl) || 1522 !nvme_io_path_is_available(io_path) || 1523 !nvme_ctrlr_is_available(nvme_ctrlr)) { 1524 bdev_nvme_clear_current_io_path(nbdev_ch); 1525 bio->io_path = NULL; 1526 if (spdk_nvme_cpl_is_ana_error(cpl)) { 1527 if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) { 1528 io_path->nvme_ns->ana_state_updating = true; 1529 } 1530 } 1531 if (!any_io_path_may_become_available(nbdev_ch)) { 1532 return false; 1533 } 1534 *_delay_ms = 0; 1535 } else { 1536 bio->retry_count++; 1537 1538 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 1539 1540 if (cpl->status.crd != 0) { 1541 *_delay_ms = cdata->crdt[cpl->status.crd] * 100; 1542 } else { 1543 *_delay_ms = 0; 1544 } 1545 } 1546 1547 return true; 1548 } 1549 1550 static inline void 1551 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 1552 const struct spdk_nvme_cpl *cpl) 1553 { 1554 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1555 struct nvme_bdev_channel *nbdev_ch; 1556 uint64_t delay_ms; 1557 1558 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1559 1560 if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) { 1561 bdev_nvme_update_io_path_stat(bio); 1562 goto complete; 1563 } 1564 1565 /* Update error counts before deciding if retry is needed. 1566 * Hence, error counts may be more than the number of I/O errors. 
1567 */ 1568 bdev_nvme_update_nvme_error_stat(bdev_io, cpl); 1569 1570 if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) || 1571 (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) { 1572 goto complete; 1573 } 1574 1575 /* At this point we don't know whether the sequence was successfully executed or not, so we 1576 * cannot retry the IO */ 1577 if (bdev_io->u.bdev.accel_sequence != NULL) { 1578 goto complete; 1579 } 1580 1581 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1582 1583 if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) { 1584 bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms); 1585 return; 1586 } 1587 1588 complete: 1589 bio->retry_count = 0; 1590 bio->submit_tsc = 0; 1591 bdev_io->u.bdev.accel_sequence = NULL; 1592 __bdev_nvme_io_complete(bdev_io, 0, cpl); 1593 } 1594 1595 static inline void 1596 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 1597 { 1598 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1599 struct nvme_bdev_channel *nbdev_ch; 1600 enum spdk_bdev_io_status io_status; 1601 1602 assert(!bdev_nvme_io_type_is_admin(bdev_io->type)); 1603 1604 switch (rc) { 1605 case 0: 1606 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1607 break; 1608 case -ENOMEM: 1609 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1610 break; 1611 case -ENXIO: 1612 if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) { 1613 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 1614 1615 bdev_nvme_clear_current_io_path(nbdev_ch); 1616 bio->io_path = NULL; 1617 1618 if (any_io_path_may_become_available(nbdev_ch)) { 1619 bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL); 1620 return; 1621 } 1622 } 1623 1624 /* fallthrough */ 1625 default: 1626 spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence); 1627 bdev_io->u.bdev.accel_sequence = NULL; 1628 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1629 break; 1630 } 1631 1632 bio->retry_count = 0; 1633 bio->submit_tsc = 0; 1634 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1635 } 1636 1637 static inline void 1638 bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc) 1639 { 1640 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 1641 enum spdk_bdev_io_status io_status; 1642 1643 switch (rc) { 1644 case 0: 1645 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 1646 break; 1647 case -ENOMEM: 1648 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 1649 break; 1650 case -ENXIO: 1651 /* fallthrough */ 1652 default: 1653 io_status = SPDK_BDEV_IO_STATUS_FAILED; 1654 break; 1655 } 1656 1657 __bdev_nvme_io_complete(bdev_io, io_status, NULL); 1658 } 1659 1660 static void 1661 bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr, 1662 void *ctx, int status) 1663 { 1664 pthread_mutex_lock(&nvme_ctrlr->mutex); 1665 1666 assert(nvme_ctrlr->io_path_cache_clearing == true); 1667 nvme_ctrlr->io_path_cache_clearing = false; 1668 1669 if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 1670 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1671 return; 1672 } 1673 1674 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1675 1676 nvme_ctrlr_unregister(nvme_ctrlr); 1677 } 1678 1679 static void 1680 _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair) 1681 { 1682 struct nvme_io_path *io_path; 1683 1684 TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) { 1685 if (io_path->nbdev_ch == NULL) { 1686 continue; 1687 } 1688 bdev_nvme_clear_current_io_path(io_path->nbdev_ch); 1689 } 1690 } 1691 1692 static void 1693 
bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i, 1694 struct nvme_ctrlr *nvme_ctrlr, 1695 struct nvme_ctrlr_channel *ctrlr_ch, 1696 void *ctx) 1697 { 1698 assert(ctrlr_ch->qpair != NULL); 1699 1700 _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair); 1701 1702 nvme_ctrlr_for_each_channel_continue(i, 0); 1703 } 1704 1705 static void 1706 bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr) 1707 { 1708 pthread_mutex_lock(&nvme_ctrlr->mutex); 1709 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 1710 nvme_ctrlr->io_path_cache_clearing) { 1711 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1712 return; 1713 } 1714 1715 nvme_ctrlr->io_path_cache_clearing = true; 1716 pthread_mutex_unlock(&nvme_ctrlr->mutex); 1717 1718 nvme_ctrlr_for_each_channel(nvme_ctrlr, 1719 bdev_nvme_clear_io_path_cache, 1720 NULL, 1721 bdev_nvme_clear_io_path_caches_done); 1722 } 1723 1724 static struct nvme_qpair * 1725 nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair) 1726 { 1727 struct nvme_qpair *nvme_qpair; 1728 1729 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1730 if (nvme_qpair->qpair == qpair) { 1731 break; 1732 } 1733 } 1734 1735 return nvme_qpair; 1736 } 1737 1738 static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair); 1739 1740 static void 1741 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 1742 { 1743 struct nvme_poll_group *group = poll_group_ctx; 1744 struct nvme_qpair *nvme_qpair; 1745 struct nvme_ctrlr *nvme_ctrlr; 1746 struct nvme_ctrlr_channel *ctrlr_ch; 1747 int status; 1748 1749 nvme_qpair = nvme_poll_group_get_qpair(group, qpair); 1750 if (nvme_qpair == NULL) { 1751 return; 1752 } 1753 1754 if (nvme_qpair->qpair != NULL) { 1755 spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair); 1756 nvme_qpair->qpair = NULL; 1757 } 1758 1759 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1760 1761 nvme_ctrlr = nvme_qpair->ctrlr; 1762 ctrlr_ch = nvme_qpair->ctrlr_ch; 1763 1764 if (ctrlr_ch != NULL) { 1765 if (ctrlr_ch->reset_iter != NULL) { 1766 /* We are in a full reset sequence. */ 1767 if (ctrlr_ch->connect_poller != NULL) { 1768 /* qpair was failed to connect. Abort the reset sequence. */ 1769 NVME_CTRLR_INFOLOG(nvme_ctrlr, 1770 "qpair %p was failed to connect. abort the reset ctrlr sequence.\n", 1771 qpair); 1772 spdk_poller_unregister(&ctrlr_ch->connect_poller); 1773 status = -1; 1774 } else { 1775 /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */ 1776 NVME_CTRLR_INFOLOG(nvme_ctrlr, 1777 "qpair %p was disconnected and freed in a reset ctrlr sequence.\n", 1778 qpair); 1779 status = 0; 1780 } 1781 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status); 1782 ctrlr_ch->reset_iter = NULL; 1783 } else { 1784 /* qpair was disconnected unexpectedly. Reset controller for recovery. */ 1785 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. reset controller.\n", 1786 qpair); 1787 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1788 } 1789 } else { 1790 /* In this case, ctrlr_channel is already deleted. */ 1791 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. 
delete nvme_qpair.\n", 1792 qpair); 1793 nvme_qpair_delete(nvme_qpair); 1794 } 1795 } 1796 1797 static void 1798 bdev_nvme_check_io_qpairs(struct nvme_poll_group *group) 1799 { 1800 struct nvme_qpair *nvme_qpair; 1801 1802 TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) { 1803 if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) { 1804 continue; 1805 } 1806 1807 if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) != 1808 SPDK_NVME_QPAIR_FAILURE_NONE) { 1809 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1810 } 1811 } 1812 } 1813 1814 static int 1815 bdev_nvme_poll(void *arg) 1816 { 1817 struct nvme_poll_group *group = arg; 1818 int64_t num_completions; 1819 1820 if (group->collect_spin_stat && group->start_ticks == 0) { 1821 group->start_ticks = spdk_get_ticks(); 1822 } 1823 1824 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 1825 bdev_nvme_disconnected_qpair_cb); 1826 if (group->collect_spin_stat) { 1827 if (num_completions > 0) { 1828 if (group->end_ticks != 0) { 1829 group->spin_ticks += (group->end_ticks - group->start_ticks); 1830 group->end_ticks = 0; 1831 } 1832 group->start_ticks = 0; 1833 } else { 1834 group->end_ticks = spdk_get_ticks(); 1835 } 1836 } 1837 1838 if (spdk_unlikely(num_completions < 0)) { 1839 bdev_nvme_check_io_qpairs(group); 1840 } 1841 1842 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 1843 } 1844 1845 static int bdev_nvme_poll_adminq(void *arg); 1846 1847 static void 1848 bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us) 1849 { 1850 if (spdk_interrupt_mode_is_enabled()) { 1851 return; 1852 } 1853 1854 spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); 1855 1856 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, 1857 nvme_ctrlr, new_period_us); 1858 } 1859 1860 static int 1861 bdev_nvme_poll_adminq(void *arg) 1862 { 1863 int32_t rc; 1864 struct nvme_ctrlr *nvme_ctrlr = arg; 1865 nvme_ctrlr_disconnected_cb disconnected_cb; 1866 1867 assert(nvme_ctrlr != NULL); 1868 1869 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr); 1870 if (rc < 0) { 1871 disconnected_cb = nvme_ctrlr->disconnected_cb; 1872 nvme_ctrlr->disconnected_cb = NULL; 1873 1874 if (disconnected_cb != NULL) { 1875 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 1876 g_opts.nvme_adminq_poll_period_us); 1877 disconnected_cb(nvme_ctrlr); 1878 } else { 1879 bdev_nvme_failover_ctrlr(nvme_ctrlr); 1880 } 1881 } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) != 1882 SPDK_NVME_QPAIR_FAILURE_NONE) { 1883 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 1884 } 1885 1886 return rc == 0 ? 
	       SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nbdev = io_device;

	pthread_mutex_destroy(&nbdev->mutex);
	free(nbdev->disk.name);
	free(nbdev->err_stat);
	free(nbdev);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nbdev->nbdev_ctrlr->name, nbdev->nsid);

	pthread_mutex_lock(&nbdev->mutex);

	TAILQ_FOREACH_SAFE(nvme_ns, &nbdev->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_put_ref(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_unlock(&nbdev->mutex);

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nbdev->nbdev_ctrlr->bdevs, nbdev, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nbdev, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.create_only = true;
	/* In interrupt mode, qpairs must be created in sync mode, else they will never become
	 * connected. delay_cmd_submit must be false because in interrupt mode requests cannot
	 * be submitted in completion context.
	 */
	if (!spdk_interrupt_mode_is_enabled()) {
		opts.async_mode = true;
		opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	}
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
			   spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));

	assert(nvme_qpair->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
	if (rc != 0) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
	if (rc != 0) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_qpair->qpair = qpair;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n",
			   qpair, spdk_nvme_qpair_get_id(qpair));

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);

static void
bdev_nvme_complete_pending_resets(struct nvme_ctrlr *nvme_ctrlr, bool success)
{
	int rc = 0;
	struct nvme_bdev_io *bio;

	if (!success) {
		rc = -1;
	}

	while (!TAILQ_EMPTY(&nvme_ctrlr->pending_resets)) {
		bio = TAILQ_FIRST(&nvme_ctrlr->pending_resets);
		TAILQ_REMOVE(&nvme_ctrlr->pending_resets, bio, retry_link);

		bdev_nvme_reset_io_continue(bio, rc);
	}
}

/* This function marks the current trid as failed by storing the current ticks
 * and then sets the next trid as the active trid within the controller, if one exists.
 *
 * A return value of true asks the caller to disconnect the current trid now so that
 * connecting the next trid can be attempted.
 */
static bool
bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
{
	struct nvme_path_id *path_id, *next_path;
	int rc __attribute__((unused));

	path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(path_id);
	assert(path_id == nvme_ctrlr->active_path_id);
	next_path = TAILQ_NEXT(path_id, link);

	/* Update the last failed time. A trid is considered failed if its
	 * last failed time is non-zero.
	 */
	path_id->last_failed_tsc = spdk_get_ticks();

	if (next_path == NULL) {
		/* There is no alternate trid within the controller. */
		return false;
	}

	if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
		/* Connect is not retried in a controller reset sequence. Connecting
		 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
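		 * bdev_nvme_failover_ctrlr() is invoked, for example, from bdev_nvme_poll_adminq() when processing admin completions fails.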
2050 */ 2051 return false; 2052 } 2053 2054 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 2055 2056 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n", 2057 path_id->trid.traddr, path_id->trid.trsvcid, 2058 next_path->trid.traddr, next_path->trid.trsvcid); 2059 2060 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2061 nvme_ctrlr->active_path_id = next_path; 2062 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 2063 assert(rc == 0); 2064 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 2065 if (!remove) { 2066 /** Shuffle the old trid to the end of the list and use the new one. 2067 * Allows for round robin through multiple connections. 2068 */ 2069 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 2070 } else { 2071 free(path_id); 2072 } 2073 2074 if (start || next_path->last_failed_tsc == 0) { 2075 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 2076 * or used yet. Try the next trid now. 2077 */ 2078 return true; 2079 } 2080 2081 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 2082 nvme_ctrlr->opts.reconnect_delay_sec) { 2083 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 2084 return true; 2085 } 2086 2087 /* The next trid will be tried after reconnect_delay_sec seconds. */ 2088 return false; 2089 } 2090 2091 static bool 2092 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 2093 { 2094 int32_t elapsed; 2095 2096 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 2097 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 2098 return false; 2099 } 2100 2101 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2102 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 2103 return true; 2104 } else { 2105 return false; 2106 } 2107 } 2108 2109 static bool 2110 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 2111 { 2112 uint32_t elapsed; 2113 2114 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 2115 return false; 2116 } 2117 2118 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2119 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 2120 return true; 2121 } else { 2122 return false; 2123 } 2124 } 2125 2126 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 2127 2128 static void 2129 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 2130 { 2131 int rc; 2132 2133 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n"); 2134 2135 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 2136 if (rc != 0) { 2137 NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n"); 2138 2139 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 2140 * fail the reset sequence immediately. 2141 */ 2142 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2143 return; 2144 } 2145 2146 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 2147 * Set callback here to execute the specified operation after ctrlr is really disconnected. 2148 */ 2149 assert(nvme_ctrlr->disconnected_cb == NULL); 2150 nvme_ctrlr->disconnected_cb = cb_fn; 2151 2152 /* During disconnection, reduce the period to poll adminq more often. 
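	 * A period of 0 makes bdev_nvme_poll_adminq() run on every iteration of the thread's poller loop until the disconnect completes.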
*/ 2153 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 2154 } 2155 2156 enum bdev_nvme_op_after_reset { 2157 OP_NONE, 2158 OP_COMPLETE_PENDING_DESTRUCT, 2159 OP_DESTRUCT, 2160 OP_DELAYED_RECONNECT, 2161 OP_FAILOVER, 2162 }; 2163 2164 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 2165 2166 static _bdev_nvme_op_after_reset 2167 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 2168 { 2169 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 2170 /* Complete pending destruct after reset completes. */ 2171 return OP_COMPLETE_PENDING_DESTRUCT; 2172 } else if (nvme_ctrlr->pending_failover) { 2173 nvme_ctrlr->pending_failover = false; 2174 nvme_ctrlr->reset_start_tsc = 0; 2175 return OP_FAILOVER; 2176 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2177 nvme_ctrlr->reset_start_tsc = 0; 2178 return OP_NONE; 2179 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2180 return OP_DESTRUCT; 2181 } else { 2182 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 2183 nvme_ctrlr->fast_io_fail_timedout = true; 2184 } 2185 return OP_DELAYED_RECONNECT; 2186 } 2187 } 2188 2189 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 2190 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 2191 2192 static int 2193 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2194 { 2195 struct nvme_ctrlr *nvme_ctrlr = ctx; 2196 2197 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2198 pthread_mutex_lock(&nvme_ctrlr->mutex); 2199 2200 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2201 2202 if (!nvme_ctrlr->reconnect_is_delayed) { 2203 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2204 return SPDK_POLLER_BUSY; 2205 } 2206 2207 nvme_ctrlr->reconnect_is_delayed = false; 2208 2209 if (nvme_ctrlr->destruct) { 2210 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2211 return SPDK_POLLER_BUSY; 2212 } 2213 2214 assert(nvme_ctrlr->resetting == false); 2215 nvme_ctrlr->resetting = true; 2216 2217 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2218 2219 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2220 2221 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2222 return SPDK_POLLER_BUSY; 2223 } 2224 2225 static void 2226 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2227 { 2228 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2229 2230 assert(nvme_ctrlr->reconnect_is_delayed == false); 2231 nvme_ctrlr->reconnect_is_delayed = true; 2232 2233 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2234 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2235 nvme_ctrlr, 2236 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2237 } 2238 2239 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2240 2241 static void 2242 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2243 { 2244 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2245 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2246 enum bdev_nvme_op_after_reset op_after_reset; 2247 2248 assert(nvme_ctrlr->thread == spdk_get_thread()); 2249 2250 pthread_mutex_lock(&nvme_ctrlr->mutex); 2251 if (!success) { 2252 /* Connecting the active trid failed. Set the next alternate trid to the 2253 * active trid if it exists. 2254 */ 2255 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2256 /* The next alternate trid exists and is ready to try. Try it now. 
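			 * The ctrlr is disconnected first and bdev_nvme_reconnect_ctrlr() runs once the disconnect completes.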
*/ 2257 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2258 2259 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n", 2260 nvme_ctrlr->active_path_id->trid.traddr, 2261 nvme_ctrlr->active_path_id->trid.trsvcid); 2262 2263 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2264 return; 2265 } 2266 2267 /* We came here if there is no alternate trid or if the next trid exists but 2268 * is not ready to try. We will try the active trid after reconnect_delay_sec 2269 * seconds if it is non-zero or at the next reset call otherwise. 2270 */ 2271 } else { 2272 /* Connecting the active trid succeeded. Clear the last failed time because it 2273 * means the trid is failed if its last failed time is non-zero. 2274 */ 2275 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2276 } 2277 2278 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n"); 2279 2280 /* Make sure we clear any pending resets before returning. */ 2281 bdev_nvme_complete_pending_resets(nvme_ctrlr, success); 2282 2283 if (!success) { 2284 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n"); 2285 } else { 2286 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n"); 2287 } 2288 2289 nvme_ctrlr->resetting = false; 2290 nvme_ctrlr->dont_retry = false; 2291 nvme_ctrlr->in_failover = false; 2292 2293 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2294 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2295 2296 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2297 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2298 2299 /* Delay callbacks when the next operation is a failover. */ 2300 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2301 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1); 2302 } 2303 2304 switch (op_after_reset) { 2305 case OP_COMPLETE_PENDING_DESTRUCT: 2306 nvme_ctrlr_unregister(nvme_ctrlr); 2307 break; 2308 case OP_DESTRUCT: 2309 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2310 remove_discovery_entry(nvme_ctrlr); 2311 break; 2312 case OP_DELAYED_RECONNECT: 2313 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2314 break; 2315 case OP_FAILOVER: 2316 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2317 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2318 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2319 break; 2320 default: 2321 break; 2322 } 2323 } 2324 2325 static void 2326 bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2327 { 2328 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2329 } 2330 2331 static void 2332 bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i, 2333 struct nvme_ctrlr *nvme_ctrlr, 2334 struct nvme_ctrlr_channel *ctrlr_ch, void *ctx) 2335 { 2336 struct nvme_qpair *nvme_qpair; 2337 struct spdk_nvme_qpair *qpair; 2338 2339 nvme_qpair = ctrlr_ch->qpair; 2340 assert(nvme_qpair != NULL); 2341 2342 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2343 2344 qpair = nvme_qpair->qpair; 2345 if (qpair != NULL) { 2346 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n", 2347 qpair, spdk_nvme_qpair_get_id(qpair)); 2348 2349 if (nvme_qpair->ctrlr->dont_retry) { 2350 spdk_nvme_qpair_set_abort_dnr(qpair, true); 2351 } 2352 spdk_nvme_ctrlr_disconnect_io_qpair(qpair); 2353 2354 /* The current full reset sequence will move to the next 2355 * ctrlr_channel after the qpair is actually disconnected. 
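		 * The iterator saved in ctrlr_ch->reset_iter below is continued from the poll group's disconnected qpair callback.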
		 */
		assert(ctrlr_ch->reset_iter == NULL);
		ctrlr_ch->reset_iter = i;
	} else {
		nvme_ctrlr_for_each_channel_continue(i, 0);
	}
}

static void
bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
{
	if (status == 0) {
		NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n");

		bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true);
	} else {
		NVME_CTRLR_INFOLOG(nvme_ctrlr, "Failed to create qpairs after ctrlr reset.\n");

		/* Delete the added qpairs and quiesce ctrlr to make the states clean. */
		nvme_ctrlr_for_each_channel(nvme_ctrlr,
					    bdev_nvme_reset_destroy_qpair,
					    NULL,
					    bdev_nvme_reset_create_qpairs_failed);
	}
}

static int
bdev_nvme_reset_check_qpair_connected(void *ctx)
{
	struct nvme_ctrlr_channel *ctrlr_ch = ctx;
	struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
	struct spdk_nvme_qpair *qpair;

	if (ctrlr_ch->reset_iter == NULL) {
		/* The qpair already failed to connect and the reset sequence is being aborted. */
		assert(ctrlr_ch->connect_poller == NULL);
		assert(nvme_qpair->qpair == NULL);

		NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr,
				   "qpair already failed to connect. Reset is being aborted.\n");
		return SPDK_POLLER_BUSY;
	}

	qpair = nvme_qpair->qpair;
	assert(qpair != NULL);

	if (!spdk_nvme_qpair_is_connected(qpair)) {
		return SPDK_POLLER_BUSY;
	}

	NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n",
			   qpair, spdk_nvme_qpair_get_id(qpair));

	spdk_poller_unregister(&ctrlr_ch->connect_poller);

	/* The qpair completed connecting. Move to the next ctrlr_channel. */
	nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
	ctrlr_ch->reset_iter = NULL;

	if (!g_opts.disable_auto_failback) {
		_bdev_nvme_clear_io_path_cache(nvme_qpair);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i,
			     struct nvme_ctrlr *nvme_ctrlr,
			     struct nvme_ctrlr_channel *ctrlr_ch,
			     void *ctx)
{
	struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
	struct spdk_nvme_qpair *qpair;
	int rc = 0;

	if (nvme_qpair->qpair == NULL) {
		rc = bdev_nvme_create_qpair(nvme_qpair);
	}
	if (rc == 0) {
		ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
								ctrlr_ch, 0);

		qpair = nvme_qpair->qpair;

		NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking whether qpair %p:%u is connected.\n",
				   qpair, spdk_nvme_qpair_get_id(qpair));

		/* The current full reset sequence will move to the next
		 * ctrlr_channel after the qpair is actually connected.
		 */
		assert(ctrlr_ch->reset_iter == NULL);
		ctrlr_ch->reset_iter = i;
	} else {
		nvme_ctrlr_for_each_channel_continue(i, rc);
	}
}

static void
nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct nvme_ns *nvme_ns;

	for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	     nvme_ns != NULL;
	     nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
		if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
			SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id);
			/* NS can be added again.
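			 * Keep this nvme_ns object so that it can be reused if the namespace becomes active again.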
Just nullify nvme_ns->ns. */ 2466 nvme_ns->ns = NULL; 2467 } 2468 } 2469 } 2470 2471 2472 static int 2473 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2474 { 2475 struct nvme_ctrlr *nvme_ctrlr = arg; 2476 struct spdk_nvme_transport_id *trid; 2477 int rc = -ETIMEDOUT; 2478 2479 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2480 /* Mark the ctrlr as failed. The next call to 2481 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2482 * do the necessary cleanup and return failure. 2483 */ 2484 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2485 } 2486 2487 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2488 if (rc == -EAGAIN) { 2489 return SPDK_POLLER_BUSY; 2490 } 2491 2492 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2493 if (rc == 0) { 2494 trid = &nvme_ctrlr->active_path_id->trid; 2495 2496 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 2497 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n", 2498 trid->traddr, trid->trsvcid); 2499 } else { 2500 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n"); 2501 } 2502 2503 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2504 2505 /* Recreate all of the I/O queue pairs */ 2506 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2507 bdev_nvme_reset_create_qpair, 2508 NULL, 2509 bdev_nvme_reset_create_qpairs_done); 2510 } else { 2511 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n"); 2512 2513 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2514 } 2515 return SPDK_POLLER_BUSY; 2516 } 2517 2518 static void 2519 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2520 { 2521 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n"); 2522 2523 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2524 2525 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2526 assert(nvme_ctrlr->reset_detach_poller == NULL); 2527 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2528 nvme_ctrlr, 0); 2529 } 2530 2531 static void 2532 bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2533 { 2534 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2535 assert(status == 0); 2536 2537 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n"); 2538 2539 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2540 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2541 } else { 2542 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2543 } 2544 } 2545 2546 static void 2547 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2548 { 2549 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n"); 2550 2551 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2552 bdev_nvme_reset_destroy_qpair, 2553 NULL, 2554 bdev_nvme_reset_destroy_qpair_done); 2555 } 2556 2557 static void 2558 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2559 { 2560 struct nvme_ctrlr *nvme_ctrlr = ctx; 2561 2562 assert(nvme_ctrlr->resetting == true); 2563 assert(nvme_ctrlr->thread == spdk_get_thread()); 2564 2565 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2566 2567 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2568 2569 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2570 } 2571 2572 static void 2573 _bdev_nvme_reset_ctrlr(void *ctx) 2574 { 2575 struct nvme_ctrlr *nvme_ctrlr = ctx; 2576 2577 assert(nvme_ctrlr->resetting == true); 2578 assert(nvme_ctrlr->thread == spdk_get_thread()); 2579 2580 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2581 nvme_ctrlr_disconnect(nvme_ctrlr, 
bdev_nvme_reset_destroy_qpairs); 2582 } else { 2583 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2584 } 2585 } 2586 2587 static int 2588 bdev_nvme_reset_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, spdk_msg_fn *msg_fn) 2589 { 2590 if (nvme_ctrlr->destruct) { 2591 return -ENXIO; 2592 } 2593 2594 if (nvme_ctrlr->resetting) { 2595 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n"); 2596 return -EBUSY; 2597 } 2598 2599 if (nvme_ctrlr->disabled) { 2600 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n"); 2601 return -EALREADY; 2602 } 2603 2604 nvme_ctrlr->resetting = true; 2605 nvme_ctrlr->dont_retry = true; 2606 2607 if (nvme_ctrlr->reconnect_is_delayed) { 2608 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 2609 *msg_fn = bdev_nvme_reconnect_ctrlr_now; 2610 nvme_ctrlr->reconnect_is_delayed = false; 2611 } else { 2612 *msg_fn = _bdev_nvme_reset_ctrlr; 2613 assert(nvme_ctrlr->reset_start_tsc == 0); 2614 } 2615 2616 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2617 2618 return 0; 2619 } 2620 2621 static int 2622 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2623 { 2624 spdk_msg_fn msg_fn; 2625 int rc; 2626 2627 pthread_mutex_lock(&nvme_ctrlr->mutex); 2628 rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn); 2629 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2630 2631 if (rc == 0) { 2632 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2633 } 2634 2635 return rc; 2636 } 2637 2638 static int 2639 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2640 { 2641 pthread_mutex_lock(&nvme_ctrlr->mutex); 2642 if (nvme_ctrlr->destruct) { 2643 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2644 return -ENXIO; 2645 } 2646 2647 if (nvme_ctrlr->resetting) { 2648 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2649 return -EBUSY; 2650 } 2651 2652 if (!nvme_ctrlr->disabled) { 2653 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2654 return -EALREADY; 2655 } 2656 2657 nvme_ctrlr->disabled = false; 2658 nvme_ctrlr->resetting = true; 2659 2660 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2661 2662 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2663 2664 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2665 return 0; 2666 } 2667 2668 static void 2669 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2670 { 2671 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2672 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2673 enum bdev_nvme_op_after_reset op_after_disable; 2674 2675 assert(nvme_ctrlr->thread == spdk_get_thread()); 2676 2677 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2678 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2679 2680 pthread_mutex_lock(&nvme_ctrlr->mutex); 2681 2682 nvme_ctrlr->resetting = false; 2683 nvme_ctrlr->dont_retry = false; 2684 2685 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2686 2687 nvme_ctrlr->disabled = true; 2688 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2689 2690 /* Make sure we clear any pending resets before returning. 
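	 * Pending resets are completed with success here because the ctrlr was disabled on purpose.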
*/ 2691 bdev_nvme_complete_pending_resets(nvme_ctrlr, true); 2692 2693 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2694 2695 if (ctrlr_op_cb_fn) { 2696 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2697 } 2698 2699 switch (op_after_disable) { 2700 case OP_COMPLETE_PENDING_DESTRUCT: 2701 nvme_ctrlr_unregister(nvme_ctrlr); 2702 break; 2703 default: 2704 break; 2705 } 2706 } 2707 2708 static void 2709 bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2710 { 2711 assert(status == 0); 2712 2713 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2714 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2715 } else { 2716 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2717 } 2718 } 2719 2720 static void 2721 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2722 { 2723 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2724 bdev_nvme_reset_destroy_qpair, 2725 NULL, 2726 bdev_nvme_disable_destroy_qpairs_done); 2727 } 2728 2729 static void 2730 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2731 { 2732 struct nvme_ctrlr *nvme_ctrlr = ctx; 2733 2734 assert(nvme_ctrlr->resetting == true); 2735 assert(nvme_ctrlr->thread == spdk_get_thread()); 2736 2737 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2738 2739 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2740 } 2741 2742 static void 2743 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2744 { 2745 struct nvme_ctrlr *nvme_ctrlr = ctx; 2746 2747 assert(nvme_ctrlr->resetting == true); 2748 assert(nvme_ctrlr->thread == spdk_get_thread()); 2749 2750 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2751 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2752 } else { 2753 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2754 } 2755 } 2756 2757 static int 2758 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2759 { 2760 spdk_msg_fn msg_fn; 2761 2762 pthread_mutex_lock(&nvme_ctrlr->mutex); 2763 if (nvme_ctrlr->destruct) { 2764 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2765 return -ENXIO; 2766 } 2767 2768 if (nvme_ctrlr->resetting) { 2769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2770 return -EBUSY; 2771 } 2772 2773 if (nvme_ctrlr->disabled) { 2774 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2775 return -EALREADY; 2776 } 2777 2778 nvme_ctrlr->resetting = true; 2779 nvme_ctrlr->dont_retry = true; 2780 2781 if (nvme_ctrlr->reconnect_is_delayed) { 2782 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2783 nvme_ctrlr->reconnect_is_delayed = false; 2784 } else { 2785 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2786 } 2787 2788 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2789 2790 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2791 2792 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2793 return 0; 2794 } 2795 2796 static int 2797 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2798 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2799 { 2800 int rc; 2801 2802 switch (op) { 2803 case NVME_CTRLR_OP_RESET: 2804 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2805 break; 2806 case NVME_CTRLR_OP_ENABLE: 2807 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2808 break; 2809 case NVME_CTRLR_OP_DISABLE: 2810 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2811 break; 2812 default: 2813 rc = -EINVAL; 2814 break; 2815 } 2816 2817 if (rc == 0) { 2818 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2819 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2820 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2821 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2822 } 2823 return rc; 2824 } 2825 
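/* Context for a ctrlr operation requested over RPC. The result is delivered back to the
 * original (RPC) thread via spdk_thread_send_msg().
 */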
2826 struct nvme_ctrlr_op_rpc_ctx { 2827 struct nvme_ctrlr *nvme_ctrlr; 2828 struct spdk_thread *orig_thread; 2829 enum nvme_ctrlr_op op; 2830 int rc; 2831 bdev_nvme_ctrlr_op_cb cb_fn; 2832 void *cb_arg; 2833 }; 2834 2835 static void 2836 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2837 { 2838 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2839 2840 assert(ctx != NULL); 2841 assert(ctx->cb_fn != NULL); 2842 2843 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2844 2845 free(ctx); 2846 } 2847 2848 static void 2849 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2850 { 2851 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2852 2853 ctx->rc = rc; 2854 2855 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2856 } 2857 2858 void 2859 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2860 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2861 { 2862 struct nvme_ctrlr_op_rpc_ctx *ctx; 2863 int rc; 2864 2865 assert(cb_fn != NULL); 2866 2867 ctx = calloc(1, sizeof(*ctx)); 2868 if (ctx == NULL) { 2869 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2870 cb_fn(cb_arg, -ENOMEM); 2871 return; 2872 } 2873 2874 ctx->orig_thread = spdk_get_thread(); 2875 ctx->cb_fn = cb_fn; 2876 ctx->cb_arg = cb_arg; 2877 2878 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2879 if (rc == 0) { 2880 return; 2881 } else if (rc == -EALREADY) { 2882 rc = 0; 2883 } 2884 2885 nvme_ctrlr_op_rpc_complete(ctx, rc); 2886 } 2887 2888 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2889 2890 static void 2891 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2892 { 2893 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2894 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2895 int rc; 2896 2897 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2898 ctx->nvme_ctrlr = NULL; 2899 2900 if (ctx->rc != 0) { 2901 goto complete; 2902 } 2903 2904 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2905 if (next_nvme_ctrlr == NULL) { 2906 goto complete; 2907 } 2908 2909 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2910 if (rc == 0) { 2911 ctx->nvme_ctrlr = next_nvme_ctrlr; 2912 return; 2913 } else if (rc == -EALREADY) { 2914 ctx->nvme_ctrlr = next_nvme_ctrlr; 2915 rc = 0; 2916 } 2917 2918 ctx->rc = rc; 2919 2920 complete: 2921 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2922 free(ctx); 2923 } 2924 2925 static void 2926 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2927 { 2928 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2929 2930 ctx->rc = rc; 2931 2932 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2933 } 2934 2935 void 2936 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2937 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2938 { 2939 struct nvme_ctrlr_op_rpc_ctx *ctx; 2940 struct nvme_ctrlr *nvme_ctrlr; 2941 int rc; 2942 2943 assert(cb_fn != NULL); 2944 2945 ctx = calloc(1, sizeof(*ctx)); 2946 if (ctx == NULL) { 2947 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2948 cb_fn(cb_arg, -ENOMEM); 2949 return; 2950 } 2951 2952 ctx->orig_thread = spdk_get_thread(); 2953 ctx->op = op; 2954 ctx->cb_fn = cb_fn; 2955 ctx->cb_arg = cb_arg; 2956 2957 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2958 assert(nvme_ctrlr != NULL); 2959 2960 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2961 if (rc == 0) { 2962 ctx->nvme_ctrlr = nvme_ctrlr; 2963 return; 2964 } else if (rc == -EALREADY) { 2965 ctx->nvme_ctrlr = nvme_ctrlr; 2966 rc = 0; 2967 } 2968 2969 
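	/* The op either completed synchronously (-EALREADY mapped to 0) or failed to start;
	 * deliver the result through the same continuation path.
	 */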
nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2970 } 2971 2972 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2973 2974 static void 2975 bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 2976 { 2977 struct nvme_bdev_io *bio = ctx; 2978 enum spdk_bdev_io_status io_status; 2979 2980 if (bio->cpl.cdw0 == 0) { 2981 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2982 } else { 2983 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2984 } 2985 2986 NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status); 2987 2988 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2989 } 2990 2991 static void 2992 bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i, 2993 struct nvme_bdev *nbdev, 2994 struct nvme_bdev_channel *nbdev_ch, void *ctx) 2995 { 2996 bdev_nvme_abort_retry_ios(nbdev_ch); 2997 nbdev_ch->resetting = false; 2998 2999 nvme_bdev_for_each_channel_continue(i, 0); 3000 } 3001 3002 static void 3003 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 3004 { 3005 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3006 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3007 3008 /* Abort all queued I/Os for retry. */ 3009 nvme_bdev_for_each_channel(nbdev, 3010 bdev_nvme_unfreeze_bdev_channel, 3011 bio, 3012 bdev_nvme_unfreeze_bdev_channel_done); 3013 } 3014 3015 static void 3016 _bdev_nvme_reset_io_continue(void *ctx) 3017 { 3018 struct nvme_bdev_io *bio = ctx; 3019 struct nvme_io_path *prev_io_path, *next_io_path; 3020 int rc; 3021 3022 prev_io_path = bio->io_path; 3023 bio->io_path = NULL; 3024 3025 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 3026 if (next_io_path == NULL) { 3027 goto complete; 3028 } 3029 3030 rc = _bdev_nvme_reset_io(next_io_path, bio); 3031 if (rc == 0) { 3032 return; 3033 } 3034 3035 complete: 3036 bdev_nvme_reset_io_complete(bio); 3037 } 3038 3039 static void 3040 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 3041 { 3042 struct nvme_bdev_io *bio = cb_arg; 3043 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3044 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3045 3046 NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc); 3047 3048 /* Reset status is initialized as "failed". Set to "success" once we have at least one 3049 * successfully reset nvme_ctrlr. 3050 */ 3051 if (rc == 0) { 3052 bio->cpl.cdw0 = 0; 3053 } 3054 3055 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 3056 } 3057 3058 static int 3059 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 3060 { 3061 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3062 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3063 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 3064 spdk_msg_fn msg_fn; 3065 int rc; 3066 3067 assert(bio->io_path == NULL); 3068 bio->io_path = io_path; 3069 3070 pthread_mutex_lock(&nvme_ctrlr->mutex); 3071 rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn); 3072 if (rc == -EBUSY) { 3073 /* 3074 * Reset call is queued only if it is from the app framework. This is on purpose so that 3075 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 3076 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
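		 * The queued reset_io is resumed by bdev_nvme_complete_pending_resets() once the in-progress reset finishes.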
3077 */ 3078 TAILQ_INSERT_TAIL(&nvme_ctrlr->pending_resets, bio, retry_link); 3079 } 3080 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3081 3082 if (rc == 0) { 3083 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 3084 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 3085 nvme_ctrlr->ctrlr_op_cb_fn = bdev_nvme_reset_io_continue; 3086 nvme_ctrlr->ctrlr_op_cb_arg = bio; 3087 3088 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 3089 3090 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n", 3091 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3092 } else if (rc == -EBUSY) { 3093 rc = 0; 3094 3095 NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n", 3096 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3097 } else { 3098 NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n", 3099 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc); 3100 } 3101 3102 return rc; 3103 } 3104 3105 static void 3106 bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 3107 { 3108 struct nvme_bdev_io *bio = ctx; 3109 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3110 struct nvme_bdev_channel *nbdev_ch; 3111 struct nvme_io_path *io_path; 3112 int rc; 3113 3114 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 3115 3116 /* Initialize with failed status. With multipath it is enough to have at least one successful 3117 * nvme_ctrlr reset. If there is none, reset status will remain failed. 3118 */ 3119 bio->cpl.cdw0 = 1; 3120 3121 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 3122 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 3123 assert(io_path != NULL); 3124 3125 rc = _bdev_nvme_reset_io(io_path, bio); 3126 if (rc != 0) { 3127 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 3128 rc = (rc == -EALREADY) ? 0 : rc; 3129 3130 bdev_nvme_reset_io_continue(bio, rc); 3131 } 3132 } 3133 3134 static void 3135 bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i, 3136 struct nvme_bdev *nbdev, 3137 struct nvme_bdev_channel *nbdev_ch, void *ctx) 3138 { 3139 nbdev_ch->resetting = true; 3140 3141 nvme_bdev_for_each_channel_continue(i, 0); 3142 } 3143 3144 static void 3145 bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio) 3146 { 3147 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio); 3148 3149 nvme_bdev_for_each_channel(nbdev, 3150 bdev_nvme_freeze_bdev_channel, 3151 bio, 3152 bdev_nvme_freeze_bdev_channel_done); 3153 } 3154 3155 static int 3156 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 3157 { 3158 if (nvme_ctrlr->destruct) { 3159 /* Don't bother resetting if the controller is in the process of being destructed. */ 3160 return -ENXIO; 3161 } 3162 3163 if (nvme_ctrlr->resetting) { 3164 if (!nvme_ctrlr->in_failover) { 3165 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 3166 "Reset is already in progress. Defer failover until reset completes.\n"); 3167 3168 /* Defer failover until reset completes. */ 3169 nvme_ctrlr->pending_failover = true; 3170 return -EINPROGRESS; 3171 } else { 3172 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n"); 3173 return -EBUSY; 3174 } 3175 } 3176 3177 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 3178 3179 if (nvme_ctrlr->reconnect_is_delayed) { 3180 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 3181 3182 /* We rely on the next reconnect for the failover. 
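	 * bdev_nvme_failover_trid() above already updated the active trid if an alternate path exists,
	 * so the delayed reconnect will use it.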
*/ 3183 return -EALREADY; 3184 } 3185 3186 if (nvme_ctrlr->disabled) { 3187 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n"); 3188 3189 /* We rely on the enablement for the failover. */ 3190 return -EALREADY; 3191 } 3192 3193 nvme_ctrlr->resetting = true; 3194 nvme_ctrlr->in_failover = true; 3195 3196 assert(nvme_ctrlr->reset_start_tsc == 0); 3197 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 3198 3199 return 0; 3200 } 3201 3202 static int 3203 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 3204 { 3205 int rc; 3206 3207 pthread_mutex_lock(&nvme_ctrlr->mutex); 3208 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 3209 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3210 3211 if (rc == 0) { 3212 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 3213 } else if (rc == -EALREADY) { 3214 rc = 0; 3215 } 3216 3217 return rc; 3218 } 3219 3220 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3221 uint64_t num_blocks); 3222 3223 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3224 uint64_t num_blocks); 3225 3226 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 3227 uint64_t src_offset_blocks, 3228 uint64_t num_blocks); 3229 3230 static void 3231 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3232 bool success) 3233 { 3234 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3235 int ret; 3236 3237 if (!success) { 3238 ret = -EINVAL; 3239 goto exit; 3240 } 3241 3242 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 3243 ret = -ENXIO; 3244 goto exit; 3245 } 3246 3247 ret = bdev_nvme_readv(bio, 3248 bdev_io->u.bdev.iovs, 3249 bdev_io->u.bdev.iovcnt, 3250 bdev_io->u.bdev.md_buf, 3251 bdev_io->u.bdev.num_blocks, 3252 bdev_io->u.bdev.offset_blocks, 3253 bdev_io->u.bdev.dif_check_flags, 3254 bdev_io->u.bdev.memory_domain, 3255 bdev_io->u.bdev.memory_domain_ctx, 3256 bdev_io->u.bdev.accel_sequence); 3257 3258 exit: 3259 if (spdk_unlikely(ret != 0)) { 3260 bdev_nvme_io_complete(bio, ret); 3261 } 3262 } 3263 3264 static inline void 3265 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 3266 { 3267 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3268 struct spdk_bdev *bdev = bdev_io->bdev; 3269 struct nvme_bdev_io *nbdev_io_to_abort; 3270 int rc = 0; 3271 3272 switch (bdev_io->type) { 3273 case SPDK_BDEV_IO_TYPE_READ: 3274 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 3275 3276 rc = bdev_nvme_readv(nbdev_io, 3277 bdev_io->u.bdev.iovs, 3278 bdev_io->u.bdev.iovcnt, 3279 bdev_io->u.bdev.md_buf, 3280 bdev_io->u.bdev.num_blocks, 3281 bdev_io->u.bdev.offset_blocks, 3282 bdev_io->u.bdev.dif_check_flags, 3283 bdev_io->u.bdev.memory_domain, 3284 bdev_io->u.bdev.memory_domain_ctx, 3285 bdev_io->u.bdev.accel_sequence); 3286 } else { 3287 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3288 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3289 rc = 0; 3290 } 3291 break; 3292 case SPDK_BDEV_IO_TYPE_WRITE: 3293 rc = bdev_nvme_writev(nbdev_io, 3294 bdev_io->u.bdev.iovs, 3295 bdev_io->u.bdev.iovcnt, 3296 bdev_io->u.bdev.md_buf, 3297 bdev_io->u.bdev.num_blocks, 3298 bdev_io->u.bdev.offset_blocks, 3299 bdev_io->u.bdev.dif_check_flags, 3300 bdev_io->u.bdev.memory_domain, 3301 bdev_io->u.bdev.memory_domain_ctx, 3302 bdev_io->u.bdev.accel_sequence, 3303 bdev_io->u.bdev.nvme_cdw12, 3304 bdev_io->u.bdev.nvme_cdw13); 3305 break; 3306 case 
SPDK_BDEV_IO_TYPE_COMPARE: 3307 rc = bdev_nvme_comparev(nbdev_io, 3308 bdev_io->u.bdev.iovs, 3309 bdev_io->u.bdev.iovcnt, 3310 bdev_io->u.bdev.md_buf, 3311 bdev_io->u.bdev.num_blocks, 3312 bdev_io->u.bdev.offset_blocks, 3313 bdev_io->u.bdev.dif_check_flags); 3314 break; 3315 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3316 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3317 bdev_io->u.bdev.iovs, 3318 bdev_io->u.bdev.iovcnt, 3319 bdev_io->u.bdev.fused_iovs, 3320 bdev_io->u.bdev.fused_iovcnt, 3321 bdev_io->u.bdev.md_buf, 3322 bdev_io->u.bdev.num_blocks, 3323 bdev_io->u.bdev.offset_blocks, 3324 bdev_io->u.bdev.dif_check_flags); 3325 break; 3326 case SPDK_BDEV_IO_TYPE_UNMAP: 3327 rc = bdev_nvme_unmap(nbdev_io, 3328 bdev_io->u.bdev.offset_blocks, 3329 bdev_io->u.bdev.num_blocks); 3330 break; 3331 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3332 rc = bdev_nvme_write_zeroes(nbdev_io, 3333 bdev_io->u.bdev.offset_blocks, 3334 bdev_io->u.bdev.num_blocks); 3335 break; 3336 case SPDK_BDEV_IO_TYPE_RESET: 3337 nbdev_io->io_path = NULL; 3338 bdev_nvme_reset_io(bdev->ctxt, nbdev_io); 3339 return; 3340 3341 case SPDK_BDEV_IO_TYPE_FLUSH: 3342 bdev_nvme_io_complete(nbdev_io, 0); 3343 return; 3344 3345 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3346 rc = bdev_nvme_zone_appendv(nbdev_io, 3347 bdev_io->u.bdev.iovs, 3348 bdev_io->u.bdev.iovcnt, 3349 bdev_io->u.bdev.md_buf, 3350 bdev_io->u.bdev.num_blocks, 3351 bdev_io->u.bdev.offset_blocks, 3352 bdev_io->u.bdev.dif_check_flags); 3353 break; 3354 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3355 rc = bdev_nvme_get_zone_info(nbdev_io, 3356 bdev_io->u.zone_mgmt.zone_id, 3357 bdev_io->u.zone_mgmt.num_zones, 3358 bdev_io->u.zone_mgmt.buf); 3359 break; 3360 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3361 rc = bdev_nvme_zone_management(nbdev_io, 3362 bdev_io->u.zone_mgmt.zone_id, 3363 bdev_io->u.zone_mgmt.zone_action); 3364 break; 3365 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3366 nbdev_io->io_path = NULL; 3367 bdev_nvme_admin_passthru(nbdev_ch, 3368 nbdev_io, 3369 &bdev_io->u.nvme_passthru.cmd, 3370 bdev_io->u.nvme_passthru.buf, 3371 bdev_io->u.nvme_passthru.nbytes); 3372 return; 3373 3374 case SPDK_BDEV_IO_TYPE_NVME_IO: 3375 rc = bdev_nvme_io_passthru(nbdev_io, 3376 &bdev_io->u.nvme_passthru.cmd, 3377 bdev_io->u.nvme_passthru.buf, 3378 bdev_io->u.nvme_passthru.nbytes); 3379 break; 3380 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3381 rc = bdev_nvme_io_passthru_md(nbdev_io, 3382 &bdev_io->u.nvme_passthru.cmd, 3383 bdev_io->u.nvme_passthru.buf, 3384 bdev_io->u.nvme_passthru.nbytes, 3385 bdev_io->u.nvme_passthru.md_buf, 3386 bdev_io->u.nvme_passthru.md_len); 3387 break; 3388 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3389 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3390 &bdev_io->u.nvme_passthru.cmd, 3391 bdev_io->u.nvme_passthru.iovs, 3392 bdev_io->u.nvme_passthru.iovcnt, 3393 bdev_io->u.nvme_passthru.nbytes, 3394 bdev_io->u.nvme_passthru.md_buf, 3395 bdev_io->u.nvme_passthru.md_len); 3396 break; 3397 case SPDK_BDEV_IO_TYPE_ABORT: 3398 nbdev_io->io_path = NULL; 3399 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3400 bdev_nvme_abort(nbdev_ch, 3401 nbdev_io, 3402 nbdev_io_to_abort); 3403 return; 3404 3405 case SPDK_BDEV_IO_TYPE_COPY: 3406 rc = bdev_nvme_copy(nbdev_io, 3407 bdev_io->u.bdev.offset_blocks, 3408 bdev_io->u.bdev.copy.src_offset_blocks, 3409 bdev_io->u.bdev.num_blocks); 3410 break; 3411 default: 3412 rc = -EINVAL; 3413 break; 3414 } 3415 3416 if (spdk_unlikely(rc != 0)) { 3417 bdev_nvme_io_complete(nbdev_io, rc); 3418 } 3419 } 3420 3421 static void 3422 
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3423 { 3424 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3425 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3426 3427 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3428 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3429 } else { 3430 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3431 * We need to update submit_tsc here. 3432 */ 3433 nbdev_io->submit_tsc = spdk_get_ticks(); 3434 } 3435 3436 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3437 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3438 if (spdk_unlikely(!nbdev_io->io_path)) { 3439 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3440 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3441 return; 3442 } 3443 3444 /* Admin commands do not use the optimal I/O path. 3445 * Simply fall through even if it is not found. 3446 */ 3447 } 3448 3449 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3450 } 3451 3452 static bool 3453 bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi) 3454 { 3455 switch (csi) { 3456 case SPDK_NVME_CSI_NVM: 3457 return true; 3458 case SPDK_NVME_CSI_ZNS: 3459 return true; 3460 default: 3461 return false; 3462 } 3463 } 3464 3465 static bool 3466 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3467 { 3468 struct nvme_bdev *nbdev = ctx; 3469 struct nvme_ns *nvme_ns; 3470 struct spdk_nvme_ns *ns; 3471 struct spdk_nvme_ctrlr *ctrlr; 3472 const struct spdk_nvme_ctrlr_data *cdata; 3473 3474 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3475 assert(nvme_ns != NULL); 3476 ns = nvme_ns->ns; 3477 if (ns == NULL) { 3478 return false; 3479 } 3480 3481 if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) { 3482 switch (io_type) { 3483 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3484 case SPDK_BDEV_IO_TYPE_NVME_IO: 3485 return true; 3486 3487 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3488 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3489 3490 default: 3491 return false; 3492 } 3493 } 3494 3495 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3496 3497 switch (io_type) { 3498 case SPDK_BDEV_IO_TYPE_READ: 3499 case SPDK_BDEV_IO_TYPE_WRITE: 3500 case SPDK_BDEV_IO_TYPE_RESET: 3501 case SPDK_BDEV_IO_TYPE_FLUSH: 3502 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3503 case SPDK_BDEV_IO_TYPE_NVME_IO: 3504 case SPDK_BDEV_IO_TYPE_ABORT: 3505 return true; 3506 3507 case SPDK_BDEV_IO_TYPE_COMPARE: 3508 return spdk_nvme_ns_supports_compare(ns); 3509 3510 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3511 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3512 3513 case SPDK_BDEV_IO_TYPE_UNMAP: 3514 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3515 return cdata->oncs.dsm; 3516 3517 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3518 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3519 return cdata->oncs.write_zeroes; 3520 3521 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3522 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3523 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3524 return true; 3525 } 3526 return false; 3527 3528 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3529 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3530 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3531 3532 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3533 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3534 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3535 3536 case SPDK_BDEV_IO_TYPE_COPY: 3537 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3538 return cdata->oncs.copy; 3539 3540 default: 3541 return false; 3542 } 3543 } 3544 3545 static int 3546 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3547 { 3548 struct nvme_qpair *nvme_qpair; 3549 struct spdk_io_channel *pg_ch; 3550 int rc; 3551 3552 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3553 if (!nvme_qpair) { 3554 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n"); 3555 return -1; 3556 } 3557 3558 TAILQ_INIT(&nvme_qpair->io_path_list); 3559 3560 nvme_qpair->ctrlr = nvme_ctrlr; 3561 nvme_qpair->ctrlr_ch = ctrlr_ch; 3562 3563 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3564 if (!pg_ch) { 3565 free(nvme_qpair); 3566 return -1; 3567 } 3568 3569 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3570 3571 #ifdef SPDK_CONFIG_VTUNE 3572 nvme_qpair->group->collect_spin_stat = true; 3573 #else 3574 nvme_qpair->group->collect_spin_stat = false; 3575 #endif 3576 3577 if (!nvme_ctrlr->disabled) { 3578 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3579 * be created when it's enabled. 3580 */ 3581 rc = bdev_nvme_create_qpair(nvme_qpair); 3582 if (rc != 0) { 3583 /* nvme_ctrlr can't create IO qpair if connection is down. 3584 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3585 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3586 * submitted IO will be queued until IO qpair is successfully created. 3587 * 3588 * Hence, if both are satisfied, ignore the failure. 
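			 * Otherwise, fail the channel creation so the caller sees the error immediately.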
3589 */ 3590 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3591 spdk_put_io_channel(pg_ch); 3592 free(nvme_qpair); 3593 return rc; 3594 } 3595 } 3596 } 3597 3598 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3599 3600 ctrlr_ch->qpair = nvme_qpair; 3601 3602 nvme_ctrlr_get_ref(nvme_ctrlr); 3603 3604 return 0; 3605 } 3606 3607 static int 3608 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3609 { 3610 struct nvme_ctrlr *nvme_ctrlr = io_device; 3611 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3612 3613 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3614 } 3615 3616 static void 3617 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3618 { 3619 struct nvme_io_path *io_path, *next; 3620 3621 assert(nvme_qpair->group != NULL); 3622 3623 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3624 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3625 nvme_io_path_free(io_path); 3626 } 3627 3628 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3629 3630 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3631 3632 nvme_ctrlr_put_ref(nvme_qpair->ctrlr); 3633 3634 free(nvme_qpair); 3635 } 3636 3637 static void 3638 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3639 { 3640 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3641 struct nvme_qpair *nvme_qpair; 3642 3643 nvme_qpair = ctrlr_ch->qpair; 3644 assert(nvme_qpair != NULL); 3645 3646 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3647 3648 if (nvme_qpair->qpair != NULL) { 3649 /* Always try to disconnect the qpair, even if a reset is in progress. 3650 * The qpair may have been created after the reset process started. 3651 */ 3652 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3653 if (ctrlr_ch->reset_iter) { 3654 /* Skip current ctrlr_channel in a full reset sequence because 3655 * it is being deleted now. 3656 */ 3657 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3658 } 3659 3660 /* We cannot release a reference to the poll group now. 3661 * The qpair may be disconnected asynchronously later. 3662 * We need to poll it until it is actually disconnected. 3663 * Just detach the qpair from the deleting ctrlr_channel. 
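		 * The nvme_qpair itself is freed from the disconnected qpair callback once the qpair is finally disconnected.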
3664 */ 3665 nvme_qpair->ctrlr_ch = NULL; 3666 } else { 3667 assert(ctrlr_ch->reset_iter == NULL); 3668 3669 nvme_qpair_delete(nvme_qpair); 3670 } 3671 } 3672 3673 static inline struct spdk_io_channel * 3674 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3675 { 3676 if (spdk_unlikely(!group->accel_channel)) { 3677 group->accel_channel = spdk_accel_get_io_channel(); 3678 if (!group->accel_channel) { 3679 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3680 group); 3681 return NULL; 3682 } 3683 } 3684 3685 return group->accel_channel; 3686 } 3687 3688 static void 3689 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3690 { 3691 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3692 } 3693 3694 static void 3695 bdev_nvme_abort_sequence(void *seq) 3696 { 3697 spdk_accel_sequence_abort(seq); 3698 } 3699 3700 static void 3701 bdev_nvme_reverse_sequence(void *seq) 3702 { 3703 spdk_accel_sequence_reverse(seq); 3704 } 3705 3706 static int 3707 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3708 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3709 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3710 { 3711 struct spdk_io_channel *ch; 3712 struct nvme_poll_group *group = ctx; 3713 3714 ch = bdev_nvme_get_accel_channel(group); 3715 if (spdk_unlikely(ch == NULL)) { 3716 return -ENOMEM; 3717 } 3718 3719 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3720 domain, domain_ctx, seed, cb_fn, cb_arg); 3721 } 3722 3723 static int 3724 bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt, 3725 struct spdk_memory_domain *dst_domain, void *dst_domain_ctx, 3726 struct iovec *src_iovs, uint32_t src_iovcnt, 3727 struct spdk_memory_domain *src_domain, void *src_domain_ctx, 3728 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3729 { 3730 struct spdk_io_channel *ch; 3731 struct nvme_poll_group *group = ctx; 3732 3733 ch = bdev_nvme_get_accel_channel(group); 3734 if (spdk_unlikely(ch == NULL)) { 3735 return -ENOMEM; 3736 } 3737 3738 return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch, 3739 dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx, 3740 src_iovs, src_iovcnt, src_domain, src_domain_ctx, 3741 cb_fn, cb_arg); 3742 } 3743 3744 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3745 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3746 .append_crc32c = bdev_nvme_append_crc32c, 3747 .append_copy = bdev_nvme_append_copy, 3748 .finish_sequence = bdev_nvme_finish_sequence, 3749 .reverse_sequence = bdev_nvme_reverse_sequence, 3750 .abort_sequence = bdev_nvme_abort_sequence, 3751 }; 3752 3753 static void 3754 bdev_nvme_poll_group_interrupt_cb(struct spdk_nvme_poll_group *group, void *ctx) 3755 { 3756 bdev_nvme_poll(ctx); 3757 } 3758 3759 static int 3760 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3761 { 3762 struct nvme_poll_group *group = ctx_buf; 3763 struct spdk_fd_group *fgrp; 3764 uint64_t period; 3765 int rc; 3766 3767 TAILQ_INIT(&group->qpair_list); 3768 3769 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3770 if (group->group == NULL) { 3771 return -1; 3772 } 3773 3774 period = spdk_interrupt_mode_is_enabled() ? 
0 : g_opts.nvme_ioq_poll_period_us; 3775 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, period); 3776 3777 if (group->poller == NULL) { 3778 spdk_nvme_poll_group_destroy(group->group); 3779 return -1; 3780 } 3781 3782 if (spdk_interrupt_mode_is_enabled()) { 3783 spdk_poller_register_interrupt(group->poller, NULL, NULL); 3784 3785 fgrp = spdk_nvme_poll_group_get_fd_group(group->group); 3786 if (fgrp == NULL) { 3787 spdk_nvme_poll_group_destroy(group->group); 3788 return -1; 3789 } 3790 3791 rc = spdk_nvme_poll_group_set_interrupt_callback(group->group, 3792 bdev_nvme_poll_group_interrupt_cb, group); 3793 if (rc != 0) { 3794 spdk_nvme_poll_group_destroy(group->group); 3795 return -1; 3796 } 3797 3798 group->intr = spdk_interrupt_register_fd_group(fgrp, "bdev_nvme_interrupt"); 3799 if (!group->intr) { 3800 spdk_nvme_poll_group_destroy(group->group); 3801 return -1; 3802 } 3803 } 3804 3805 return 0; 3806 } 3807 3808 static void 3809 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3810 { 3811 struct nvme_poll_group *group = ctx_buf; 3812 3813 assert(TAILQ_EMPTY(&group->qpair_list)); 3814 3815 if (group->accel_channel) { 3816 spdk_put_io_channel(group->accel_channel); 3817 } 3818 3819 if (spdk_interrupt_mode_is_enabled()) { 3820 spdk_interrupt_unregister(&group->intr); 3821 } 3822 3823 spdk_poller_unregister(&group->poller); 3824 if (spdk_nvme_poll_group_destroy(group->group)) { 3825 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3826 assert(false); 3827 } 3828 } 3829 3830 static struct spdk_io_channel * 3831 bdev_nvme_get_io_channel(void *ctx) 3832 { 3833 struct nvme_bdev *nbdev = ctx; 3834 3835 return spdk_get_io_channel(nbdev); 3836 } 3837 3838 static void * 3839 bdev_nvme_get_module_ctx(void *ctx) 3840 { 3841 struct nvme_bdev *nbdev = ctx; 3842 struct nvme_ns *nvme_ns; 3843 3844 if (!nbdev || nbdev->disk.module != &nvme_if) { 3845 return NULL; 3846 } 3847 3848 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3849 if (!nvme_ns) { 3850 return NULL; 3851 } 3852 3853 return nvme_ns->ns; 3854 } 3855 3856 static const char * 3857 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3858 { 3859 switch (ana_state) { 3860 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3861 return "optimized"; 3862 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3863 return "non_optimized"; 3864 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3865 return "inaccessible"; 3866 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3867 return "persistent_loss"; 3868 case SPDK_NVME_ANA_CHANGE_STATE: 3869 return "change"; 3870 default: 3871 return NULL; 3872 } 3873 } 3874 3875 static int 3876 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3877 { 3878 struct spdk_memory_domain **_domains = NULL; 3879 struct nvme_bdev *nbdev = ctx; 3880 struct nvme_ns *nvme_ns; 3881 int i = 0, _array_size = array_size; 3882 int rc = 0; 3883 3884 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3885 if (domains && array_size >= i) { 3886 _domains = &domains[i]; 3887 } else { 3888 _domains = NULL; 3889 } 3890 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3891 if (rc > 0) { 3892 i += rc; 3893 if (_array_size >= rc) { 3894 _array_size -= rc; 3895 } else { 3896 _array_size = 0; 3897 } 3898 } else if (rc < 0) { 3899 return rc; 3900 } 3901 } 3902 3903 return i; 3904 } 3905 3906 static const char * 3907 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3908 { 3909 if (nvme_ctrlr->destruct) { 3910 return "deleting"; 3911 } else if 
(spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3912 return "failed"; 3913 } else if (nvme_ctrlr->resetting) { 3914 return "resetting"; 3915 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3916 return "reconnect_is_delayed"; 3917 } else if (nvme_ctrlr->disabled) { 3918 return "disabled"; 3919 } else { 3920 return "enabled"; 3921 } 3922 } 3923 3924 void 3925 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3926 { 3927 struct spdk_nvme_transport_id *trid; 3928 const struct spdk_nvme_ctrlr_opts *opts; 3929 const struct spdk_nvme_ctrlr_data *cdata; 3930 struct nvme_path_id *path_id; 3931 int32_t numa_id; 3932 3933 spdk_json_write_object_begin(w); 3934 3935 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3936 3937 #ifdef SPDK_CONFIG_NVME_CUSE 3938 size_t cuse_name_size = 128; 3939 char cuse_name[cuse_name_size]; 3940 3941 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3942 if (rc == 0) { 3943 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3944 } 3945 #endif 3946 trid = &nvme_ctrlr->active_path_id->trid; 3947 spdk_json_write_named_object_begin(w, "trid"); 3948 nvme_bdev_dump_trid_json(trid, w); 3949 spdk_json_write_object_end(w); 3950 3951 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3952 if (path_id != NULL) { 3953 spdk_json_write_named_array_begin(w, "alternate_trids"); 3954 do { 3955 trid = &path_id->trid; 3956 spdk_json_write_object_begin(w); 3957 nvme_bdev_dump_trid_json(trid, w); 3958 spdk_json_write_object_end(w); 3959 3960 path_id = TAILQ_NEXT(path_id, link); 3961 } while (path_id != NULL); 3962 spdk_json_write_array_end(w); 3963 } 3964 3965 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3966 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3967 3968 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3969 spdk_json_write_named_object_begin(w, "host"); 3970 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3971 spdk_json_write_named_string(w, "addr", opts->src_addr); 3972 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3973 spdk_json_write_object_end(w); 3974 3975 numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr); 3976 if (numa_id != SPDK_ENV_NUMA_ID_ANY) { 3977 spdk_json_write_named_uint32(w, "numa_id", numa_id); 3978 } 3979 spdk_json_write_object_end(w); 3980 } 3981 3982 static void 3983 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3984 struct nvme_ns *nvme_ns) 3985 { 3986 struct spdk_nvme_ns *ns; 3987 struct spdk_nvme_ctrlr *ctrlr; 3988 const struct spdk_nvme_ctrlr_data *cdata; 3989 const struct spdk_nvme_transport_id *trid; 3990 union spdk_nvme_vs_register vs; 3991 const struct spdk_nvme_ns_data *nsdata; 3992 char buf[128]; 3993 3994 ns = nvme_ns->ns; 3995 if (ns == NULL) { 3996 return; 3997 } 3998 3999 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 4000 4001 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4002 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 4003 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 4004 4005 spdk_json_write_object_begin(w); 4006 4007 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4008 spdk_json_write_named_string(w, "pci_address", trid->traddr); 4009 } 4010 4011 spdk_json_write_named_object_begin(w, "trid"); 4012 4013 nvme_bdev_dump_trid_json(trid, w); 4014 4015 spdk_json_write_object_end(w); 4016 4017 #ifdef SPDK_CONFIG_NVME_CUSE 4018 size_t cuse_name_size = 128; 4019 char cuse_name[cuse_name_size]; 4020 4021 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 4022 cuse_name, 
&cuse_name_size); 4023 if (rc == 0) { 4024 spdk_json_write_named_string(w, "cuse_device", cuse_name); 4025 } 4026 #endif 4027 4028 spdk_json_write_named_object_begin(w, "ctrlr_data"); 4029 4030 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 4031 4032 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 4033 4034 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 4035 spdk_str_trim(buf); 4036 spdk_json_write_named_string(w, "model_number", buf); 4037 4038 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 4039 spdk_str_trim(buf); 4040 spdk_json_write_named_string(w, "serial_number", buf); 4041 4042 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 4043 spdk_str_trim(buf); 4044 spdk_json_write_named_string(w, "firmware_revision", buf); 4045 4046 if (cdata->subnqn[0] != '\0') { 4047 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 4048 } 4049 4050 spdk_json_write_named_object_begin(w, "oacs"); 4051 4052 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 4053 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 4054 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 4055 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 4056 4057 spdk_json_write_object_end(w); 4058 4059 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 4060 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 4061 4062 spdk_json_write_object_end(w); 4063 4064 spdk_json_write_named_object_begin(w, "vs"); 4065 4066 spdk_json_write_name(w, "nvme_version"); 4067 if (vs.bits.ter) { 4068 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 4069 } else { 4070 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 4071 } 4072 4073 spdk_json_write_object_end(w); 4074 4075 nsdata = spdk_nvme_ns_get_data(ns); 4076 4077 spdk_json_write_named_object_begin(w, "ns_data"); 4078 4079 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 4080 4081 if (cdata->cmic.ana_reporting) { 4082 spdk_json_write_named_string(w, "ana_state", 4083 _nvme_ana_state_str(nvme_ns->ana_state)); 4084 } 4085 4086 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 4087 4088 spdk_json_write_object_end(w); 4089 4090 if (cdata->oacs.security) { 4091 spdk_json_write_named_object_begin(w, "security"); 4092 4093 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 4094 4095 spdk_json_write_object_end(w); 4096 } 4097 4098 spdk_json_write_object_end(w); 4099 } 4100 4101 static const char * 4102 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 4103 { 4104 switch (nbdev->mp_policy) { 4105 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 4106 return "active_passive"; 4107 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 4108 return "active_active"; 4109 default: 4110 assert(false); 4111 return "invalid"; 4112 } 4113 } 4114 4115 static const char * 4116 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 4117 { 4118 switch (nbdev->mp_selector) { 4119 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 4120 return "round_robin"; 4121 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 4122 return "queue_depth"; 4123 default: 4124 assert(false); 4125 return "invalid"; 4126 } 4127 } 4128 4129 static int 4130 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 4131 { 4132 struct nvme_bdev *nbdev = ctx; 4133 struct nvme_ns *nvme_ns; 4134 4135 pthread_mutex_lock(&nbdev->mutex); 4136 spdk_json_write_named_array_begin(w, "nvme"); 4137 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, 
tailq) { 4138 nvme_namespace_info_json(w, nvme_ns); 4139 } 4140 spdk_json_write_array_end(w); 4141 spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nbdev)); 4142 if (nbdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 4143 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nbdev)); 4144 if (nbdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4145 spdk_json_write_named_uint32(w, "rr_min_io", nbdev->rr_min_io); 4146 } 4147 } 4148 pthread_mutex_unlock(&nbdev->mutex); 4149 4150 return 0; 4151 } 4152 4153 static void 4154 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 4155 { 4156 /* No config per bdev needed */ 4157 } 4158 4159 static uint64_t 4160 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 4161 { 4162 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 4163 struct nvme_io_path *io_path; 4164 struct nvme_poll_group *group; 4165 uint64_t spin_time = 0; 4166 4167 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4168 group = io_path->qpair->group; 4169 4170 if (!group || !group->collect_spin_stat) { 4171 continue; 4172 } 4173 4174 if (group->end_ticks != 0) { 4175 group->spin_ticks += (group->end_ticks - group->start_ticks); 4176 group->end_ticks = 0; 4177 } 4178 4179 spin_time += group->spin_ticks; 4180 group->start_ticks = 0; 4181 group->spin_ticks = 0; 4182 } 4183 4184 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 4185 } 4186 4187 static void 4188 bdev_nvme_reset_device_stat(void *ctx) 4189 { 4190 struct nvme_bdev *nbdev = ctx; 4191 4192 if (nbdev->err_stat != NULL) { 4193 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 4194 } 4195 } 4196 4197 /* JSON object keys should be lowercase, underscore-delimited strings.
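 * bdev_nvme_format_nvme_status() builds such a key by replacing " - ", "-" and
 * " " with "_" and lowercasing the result; e.g. a status string like
 * "Invalid Field in Command" (illustrative example, not necessarily an exact
 * SPDK string) would become "invalid_field_in_command".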
*/ 4198 static void 4199 bdev_nvme_format_nvme_status(char *dst, const char *src) 4200 { 4201 char tmp[256]; 4202 4203 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 4204 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 4205 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 4206 spdk_strlwr(dst); 4207 } 4208 4209 static void 4210 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 4211 { 4212 struct nvme_bdev *nbdev = ctx; 4213 struct spdk_nvme_status status = {}; 4214 uint16_t sct, sc; 4215 char status_json[256]; 4216 const char *status_str; 4217 4218 if (nbdev->err_stat == NULL) { 4219 return; 4220 } 4221 4222 spdk_json_write_named_object_begin(w, "nvme_error"); 4223 4224 spdk_json_write_named_object_begin(w, "status_type"); 4225 for (sct = 0; sct < 8; sct++) { 4226 if (nbdev->err_stat->status_type[sct] == 0) { 4227 continue; 4228 } 4229 status.sct = sct; 4230 4231 status_str = spdk_nvme_cpl_get_status_type_string(&status); 4232 assert(status_str != NULL); 4233 bdev_nvme_format_nvme_status(status_json, status_str); 4234 4235 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 4236 } 4237 spdk_json_write_object_end(w); 4238 4239 spdk_json_write_named_object_begin(w, "status_code"); 4240 for (sct = 0; sct < 4; sct++) { 4241 status.sct = sct; 4242 for (sc = 0; sc < 256; sc++) { 4243 if (nbdev->err_stat->status[sct][sc] == 0) { 4244 continue; 4245 } 4246 status.sc = sc; 4247 4248 status_str = spdk_nvme_cpl_get_status_string(&status); 4249 assert(status_str != NULL); 4250 bdev_nvme_format_nvme_status(status_json, status_str); 4251 4252 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 4253 } 4254 } 4255 spdk_json_write_object_end(w); 4256 4257 spdk_json_write_object_end(w); 4258 } 4259 4260 static bool 4261 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 4262 { 4263 struct nvme_bdev *nbdev = ctx; 4264 struct nvme_ns *nvme_ns; 4265 struct spdk_nvme_ctrlr *ctrlr; 4266 4267 if (!g_opts.allow_accel_sequence) { 4268 return false; 4269 } 4270 4271 switch (type) { 4272 case SPDK_BDEV_IO_TYPE_WRITE: 4273 case SPDK_BDEV_IO_TYPE_READ: 4274 break; 4275 default: 4276 return false; 4277 } 4278 4279 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 4280 assert(nvme_ns != NULL); 4281 4282 ctrlr = nvme_ns->ctrlr->ctrlr; 4283 assert(ctrlr != NULL); 4284 4285 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 4286 } 4287 4288 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 4289 .destruct = bdev_nvme_destruct, 4290 .submit_request = bdev_nvme_submit_request, 4291 .io_type_supported = bdev_nvme_io_type_supported, 4292 .get_io_channel = bdev_nvme_get_io_channel, 4293 .dump_info_json = bdev_nvme_dump_info_json, 4294 .write_config_json = bdev_nvme_write_config_json, 4295 .get_spin_time = bdev_nvme_get_spin_time, 4296 .get_module_ctx = bdev_nvme_get_module_ctx, 4297 .get_memory_domains = bdev_nvme_get_memory_domains, 4298 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 4299 .reset_device_stat = bdev_nvme_reset_device_stat, 4300 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 4301 }; 4302 4303 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 4304 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 4305 4306 static int 4307 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4308 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 4309 { 4310 struct spdk_nvme_ana_group_descriptor *copied_desc; 4311 uint8_t *orig_desc; 4312 uint32_t 
i, desc_size, copy_len; 4313 int rc = 0; 4314 4315 if (nvme_ctrlr->ana_log_page == NULL) { 4316 return -EINVAL; 4317 } 4318 4319 copied_desc = nvme_ctrlr->copied_ana_desc; 4320 4321 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 4322 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 4323 4324 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 4325 memcpy(copied_desc, orig_desc, copy_len); 4326 4327 rc = cb_fn(copied_desc, cb_arg); 4328 if (rc != 0) { 4329 break; 4330 } 4331 4332 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 4333 copied_desc->num_of_nsid * sizeof(uint32_t); 4334 orig_desc += desc_size; 4335 copy_len -= desc_size; 4336 } 4337 4338 return rc; 4339 } 4340 4341 static int 4342 nvme_ns_ana_transition_timedout(void *ctx) 4343 { 4344 struct nvme_ns *nvme_ns = ctx; 4345 4346 spdk_poller_unregister(&nvme_ns->anatt_timer); 4347 nvme_ns->ana_transition_timedout = true; 4348 4349 return SPDK_POLLER_BUSY; 4350 } 4351 4352 static void 4353 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4354 const struct spdk_nvme_ana_group_descriptor *desc) 4355 { 4356 const struct spdk_nvme_ctrlr_data *cdata; 4357 4358 nvme_ns->ana_group_id = desc->ana_group_id; 4359 nvme_ns->ana_state = desc->ana_state; 4360 nvme_ns->ana_state_updating = false; 4361 4362 switch (nvme_ns->ana_state) { 4363 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4364 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4365 nvme_ns->ana_transition_timedout = false; 4366 spdk_poller_unregister(&nvme_ns->anatt_timer); 4367 break; 4368 4369 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4370 case SPDK_NVME_ANA_CHANGE_STATE: 4371 if (nvme_ns->anatt_timer != NULL) { 4372 break; 4373 } 4374 4375 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4376 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 4377 nvme_ns, 4378 cdata->anatt * SPDK_SEC_TO_USEC); 4379 break; 4380 default: 4381 break; 4382 } 4383 } 4384 4385 static int 4386 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4387 { 4388 struct nvme_ns *nvme_ns = cb_arg; 4389 uint32_t i; 4390 4391 assert(nvme_ns->ns != NULL); 4392 4393 for (i = 0; i < desc->num_of_nsid; i++) { 4394 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4395 continue; 4396 } 4397 4398 _nvme_ns_set_ana_state(nvme_ns, desc); 4399 return 1; 4400 } 4401 4402 return 0; 4403 } 4404 4405 static int 4406 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4407 { 4408 int rc = 0; 4409 struct spdk_uuid new_uuid, namespace_uuid; 4410 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4411 /* This namespace UUID was generated using uuid_generate() method. 
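 * It acts as the fixed name-space argument to spdk_uuid_generate_sha1() below,
 * so the bdev UUID is a deterministic, SHA-1-based (UUIDv5-style) hash of the
 * controller serial number concatenated with the namespace ID; the same
 * sn/nsid pair always produces the same UUID.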
*/ 4412 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4413 int size; 4414 4415 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4416 4417 spdk_uuid_set_null(&new_uuid); 4418 spdk_uuid_set_null(&namespace_uuid); 4419 4420 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4421 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4422 return -EINVAL; 4423 } 4424 4425 spdk_uuid_parse(&namespace_uuid, namespace_str); 4426 4427 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4428 if (rc == 0) { 4429 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4430 } 4431 4432 return rc; 4433 } 4434 4435 static int 4436 nbdev_create(struct spdk_bdev *disk, const char *base_name, 4437 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4438 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx) 4439 { 4440 const struct spdk_uuid *uuid; 4441 const uint8_t *nguid; 4442 const struct spdk_nvme_ctrlr_data *cdata; 4443 const struct spdk_nvme_ns_data *nsdata; 4444 const struct spdk_nvme_ctrlr_opts *opts; 4445 enum spdk_nvme_csi csi; 4446 uint32_t atomic_bs, phys_bs, bs; 4447 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4448 int rc; 4449 4450 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4451 csi = spdk_nvme_ns_get_csi(ns); 4452 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4453 4454 switch (csi) { 4455 case SPDK_NVME_CSI_NVM: 4456 disk->product_name = "NVMe disk"; 4457 break; 4458 case SPDK_NVME_CSI_ZNS: 4459 disk->product_name = "NVMe ZNS disk"; 4460 disk->zoned = true; 4461 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4462 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4463 spdk_nvme_ns_get_extended_sector_size(ns); 4464 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4465 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4466 break; 4467 default: 4468 if (bdev_opts->allow_unrecognized_csi) { 4469 disk->product_name = "NVMe Passthrough disk"; 4470 break; 4471 } 4472 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4473 return -ENOTSUP; 4474 } 4475 4476 nguid = spdk_nvme_ns_get_nguid(ns); 4477 if (!nguid) { 4478 uuid = spdk_nvme_ns_get_uuid(ns); 4479 if (uuid) { 4480 disk->uuid = *uuid; 4481 } else if (g_opts.generate_uuids) { 4482 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4483 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4484 if (rc < 0) { 4485 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4486 return rc; 4487 } 4488 } 4489 } else { 4490 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4491 } 4492 4493 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4494 if (!disk->name) { 4495 return -ENOMEM; 4496 } 4497 4498 disk->write_cache = 0; 4499 if (cdata->vwc.present) { 4500 /* Enable if the Volatile Write Cache exists */ 4501 disk->write_cache = 1; 4502 } 4503 if (cdata->oncs.write_zeroes) { 4504 disk->max_write_zeroes = UINT16_MAX + 1; 4505 } 4506 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4507 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4508 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4509 disk->ctratt.raw = cdata->ctratt.raw; 4510 disk->nsid = spdk_nvme_ns_get_id(ns); 4511 /* NVMe driver will split one request into multiple requests 4512 * based on MDTS and stripe boundary, the bdev layer will use 4513 * max_segment_size and max_num_segments to split one big IO 4514 * into multiple requests, then small request 
can't run out 4515 * of NVMe internal requests data structure. 4516 */ 4517 if (opts && opts->io_queue_requests) { 4518 disk->max_num_segments = opts->io_queue_requests / 2; 4519 } 4520 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4521 /* The nvme driver will try to split I/O that have too many 4522 * SGEs, but it doesn't work if that last SGE doesn't end on 4523 * an aggregate total that is block aligned. The bdev layer has 4524 * a more robust splitting framework, so use that instead for 4525 * this case. (See issue #3269.) 4526 */ 4527 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4528 4529 if (disk->max_num_segments == 0) { 4530 disk->max_num_segments = max_sges; 4531 } else { 4532 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4533 } 4534 } 4535 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4536 4537 nsdata = spdk_nvme_ns_get_data(ns); 4538 bs = spdk_nvme_ns_get_sector_size(ns); 4539 atomic_bs = bs; 4540 phys_bs = bs; 4541 if (nsdata->nabo == 0) { 4542 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4543 atomic_bs = bs * (1 + nsdata->nawupf); 4544 } else { 4545 atomic_bs = bs * (1 + cdata->awupf); 4546 } 4547 } 4548 if (nsdata->nsfeat.optperf) { 4549 phys_bs = bs * (1 + nsdata->npwg); 4550 } 4551 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4552 4553 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4554 if (disk->md_len != 0) { 4555 disk->md_interleave = nsdata->flbas.extended; 4556 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4557 if (disk->dif_type != SPDK_DIF_DISABLE) { 4558 disk->dif_is_head_of_md = nsdata->dps.md_start; 4559 disk->dif_check_flags = bdev_opts->prchk_flags; 4560 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4561 } 4562 } 4563 4564 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4565 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4566 disk->acwu = 0; 4567 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4568 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4569 } else { 4570 disk->acwu = cdata->acwu + 1; /* 0-based */ 4571 } 4572 4573 if (cdata->oncs.copy) { 4574 /* For now bdev interface allows only single segment copy */ 4575 disk->max_copy = nsdata->mssrl; 4576 } 4577 4578 disk->ctxt = ctx; 4579 disk->fn_table = &nvmelib_fn_table; 4580 disk->module = &nvme_if; 4581 4582 disk->numa.id_valid = 1; 4583 disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 4584 4585 return 0; 4586 } 4587 4588 static struct nvme_bdev * 4589 nvme_bdev_alloc(void) 4590 { 4591 struct nvme_bdev *nbdev; 4592 int rc; 4593 4594 nbdev = calloc(1, sizeof(*nbdev)); 4595 if (!nbdev) { 4596 SPDK_ERRLOG("nbdev calloc() failed\n"); 4597 return NULL; 4598 } 4599 4600 if (g_opts.nvme_error_stat) { 4601 nbdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4602 if (!nbdev->err_stat) { 4603 SPDK_ERRLOG("err_stat calloc() failed\n"); 4604 free(nbdev); 4605 return NULL; 4606 } 4607 } 4608 4609 rc = pthread_mutex_init(&nbdev->mutex, NULL); 4610 if (rc != 0) { 4611 free(nbdev->err_stat); 4612 free(nbdev); 4613 return NULL; 4614 } 4615 4616 nbdev->ref = 1; 4617 nbdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4618 nbdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4619 nbdev->rr_min_io = UINT32_MAX; 4620 TAILQ_INIT(&nbdev->nvme_ns_list); 4621 4622 return nbdev; 4623 } 4624 4625 static int 4626 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4627 { 4628 struct nvme_bdev *nbdev; 4629 struct nvme_bdev_ctrlr *nbdev_ctrlr = 
nvme_ctrlr->nbdev_ctrlr; 4630 int rc; 4631 4632 nbdev = nvme_bdev_alloc(); 4633 if (nbdev == NULL) { 4634 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4635 return -ENOMEM; 4636 } 4637 4638 nbdev->opal = nvme_ctrlr->opal_dev != NULL; 4639 4640 rc = nbdev_create(&nbdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4641 nvme_ns->ns, &nvme_ctrlr->opts, nbdev); 4642 if (rc != 0) { 4643 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4644 nvme_bdev_free(nbdev); 4645 return rc; 4646 } 4647 4648 spdk_io_device_register(nbdev, 4649 bdev_nvme_create_bdev_channel_cb, 4650 bdev_nvme_destroy_bdev_channel_cb, 4651 sizeof(struct nvme_bdev_channel), 4652 nbdev->disk.name); 4653 4654 nvme_ns->bdev = nbdev; 4655 nbdev->nsid = nvme_ns->id; 4656 TAILQ_INSERT_TAIL(&nbdev->nvme_ns_list, nvme_ns, tailq); 4657 4658 pthread_mutex_lock(&g_bdev_nvme_mutex); 4659 4660 nbdev->nbdev_ctrlr = nbdev_ctrlr; 4661 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, nbdev, tailq); 4662 4663 rc = spdk_bdev_register(&nbdev->disk); 4664 if (rc != 0) { 4665 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4666 spdk_io_device_unregister(nbdev, NULL); 4667 nvme_ns->bdev = NULL; 4668 4669 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, nbdev, tailq); 4670 4671 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4672 4673 nvme_bdev_free(nbdev); 4674 return rc; 4675 } 4676 4677 pthread_mutex_unlock(&g_bdev_nvme_mutex); 4678 4679 return 0; 4680 } 4681 4682 static bool 4683 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4684 { 4685 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4686 const struct spdk_uuid *uuid1, *uuid2; 4687 4688 nsdata1 = spdk_nvme_ns_get_data(ns1); 4689 nsdata2 = spdk_nvme_ns_get_data(ns2); 4690 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4691 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4692 4693 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4694 nsdata1->eui64 == nsdata2->eui64 && 4695 ((uuid1 == NULL && uuid2 == NULL) || 4696 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4697 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4698 } 4699 4700 static bool 4701 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4702 struct spdk_nvme_ctrlr_opts *opts) 4703 { 4704 struct nvme_probe_skip_entry *entry; 4705 4706 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4707 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4708 return false; 4709 } 4710 } 4711 4712 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4713 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4714 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4715 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4716 opts->disable_read_ana_log_page = true; 4717 4718 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4719 4720 return true; 4721 } 4722 4723 static void 4724 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4725 { 4726 struct nvme_ctrlr *nvme_ctrlr = ctx; 4727 4728 if (spdk_nvme_cpl_is_error(cpl)) { 4729 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. 
sc is %u, sct is %u.\n", 4730 cpl->status.sc, cpl->status.sct); 4731 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4732 } else if (cpl->cdw0 & 0x1) { 4733 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n"); 4734 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4735 } 4736 } 4737 4738 static void 4739 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4740 struct spdk_nvme_qpair *qpair, uint16_t cid) 4741 { 4742 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4743 union spdk_nvme_csts_register csts; 4744 int rc; 4745 4746 assert(nvme_ctrlr->ctrlr == ctrlr); 4747 4748 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", 4749 ctrlr, qpair, cid); 4750 4751 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4752 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4753 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4754 * completion recursively. 4755 */ 4756 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4757 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4758 if (csts.bits.cfs) { 4759 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n"); 4760 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4761 return; 4762 } 4763 } 4764 4765 switch (g_opts.action_on_timeout) { 4766 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4767 if (qpair) { 4768 /* Don't send abort to ctrlr when ctrlr is not available. */ 4769 pthread_mutex_lock(&nvme_ctrlr->mutex); 4770 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4771 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4772 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n"); 4773 return; 4774 } 4775 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4776 4777 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4778 nvme_abort_cpl, nvme_ctrlr); 4779 if (rc == 0) { 4780 return; 4781 } 4782 4783 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. 
Resetting, rc is %d.\n", rc); 4784 } 4785 4786 /* FALLTHROUGH */ 4787 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4788 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4789 break; 4790 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4791 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n"); 4792 break; 4793 default: 4794 NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n"); 4795 break; 4796 } 4797 } 4798 4799 static struct nvme_ns * 4800 nvme_ns_alloc(void) 4801 { 4802 struct nvme_ns *nvme_ns; 4803 4804 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4805 if (nvme_ns == NULL) { 4806 return NULL; 4807 } 4808 4809 if (g_opts.io_path_stat) { 4810 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4811 if (nvme_ns->stat == NULL) { 4812 free(nvme_ns); 4813 return NULL; 4814 } 4815 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4816 } 4817 4818 return nvme_ns; 4819 } 4820 4821 static void 4822 nvme_ns_free(struct nvme_ns *nvme_ns) 4823 { 4824 free(nvme_ns->stat); 4825 free(nvme_ns); 4826 } 4827 4828 static void 4829 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4830 { 4831 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4832 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4833 4834 if (rc == 0) { 4835 nvme_ns->probe_ctx = NULL; 4836 nvme_ctrlr_get_ref(nvme_ctrlr); 4837 } else { 4838 pthread_mutex_lock(&nvme_ctrlr->mutex); 4839 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4840 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4841 4842 nvme_ns_free(nvme_ns); 4843 } 4844 4845 if (ctx) { 4846 ctx->populates_in_progress--; 4847 if (ctx->populates_in_progress == 0) { 4848 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4849 } 4850 } 4851 } 4852 4853 static void 4854 bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i, 4855 struct nvme_bdev *nbdev, 4856 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4857 { 4858 struct nvme_ns *nvme_ns = ctx; 4859 int rc; 4860 4861 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4862 if (rc != 0) { 4863 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4864 } 4865 4866 nvme_bdev_for_each_channel_continue(i, rc); 4867 } 4868 4869 static void 4870 bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i, 4871 struct nvme_bdev *nbdev, 4872 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4873 { 4874 struct nvme_ns *nvme_ns = ctx; 4875 struct nvme_io_path *io_path; 4876 4877 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4878 if (io_path != NULL) { 4879 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4880 } 4881 4882 nvme_bdev_for_each_channel_continue(i, 0); 4883 } 4884 4885 static void 4886 bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status) 4887 { 4888 struct nvme_ns *nvme_ns = ctx; 4889 4890 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4891 } 4892 4893 static void 4894 bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4895 { 4896 struct nvme_ns *nvme_ns = ctx; 4897 4898 if (status == 0) { 4899 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4900 } else { 4901 /* Delete the added io_paths and fail populating the namespace. 
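 * bdev_nvme_delete_io_path() is run on every nvme_bdev_channel to undo the
 * paths that were added, and bdev_nvme_add_io_path_failed() then completes
 * the population with an error via nvme_ctrlr_populate_namespace_done().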
*/ 4902 nvme_bdev_for_each_channel(nbdev, 4903 bdev_nvme_delete_io_path, 4904 nvme_ns, 4905 bdev_nvme_add_io_path_failed); 4906 } 4907 } 4908 4909 static int 4910 nvme_bdev_add_ns(struct nvme_bdev *nbdev, struct nvme_ns *nvme_ns) 4911 { 4912 struct nvme_ns *tmp_ns; 4913 const struct spdk_nvme_ns_data *nsdata; 4914 4915 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4916 if (!nsdata->nmic.can_share) { 4917 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4918 return -EINVAL; 4919 } 4920 4921 pthread_mutex_lock(&nbdev->mutex); 4922 4923 tmp_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 4924 assert(tmp_ns != NULL); 4925 4926 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4927 pthread_mutex_unlock(&nbdev->mutex); 4928 SPDK_ERRLOG("Namespaces are not identical.\n"); 4929 return -EINVAL; 4930 } 4931 4932 nbdev->ref++; 4933 TAILQ_INSERT_TAIL(&nbdev->nvme_ns_list, nvme_ns, tailq); 4934 nvme_ns->bdev = nbdev; 4935 4936 pthread_mutex_unlock(&nbdev->mutex); 4937 4938 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 4939 nvme_bdev_for_each_channel(nbdev, 4940 bdev_nvme_add_io_path, 4941 nvme_ns, 4942 bdev_nvme_add_io_path_done); 4943 4944 return 0; 4945 } 4946 4947 static void 4948 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4949 { 4950 struct spdk_nvme_ns *ns; 4951 struct nvme_bdev *bdev; 4952 int rc = 0; 4953 4954 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4955 if (!ns) { 4956 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id); 4957 rc = -EINVAL; 4958 goto done; 4959 } 4960 4961 nvme_ns->ns = ns; 4962 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4963 4964 if (nvme_ctrlr->ana_log_page != NULL) { 4965 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4966 } 4967 4968 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4969 if (bdev == NULL) { 4970 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4971 } else { 4972 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4973 if (rc == 0) { 4974 return; 4975 } 4976 } 4977 done: 4978 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4979 } 4980 4981 static void 4982 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4983 { 4984 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4985 4986 assert(nvme_ctrlr != NULL); 4987 4988 pthread_mutex_lock(&nvme_ctrlr->mutex); 4989 4990 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4991 4992 if (nvme_ns->bdev != NULL) { 4993 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4994 return; 4995 } 4996 4997 nvme_ns_free(nvme_ns); 4998 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4999 5000 nvme_ctrlr_put_ref(nvme_ctrlr); 5001 } 5002 5003 static void 5004 bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 5005 { 5006 struct nvme_ns *nvme_ns = ctx; 5007 5008 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 5009 } 5010 5011 static void 5012 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 5013 { 5014 struct nvme_bdev *nbdev; 5015 5016 if (nvme_ns->depopulating) { 5017 /* Maybe we received 2 AENs in a row */ 5018 return; 5019 } 5020 nvme_ns->depopulating = true; 5021 5022 spdk_poller_unregister(&nvme_ns->anatt_timer); 5023 5024 nbdev = nvme_ns->bdev; 5025 if (nbdev != NULL) { 5026 pthread_mutex_lock(&nbdev->mutex); 5027 5028 assert(nbdev->ref > 0); 5029 nbdev->ref--; 5030 if (nbdev->ref == 0) { 5031 pthread_mutex_unlock(&nbdev->mutex); 5032 5033 spdk_bdev_unregister(&nbdev->disk, NULL, NULL); 5034 } else { 5035 /* 
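The nvme_bdev still has other namespaces attached to it, so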
spdk_bdev_unregister() is not called until the last nvme_ns is 5036 * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list 5037 * and clear nvme_ns->bdev here. 5038 */ 5039 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5040 5041 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 5042 nvme_ns->bdev = NULL; 5043 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 5044 5045 pthread_mutex_unlock(&nbdev->mutex); 5046 5047 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 5048 * we call depopulate_namespace_done() to avoid use-after-free. 5049 */ 5050 nvme_bdev_for_each_channel(nbdev, 5051 bdev_nvme_delete_io_path, 5052 nvme_ns, 5053 bdev_nvme_delete_io_path_done); 5054 return; 5055 } 5056 } 5057 5058 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 5059 } 5060 5061 static void 5062 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 5063 struct nvme_async_probe_ctx *ctx) 5064 { 5065 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5066 struct nvme_ns *nvme_ns, *next; 5067 struct spdk_nvme_ns *ns; 5068 struct nvme_bdev *nbdev; 5069 uint32_t nsid; 5070 int rc; 5071 uint64_t num_sectors; 5072 5073 if (ctx) { 5074 /* Initialize this count to 1 to handle the populate functions 5075 * calling nvme_ctrlr_populate_namespace_done() immediately. 5076 */ 5077 ctx->populates_in_progress = 1; 5078 } 5079 5080 /* First loop over our existing namespaces and see if they have been 5081 * removed. */ 5082 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5083 while (nvme_ns != NULL) { 5084 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5085 5086 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 5087 /* NS is still there or added again. Its attributes may have changed. */ 5088 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 5089 if (nvme_ns->ns != ns) { 5090 assert(nvme_ns->ns == NULL); 5091 nvme_ns->ns = ns; 5092 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id); 5093 } 5094 5095 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 5096 nbdev = nvme_ns->bdev; 5097 assert(nbdev != NULL); 5098 if (nbdev->disk.blockcnt != num_sectors) { 5099 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 5100 "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 5101 nvme_ns->id, 5102 nbdev->disk.name, 5103 nbdev->disk.blockcnt, 5104 num_sectors); 5105 rc = spdk_bdev_notify_blockcnt_change(&nbdev->disk, num_sectors); 5106 if (rc != 0) { 5107 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5108 "Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 5109 nbdev->disk.name, rc); 5110 } 5111 } 5112 } else { 5113 /* Namespace was removed */ 5114 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5115 } 5116 5117 nvme_ns = next; 5118 } 5119 5120 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 5121 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5122 while (nsid != 0) { 5123 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5124 5125 if (nvme_ns == NULL) { 5126 /* Found a new one */ 5127 nvme_ns = nvme_ns_alloc(); 5128 if (nvme_ns == NULL) { 5129 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n"); 5130 /* This just fails to attach the namespace. It may work on a future attempt. 
*/ 5131 continue; 5132 } 5133 5134 nvme_ns->id = nsid; 5135 nvme_ns->ctrlr = nvme_ctrlr; 5136 5137 nvme_ns->bdev = NULL; 5138 5139 if (ctx) { 5140 ctx->populates_in_progress++; 5141 } 5142 nvme_ns->probe_ctx = ctx; 5143 5144 pthread_mutex_lock(&nvme_ctrlr->mutex); 5145 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 5146 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5147 5148 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 5149 } 5150 5151 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 5152 } 5153 5154 if (ctx) { 5155 /* Decrement this count now that the loop is over to account 5156 * for the one we started with. If the count is then 0, we 5157 * know any populate_namespace functions completed immediately, 5158 * so we'll kick the callback here. 5159 */ 5160 ctx->populates_in_progress--; 5161 if (ctx->populates_in_progress == 0) { 5162 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 5163 } 5164 } 5165 5166 } 5167 5168 static void 5169 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 5170 { 5171 struct nvme_ns *nvme_ns, *tmp; 5172 5173 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 5174 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5175 } 5176 } 5177 5178 static uint32_t 5179 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 5180 { 5181 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5182 const struct spdk_nvme_ctrlr_data *cdata; 5183 uint32_t nsid, ns_count = 0; 5184 5185 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5186 5187 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5188 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 5189 ns_count++; 5190 } 5191 5192 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5193 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 5194 sizeof(uint32_t); 5195 } 5196 5197 static int 5198 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 5199 void *cb_arg) 5200 { 5201 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 5202 struct nvme_ns *nvme_ns; 5203 uint32_t i, nsid; 5204 5205 for (i = 0; i < desc->num_of_nsid; i++) { 5206 nsid = desc->nsid[i]; 5207 if (nsid == 0) { 5208 continue; 5209 } 5210 5211 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5212 5213 if (nvme_ns == NULL) { 5214 /* Target told us that an inactive namespace had an ANA change */ 5215 continue; 5216 } 5217 5218 _nvme_ns_set_ana_state(nvme_ns, desc); 5219 } 5220 5221 return 0; 5222 } 5223 5224 static void 5225 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5226 { 5227 struct nvme_ns *nvme_ns; 5228 5229 spdk_free(nvme_ctrlr->ana_log_page); 5230 nvme_ctrlr->ana_log_page = NULL; 5231 5232 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5233 nvme_ns != NULL; 5234 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 5235 nvme_ns->ana_state_updating = false; 5236 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 5237 } 5238 } 5239 5240 static void 5241 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 5242 { 5243 struct nvme_ctrlr *nvme_ctrlr = ctx; 5244 5245 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 5246 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 5247 nvme_ctrlr); 5248 } else { 5249 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 5250 } 5251 5252 pthread_mutex_lock(&nvme_ctrlr->mutex); 5253 5254 assert(nvme_ctrlr->ana_log_page_updating == true); 5255 nvme_ctrlr->ana_log_page_updating = false; 5256 5257 if 
(nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 5258 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5259 5260 nvme_ctrlr_unregister(nvme_ctrlr); 5261 } else { 5262 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5263 5264 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 5265 } 5266 } 5267 5268 static int 5269 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5270 { 5271 uint32_t ana_log_page_size; 5272 int rc; 5273 5274 if (nvme_ctrlr->ana_log_page == NULL) { 5275 return -EINVAL; 5276 } 5277 5278 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5279 5280 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5281 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5282 "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5283 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5284 return -EINVAL; 5285 } 5286 5287 pthread_mutex_lock(&nvme_ctrlr->mutex); 5288 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 5289 nvme_ctrlr->ana_log_page_updating) { 5290 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5291 return -EBUSY; 5292 } 5293 5294 nvme_ctrlr->ana_log_page_updating = true; 5295 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5296 5297 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 5298 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5299 SPDK_NVME_GLOBAL_NS_TAG, 5300 nvme_ctrlr->ana_log_page, 5301 ana_log_page_size, 0, 5302 nvme_ctrlr_read_ana_log_page_done, 5303 nvme_ctrlr); 5304 if (rc != 0) { 5305 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 5306 } 5307 5308 return rc; 5309 } 5310 5311 static void 5312 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5313 { 5314 } 5315 5316 struct bdev_nvme_set_preferred_path_ctx { 5317 struct spdk_bdev_desc *desc; 5318 struct nvme_ns *nvme_ns; 5319 bdev_nvme_set_preferred_path_cb cb_fn; 5320 void *cb_arg; 5321 }; 5322 5323 static void 5324 bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5325 { 5326 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5327 5328 assert(ctx != NULL); 5329 assert(ctx->desc != NULL); 5330 assert(ctx->cb_fn != NULL); 5331 5332 spdk_bdev_close(ctx->desc); 5333 5334 ctx->cb_fn(ctx->cb_arg, status); 5335 5336 free(ctx); 5337 } 5338 5339 static void 5340 _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i, 5341 struct nvme_bdev *nbdev, 5342 struct nvme_bdev_channel *nbdev_ch, void *_ctx) 5343 { 5344 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5345 struct nvme_io_path *io_path, *prev; 5346 5347 prev = NULL; 5348 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5349 if (io_path->nvme_ns == ctx->nvme_ns) { 5350 break; 5351 } 5352 prev = io_path; 5353 } 5354 5355 if (io_path != NULL) { 5356 if (prev != NULL) { 5357 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 5358 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 5359 } 5360 5361 /* We can set io_path to nbdev_ch->current_io_path directly here. 5362 * However, it needs to be conditional. To simplify the code, 5363 * just clear nbdev_ch->current_io_path and let find_io_path() 5364 * fill it. 5365 * 5366 * Automatic failback may be disabled. Hence even if the io_path is 5367 * already at the head, clear nbdev_ch->current_io_path. 
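 * Only the position of the matched io_path changes here; the relative order
 * of the remaining io_paths in the channel's list is preserved.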
5368 */ 5369 bdev_nvme_clear_current_io_path(nbdev_ch); 5370 } 5371 5372 nvme_bdev_for_each_channel_continue(i, 0); 5373 } 5374 5375 static struct nvme_ns * 5376 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5377 { 5378 struct nvme_ns *nvme_ns, *prev; 5379 const struct spdk_nvme_ctrlr_data *cdata; 5380 5381 prev = NULL; 5382 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5383 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5384 5385 if (cdata->cntlid == cntlid) { 5386 break; 5387 } 5388 prev = nvme_ns; 5389 } 5390 5391 if (nvme_ns != NULL && prev != NULL) { 5392 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5393 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5394 } 5395 5396 return nvme_ns; 5397 } 5398 5399 /* This function supports only multipath mode. There is only a single I/O path 5400 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5401 * head of the I/O path list for each NVMe bdev channel. 5402 * 5403 * NVMe bdev channel may be acquired after completing this function. move the 5404 * matched namespace to the head of the namespace list for the NVMe bdev too. 5405 */ 5406 void 5407 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5408 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5409 { 5410 struct bdev_nvme_set_preferred_path_ctx *ctx; 5411 struct spdk_bdev *bdev; 5412 struct nvme_bdev *nbdev; 5413 int rc = 0; 5414 5415 assert(cb_fn != NULL); 5416 5417 ctx = calloc(1, sizeof(*ctx)); 5418 if (ctx == NULL) { 5419 SPDK_ERRLOG("Failed to alloc context.\n"); 5420 rc = -ENOMEM; 5421 goto err_alloc; 5422 } 5423 5424 ctx->cb_fn = cb_fn; 5425 ctx->cb_arg = cb_arg; 5426 5427 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5428 if (rc != 0) { 5429 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5430 goto err_open; 5431 } 5432 5433 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5434 5435 if (bdev->module != &nvme_if) { 5436 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5437 rc = -ENODEV; 5438 goto err_bdev; 5439 } 5440 5441 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5442 5443 pthread_mutex_lock(&nbdev->mutex); 5444 5445 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5446 if (ctx->nvme_ns == NULL) { 5447 pthread_mutex_unlock(&nbdev->mutex); 5448 5449 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5450 rc = -ENODEV; 5451 goto err_bdev; 5452 } 5453 5454 pthread_mutex_unlock(&nbdev->mutex); 5455 5456 nvme_bdev_for_each_channel(nbdev, 5457 _bdev_nvme_set_preferred_path, 5458 ctx, 5459 bdev_nvme_set_preferred_path_done); 5460 return; 5461 5462 err_bdev: 5463 spdk_bdev_close(ctx->desc); 5464 err_open: 5465 free(ctx); 5466 err_alloc: 5467 cb_fn(cb_arg, rc); 5468 } 5469 5470 struct bdev_nvme_set_multipath_policy_ctx { 5471 struct spdk_bdev_desc *desc; 5472 spdk_bdev_nvme_set_multipath_policy_cb cb_fn; 5473 void *cb_arg; 5474 }; 5475 5476 static void 5477 bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5478 { 5479 struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx; 5480 5481 assert(ctx != NULL); 5482 assert(ctx->desc != NULL); 5483 assert(ctx->cb_fn != NULL); 5484 5485 spdk_bdev_close(ctx->desc); 5486 5487 ctx->cb_fn(ctx->cb_arg, status); 5488 5489 free(ctx); 5490 } 5491 5492 static void 5493 _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i, 5494 struct nvme_bdev *nbdev, 5495 struct nvme_bdev_channel *nbdev_ch, void *ctx) 5496 { 5497 
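	/* Propagate the bdev-level policy, selector and rr_min_io threshold to this
	 * channel and drop the cached I/O path so that the next I/O re-selects a
	 * path under the new policy.
	 */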
nbdev_ch->mp_policy = nbdev->mp_policy; 5498 nbdev_ch->mp_selector = nbdev->mp_selector; 5499 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5500 bdev_nvme_clear_current_io_path(nbdev_ch); 5501 5502 nvme_bdev_for_each_channel_continue(i, 0); 5503 } 5504 5505 void 5506 spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy, 5507 enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5508 spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5509 { 5510 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5511 struct spdk_bdev *bdev; 5512 struct nvme_bdev *nbdev; 5513 int rc; 5514 5515 assert(cb_fn != NULL); 5516 5517 switch (policy) { 5518 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5519 break; 5520 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5521 switch (selector) { 5522 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5523 if (rr_min_io == UINT32_MAX) { 5524 rr_min_io = 1; 5525 } else if (rr_min_io == 0) { 5526 rc = -EINVAL; 5527 goto exit; 5528 } 5529 break; 5530 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5531 break; 5532 default: 5533 rc = -EINVAL; 5534 goto exit; 5535 } 5536 break; 5537 default: 5538 rc = -EINVAL; 5539 goto exit; 5540 } 5541 5542 ctx = calloc(1, sizeof(*ctx)); 5543 if (ctx == NULL) { 5544 SPDK_ERRLOG("Failed to alloc context.\n"); 5545 rc = -ENOMEM; 5546 goto exit; 5547 } 5548 5549 ctx->cb_fn = cb_fn; 5550 ctx->cb_arg = cb_arg; 5551 5552 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5553 if (rc != 0) { 5554 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5555 rc = -ENODEV; 5556 goto err_open; 5557 } 5558 5559 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5560 if (bdev->module != &nvme_if) { 5561 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5562 rc = -ENODEV; 5563 goto err_module; 5564 } 5565 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5566 5567 pthread_mutex_lock(&nbdev->mutex); 5568 nbdev->mp_policy = policy; 5569 nbdev->mp_selector = selector; 5570 nbdev->rr_min_io = rr_min_io; 5571 pthread_mutex_unlock(&nbdev->mutex); 5572 5573 nvme_bdev_for_each_channel(nbdev, 5574 _bdev_nvme_set_multipath_policy, 5575 ctx, 5576 bdev_nvme_set_multipath_policy_done); 5577 return; 5578 5579 err_module: 5580 spdk_bdev_close(ctx->desc); 5581 err_open: 5582 free(ctx); 5583 exit: 5584 cb_fn(cb_arg, rc); 5585 } 5586 5587 static void 5588 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5589 { 5590 struct nvme_ctrlr *nvme_ctrlr = arg; 5591 union spdk_nvme_async_event_completion event; 5592 5593 if (spdk_nvme_cpl_is_error(cpl)) { 5594 SPDK_WARNLOG("AER request execute failed\n"); 5595 return; 5596 } 5597 5598 event.raw = cpl->cdw0; 5599 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5600 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5601 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5602 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5603 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5604 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5605 } 5606 } 5607 5608 static void 5609 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5610 { 5611 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5612 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5613 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5614 free(ctx->base_name); 5615 free(ctx); 5616 } 5617 5618 static void 5619 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5620 { 5621 if (ctx->cb_fn) { 5622 
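		/* Report the number of bdevs exposed by this attach (ctx->reported_bdevs)
		 * together with the overall result to the callback supplied by the caller
		 * that initiated the attach.
		 */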
ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5623 } 5624 5625 ctx->namespaces_populated = true; 5626 if (ctx->probe_done) { 5627 /* The probe was already completed, so we need to free the context 5628 * here. This can happen for cases like OCSSD, where we need to 5629 * send additional commands to the SSD after attach. 5630 */ 5631 free_nvme_async_probe_ctx(ctx); 5632 } 5633 } 5634 5635 static int 5636 bdev_nvme_remove_poller(void *ctx) 5637 { 5638 struct spdk_nvme_transport_id trid_pcie; 5639 5640 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5641 spdk_poller_unregister(&g_hotplug_poller); 5642 return SPDK_POLLER_IDLE; 5643 } 5644 5645 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5646 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5647 5648 if (spdk_nvme_scan_attached(&trid_pcie)) { 5649 SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n"); 5650 } 5651 5652 return SPDK_POLLER_BUSY; 5653 } 5654 5655 static void 5656 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5657 struct nvme_async_probe_ctx *ctx) 5658 { 5659 struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid; 5660 5661 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 5662 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n", 5663 trid->traddr, trid->trsvcid); 5664 } else { 5665 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n"); 5666 } 5667 5668 spdk_io_device_register(nvme_ctrlr, 5669 bdev_nvme_create_ctrlr_channel_cb, 5670 bdev_nvme_destroy_ctrlr_channel_cb, 5671 sizeof(struct nvme_ctrlr_channel), 5672 nvme_ctrlr->nbdev_ctrlr->name); 5673 5674 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5675 5676 if (g_hotplug_poller == NULL) { 5677 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5678 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5679 } 5680 } 5681 5682 static void 5683 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5684 { 5685 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5686 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5687 5688 nvme_ctrlr->probe_ctx = NULL; 5689 5690 if (spdk_nvme_cpl_is_error(cpl)) { 5691 nvme_ctrlr_delete(nvme_ctrlr); 5692 5693 if (ctx != NULL) { 5694 ctx->reported_bdevs = 0; 5695 populate_namespaces_cb(ctx, -1); 5696 } 5697 return; 5698 } 5699 5700 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5701 } 5702 5703 static int 5704 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5705 struct nvme_async_probe_ctx *ctx) 5706 { 5707 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5708 const struct spdk_nvme_ctrlr_data *cdata; 5709 uint32_t ana_log_page_size; 5710 5711 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5712 5713 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5714 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5715 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5716 sizeof(uint32_t); 5717 5718 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5719 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 5720 if (nvme_ctrlr->ana_log_page == NULL) { 5721 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n"); 5722 return -ENXIO; 5723 } 5724 5725 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5726 * Hence copy each descriptor to a temporary area when parsing it. 5727 * 5728 * Allocate a buffer whose size is as large as ANA log page buffer because 5729 * we do not know the size of a descriptor until actually reading it. 
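 * (Each descriptor occupies sizeof(struct spdk_nvme_ana_group_descriptor) +
 * num_of_nsid * sizeof(uint32_t) bytes; bdev_nvme_parse_ana_log_page() uses the
 * same formula to advance through the page.)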
5730 */ 5731 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5732 if (nvme_ctrlr->copied_ana_desc == NULL) { 5733 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n"); 5734 return -ENOMEM; 5735 } 5736 5737 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5738 5739 nvme_ctrlr->probe_ctx = ctx; 5740 5741 /* Then, set the read size only to include the current active namespaces. */ 5742 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5743 5744 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5745 NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5746 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5747 return -EINVAL; 5748 } 5749 5750 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5751 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5752 SPDK_NVME_GLOBAL_NS_TAG, 5753 nvme_ctrlr->ana_log_page, 5754 ana_log_page_size, 0, 5755 nvme_ctrlr_init_ana_log_page_done, 5756 nvme_ctrlr); 5757 } 5758 5759 /* hostnqn and subnqn were already verified before attaching a controller. 5760 * Hence check only the multipath capability and cntlid here. 5761 */ 5762 static bool 5763 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5764 { 5765 struct nvme_ctrlr *tmp; 5766 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5767 5768 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5769 5770 if (!cdata->cmic.multi_ctrlr) { 5771 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5772 return false; 5773 } 5774 5775 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5776 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5777 5778 if (!tmp_cdata->cmic.multi_ctrlr) { 5779 NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid); 5780 return false; 5781 } 5782 if (cdata->cntlid == tmp_cdata->cntlid) { 5783 NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5784 return false; 5785 } 5786 } 5787 5788 return true; 5789 } 5790 5791 5792 static int 5793 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5794 { 5795 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5796 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5797 struct nvme_ctrlr *nctrlr; 5798 int rc = 0; 5799 5800 pthread_mutex_lock(&g_bdev_nvme_mutex); 5801 5802 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5803 if (nbdev_ctrlr != NULL) { 5804 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5805 rc = -EINVAL; 5806 goto exit; 5807 } 5808 TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5809 if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) { 5810 /* All controllers with the same name must be configured the same 5811 * way, either for multipath or failover. If the configuration doesn't 5812 * match - report error. 
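 * (The mismatching controller is rejected with -EINVAL and is not added to
 * the existing nvme_bdev_ctrlr.)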
5813 */ 5814 rc = -EINVAL; 5815 goto exit; 5816 } 5817 } 5818 } else { 5819 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5820 if (nbdev_ctrlr == NULL) { 5821 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n"); 5822 rc = -ENOMEM; 5823 goto exit; 5824 } 5825 nbdev_ctrlr->name = strdup(name); 5826 if (nbdev_ctrlr->name == NULL) { 5827 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n"); 5828 free(nbdev_ctrlr); 5829 goto exit; 5830 } 5831 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5832 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5833 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5834 } 5835 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5836 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5837 exit: 5838 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5839 return rc; 5840 } 5841 5842 static int 5843 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5844 const char *name, 5845 const struct spdk_nvme_transport_id *trid, 5846 struct nvme_async_probe_ctx *ctx) 5847 { 5848 struct nvme_ctrlr *nvme_ctrlr; 5849 struct nvme_path_id *path_id; 5850 const struct spdk_nvme_ctrlr_data *cdata; 5851 struct spdk_event_handler_opts opts = { 5852 .opts_size = SPDK_SIZEOF(&opts, fd_type), 5853 }; 5854 uint64_t period; 5855 int fd, rc; 5856 5857 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5858 if (nvme_ctrlr == NULL) { 5859 SPDK_ERRLOG("Failed to allocate device struct\n"); 5860 return -ENOMEM; 5861 } 5862 5863 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5864 if (rc != 0) { 5865 free(nvme_ctrlr); 5866 return rc; 5867 } 5868 5869 TAILQ_INIT(&nvme_ctrlr->trids); 5870 TAILQ_INIT(&nvme_ctrlr->pending_resets); 5871 RB_INIT(&nvme_ctrlr->namespaces); 5872 5873 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5874 if (ctx != NULL) { 5875 if (ctx->drv_opts.tls_psk != NULL) { 5876 nvme_ctrlr->psk = spdk_keyring_get_key( 5877 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5878 if (nvme_ctrlr->psk == NULL) { 5879 /* Could only happen if the key was removed in the meantime */ 5880 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5881 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5882 rc = -ENOKEY; 5883 goto err; 5884 } 5885 } 5886 5887 if (ctx->drv_opts.dhchap_key != NULL) { 5888 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5889 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5890 if (nvme_ctrlr->dhchap_key == NULL) { 5891 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5892 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5893 rc = -ENOKEY; 5894 goto err; 5895 } 5896 } 5897 5898 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5899 nvme_ctrlr->dhchap_ctrlr_key = 5900 spdk_keyring_get_key( 5901 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5902 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5903 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5904 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5905 rc = -ENOKEY; 5906 goto err; 5907 } 5908 } 5909 } 5910 5911 /* Check if we manage to enable interrupts on the controller. 
*/ 5912 if (spdk_interrupt_mode_is_enabled() && ctx != NULL && !ctx->drv_opts.enable_interrupts) { 5913 SPDK_ERRLOG("Failed to enable interrupts on the controller\n"); 5914 rc = -ENOTSUP; 5915 goto err; 5916 } 5917 5918 path_id = calloc(1, sizeof(*path_id)); 5919 if (path_id == NULL) { 5920 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5921 rc = -ENOMEM; 5922 goto err; 5923 } 5924 5925 path_id->trid = *trid; 5926 if (ctx != NULL) { 5927 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5928 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5929 } 5930 nvme_ctrlr->active_path_id = path_id; 5931 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5932 5933 nvme_ctrlr->thread = spdk_get_thread(); 5934 nvme_ctrlr->ctrlr = ctrlr; 5935 nvme_ctrlr->ref = 1; 5936 5937 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5938 SPDK_ERRLOG("OCSSDs are not supported"); 5939 rc = -ENOTSUP; 5940 goto err; 5941 } 5942 5943 if (ctx != NULL) { 5944 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5945 } else { 5946 spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5947 } 5948 5949 period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_adminq_poll_period_us; 5950 5951 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5952 period); 5953 5954 if (spdk_interrupt_mode_is_enabled()) { 5955 spdk_poller_register_interrupt(nvme_ctrlr->adminq_timer_poller, NULL, NULL); 5956 5957 fd = spdk_nvme_ctrlr_get_admin_qp_fd(nvme_ctrlr->ctrlr, &opts); 5958 if (fd < 0) { 5959 rc = fd; 5960 goto err; 5961 } 5962 5963 nvme_ctrlr->intr = SPDK_INTERRUPT_REGISTER_EXT(fd, bdev_nvme_poll_adminq, 5964 nvme_ctrlr, &opts); 5965 if (!nvme_ctrlr->intr) { 5966 rc = -EINVAL; 5967 goto err; 5968 } 5969 } 5970 5971 if (g_opts.timeout_us > 0) { 5972 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5973 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5974 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5975 g_opts.timeout_us : g_opts.timeout_admin_us; 5976 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5977 adm_timeout_us, timeout_cb, nvme_ctrlr); 5978 } 5979 5980 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5981 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5982 5983 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5984 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5985 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5986 } 5987 5988 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5989 if (rc != 0) { 5990 goto err; 5991 } 5992 5993 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5994 5995 if (cdata->cmic.ana_reporting) { 5996 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5997 if (rc == 0) { 5998 return 0; 5999 } 6000 } else { 6001 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 6002 return 0; 6003 } 6004 6005 err: 6006 nvme_ctrlr_delete(nvme_ctrlr); 6007 return rc; 6008 } 6009 6010 void 6011 spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts) 6012 { 6013 opts->prchk_flags = 0; 6014 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 6015 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 6016 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 6017 opts->multipath = true; 6018 } 6019 6020 static void 6021 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6022 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 6023 { 6024 char *name; 6025 6026 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 6027 if (!name) { 6028 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 6029 return; 6030 } 6031 6032 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 6033 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 6034 } else { 6035 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 6036 } 6037 6038 free(name); 6039 } 6040 6041 static void 6042 _nvme_ctrlr_destruct(void *ctx) 6043 { 6044 struct nvme_ctrlr *nvme_ctrlr = ctx; 6045 6046 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 6047 nvme_ctrlr_put_ref(nvme_ctrlr); 6048 } 6049 6050 static int 6051 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 6052 { 6053 struct nvme_probe_skip_entry *entry; 6054 6055 /* The controller's destruction was already started */ 6056 if (nvme_ctrlr->destruct) { 6057 return -EALREADY; 6058 } 6059 6060 if (!hotplug && 6061 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 6062 entry = calloc(1, sizeof(*entry)); 6063 if (!entry) { 6064 return -ENOMEM; 6065 } 6066 entry->trid = nvme_ctrlr->active_path_id->trid; 6067 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 6068 } 6069 6070 nvme_ctrlr->destruct = true; 6071 return 0; 6072 } 6073 6074 static int 6075 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 6076 { 6077 int rc; 6078 6079 pthread_mutex_lock(&nvme_ctrlr->mutex); 6080 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 6081 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6082 6083 if (rc == 0) { 6084 _nvme_ctrlr_destruct(nvme_ctrlr); 6085 } else if (rc == -EALREADY) { 6086 rc = 0; 6087 } 6088 6089 return rc; 6090 } 6091 6092 static void 6093 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 6094 { 6095 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 6096 6097 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 6098 } 6099 6100 static int 6101 bdev_nvme_hotplug_probe(void *arg) 6102 { 6103 if (g_hotplug_probe_ctx == NULL) { 6104 
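		/* No asynchronous hotplug probe is in flight, so there is nothing to poll;
		 * unregister this poller until bdev_nvme_hotplug() starts a new probe.
		 */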
spdk_poller_unregister(&g_hotplug_probe_poller); 6105 return SPDK_POLLER_IDLE; 6106 } 6107 6108 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 6109 g_hotplug_probe_ctx = NULL; 6110 spdk_poller_unregister(&g_hotplug_probe_poller); 6111 } 6112 6113 return SPDK_POLLER_BUSY; 6114 } 6115 6116 static int 6117 bdev_nvme_hotplug(void *arg) 6118 { 6119 struct spdk_nvme_transport_id trid_pcie; 6120 6121 if (g_hotplug_probe_ctx) { 6122 return SPDK_POLLER_BUSY; 6123 } 6124 6125 memset(&trid_pcie, 0, sizeof(trid_pcie)); 6126 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 6127 6128 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 6129 hotplug_probe_cb, attach_cb, NULL); 6130 6131 if (g_hotplug_probe_ctx) { 6132 assert(g_hotplug_probe_poller == NULL); 6133 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 6134 } 6135 6136 return SPDK_POLLER_BUSY; 6137 } 6138 6139 void 6140 spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts, size_t opts_size) 6141 { 6142 if (!opts) { 6143 SPDK_ERRLOG("opts should not be NULL\n"); 6144 return; 6145 } 6146 6147 if (!opts_size) { 6148 SPDK_ERRLOG("opts_size should not be zero value\n"); 6149 return; 6150 } 6151 6152 opts->opts_size = opts_size; 6153 6154 #define SET_FIELD(field, defval) \ 6155 opts->field = SPDK_GET_FIELD(&g_opts, field, defval, opts_size); \ 6156 6157 SET_FIELD(action_on_timeout, 0); 6158 SET_FIELD(keep_alive_timeout_ms, 0); 6159 SET_FIELD(timeout_us, 0); 6160 SET_FIELD(timeout_admin_us, 0); 6161 SET_FIELD(transport_retry_count, 0); 6162 SET_FIELD(arbitration_burst, 0); 6163 SET_FIELD(low_priority_weight, 0); 6164 SET_FIELD(medium_priority_weight, 0); 6165 SET_FIELD(high_priority_weight, 0); 6166 SET_FIELD(io_queue_requests, 0); 6167 SET_FIELD(nvme_adminq_poll_period_us, 0); 6168 SET_FIELD(nvme_ioq_poll_period_us, 0); 6169 SET_FIELD(delay_cmd_submit, 0); 6170 SET_FIELD(bdev_retry_count, 0); 6171 SET_FIELD(ctrlr_loss_timeout_sec, 0); 6172 SET_FIELD(reconnect_delay_sec, 0); 6173 SET_FIELD(fast_io_fail_timeout_sec, 0); 6174 SET_FIELD(transport_ack_timeout, 0); 6175 SET_FIELD(disable_auto_failback, false); 6176 SET_FIELD(generate_uuids, false); 6177 SET_FIELD(transport_tos, 0); 6178 SET_FIELD(nvme_error_stat, false); 6179 SET_FIELD(io_path_stat, false); 6180 SET_FIELD(allow_accel_sequence, false); 6181 SET_FIELD(rdma_srq_size, 0); 6182 SET_FIELD(rdma_max_cq_size, 0); 6183 SET_FIELD(rdma_cm_event_timeout_ms, 0); 6184 SET_FIELD(dhchap_digests, 0); 6185 SET_FIELD(dhchap_dhgroups, 0); 6186 6187 #undef SET_FIELD 6188 6189 /* Do not remove this statement, you should always update this statement when you adding a new field, 6190 * and do not forget to add the SET_FIELD statement for your added field. 
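	 * The static assert below pins sizeof(struct spdk_bdev_nvme_opts) to the currently
	 * expected value, so adding a field changes the size and breaks the build until both
	 * the assert and the SET_FIELD list above are updated.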
*/ 6191 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 120, "Incorrect size"); 6192 } 6193 6194 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6195 uint32_t reconnect_delay_sec, 6196 uint32_t fast_io_fail_timeout_sec); 6197 6198 static int 6199 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 6200 { 6201 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 6202 /* Can't set timeout_admin_us without also setting timeout_us */ 6203 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 6204 return -EINVAL; 6205 } 6206 6207 if (opts->bdev_retry_count < -1) { 6208 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 6209 return -EINVAL; 6210 } 6211 6212 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 6213 opts->reconnect_delay_sec, 6214 opts->fast_io_fail_timeout_sec)) { 6215 return -EINVAL; 6216 } 6217 6218 return 0; 6219 } 6220 6221 int 6222 spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 6223 { 6224 if (!opts) { 6225 SPDK_ERRLOG("opts cannot be NULL\n"); 6226 return -1; 6227 } 6228 6229 if (!opts->opts_size) { 6230 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 6231 return -1; 6232 } 6233 6234 int ret; 6235 6236 ret = bdev_nvme_validate_opts(opts); 6237 if (ret) { 6238 SPDK_WARNLOG("Failed to set nvme opts.\n"); 6239 return ret; 6240 } 6241 6242 if (g_bdev_nvme_init_thread != NULL) { 6243 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6244 return -EPERM; 6245 } 6246 } 6247 6248 if (opts->rdma_srq_size != 0 || 6249 opts->rdma_max_cq_size != 0 || 6250 opts->rdma_cm_event_timeout_ms != 0) { 6251 struct spdk_nvme_transport_opts drv_opts; 6252 6253 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 6254 if (opts->rdma_srq_size != 0) { 6255 drv_opts.rdma_srq_size = opts->rdma_srq_size; 6256 } 6257 if (opts->rdma_max_cq_size != 0) { 6258 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 6259 } 6260 if (opts->rdma_cm_event_timeout_ms != 0) { 6261 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 6262 } 6263 6264 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 6265 if (ret) { 6266 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 6267 return ret; 6268 } 6269 } 6270 6271 #define SET_FIELD(field, defval) \ 6272 g_opts.field = SPDK_GET_FIELD(opts, field, defval, opts->opts_size); \ 6273 6274 SET_FIELD(action_on_timeout, 0); 6275 SET_FIELD(keep_alive_timeout_ms, 0); 6276 SET_FIELD(timeout_us, 0); 6277 SET_FIELD(timeout_admin_us, 0); 6278 SET_FIELD(transport_retry_count, 0); 6279 SET_FIELD(arbitration_burst, 0); 6280 SET_FIELD(low_priority_weight, 0); 6281 SET_FIELD(medium_priority_weight, 0); 6282 SET_FIELD(high_priority_weight, 0); 6283 SET_FIELD(io_queue_requests, 0); 6284 SET_FIELD(nvme_adminq_poll_period_us, 0); 6285 SET_FIELD(nvme_ioq_poll_period_us, 0); 6286 SET_FIELD(delay_cmd_submit, 0); 6287 SET_FIELD(bdev_retry_count, 0); 6288 SET_FIELD(ctrlr_loss_timeout_sec, 0); 6289 SET_FIELD(reconnect_delay_sec, 0); 6290 SET_FIELD(fast_io_fail_timeout_sec, 0); 6291 SET_FIELD(transport_ack_timeout, 0); 6292 SET_FIELD(disable_auto_failback, false); 6293 SET_FIELD(generate_uuids, false); 6294 SET_FIELD(transport_tos, 0); 6295 SET_FIELD(nvme_error_stat, false); 6296 SET_FIELD(io_path_stat, false); 6297 SET_FIELD(allow_accel_sequence, false); 6298 SET_FIELD(rdma_srq_size, 0); 6299 SET_FIELD(rdma_max_cq_size, 0); 6300 SET_FIELD(rdma_cm_event_timeout_ms, 0); 6301 
SET_FIELD(dhchap_digests, 0); 6302 SET_FIELD(dhchap_dhgroups, 0); 6303 6304 g_opts.opts_size = opts->opts_size; 6305 6306 #undef SET_FIELD 6307 6308 return 0; 6309 } 6310 6311 struct set_nvme_hotplug_ctx { 6312 uint64_t period_us; 6313 bool enabled; 6314 spdk_msg_fn fn; 6315 void *fn_ctx; 6316 }; 6317 6318 static void 6319 set_nvme_hotplug_period_cb(void *_ctx) 6320 { 6321 struct set_nvme_hotplug_ctx *ctx = _ctx; 6322 6323 spdk_poller_unregister(&g_hotplug_poller); 6324 if (ctx->enabled) { 6325 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 6326 } else { 6327 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 6328 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 6329 } 6330 6331 g_nvme_hotplug_poll_period_us = ctx->period_us; 6332 g_nvme_hotplug_enabled = ctx->enabled; 6333 if (ctx->fn) { 6334 ctx->fn(ctx->fn_ctx); 6335 } 6336 6337 free(ctx); 6338 } 6339 6340 int 6341 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 6342 { 6343 struct set_nvme_hotplug_ctx *ctx; 6344 6345 if (enabled == true && !spdk_process_is_primary()) { 6346 return -EPERM; 6347 } 6348 6349 ctx = calloc(1, sizeof(*ctx)); 6350 if (ctx == NULL) { 6351 return -ENOMEM; 6352 } 6353 6354 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 6355 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 6356 ctx->enabled = enabled; 6357 ctx->fn = cb; 6358 ctx->fn_ctx = cb_ctx; 6359 6360 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 6361 return 0; 6362 } 6363 6364 static void 6365 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 6366 struct nvme_async_probe_ctx *ctx) 6367 { 6368 struct nvme_ns *nvme_ns; 6369 struct nvme_bdev *nvme_bdev; 6370 size_t j; 6371 6372 assert(nvme_ctrlr != NULL); 6373 6374 if (ctx->names == NULL) { 6375 ctx->reported_bdevs = 0; 6376 populate_namespaces_cb(ctx, 0); 6377 return; 6378 } 6379 6380 /* 6381 * Report the new bdevs that were created in this call. 6382 * There can be more than one bdev per NVMe controller. 6383 */ 6384 j = 0; 6385 6386 pthread_mutex_lock(&nvme_ctrlr->mutex); 6387 6388 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 6389 while (nvme_ns != NULL) { 6390 nvme_bdev = nvme_ns->bdev; 6391 if (j < ctx->max_bdevs) { 6392 ctx->names[j] = nvme_bdev->disk.name; 6393 j++; 6394 } else { 6395 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6396 6397 NVME_CTRLR_ERRLOG(nvme_ctrlr, 6398 "Maximum number of namespaces supported per NVMe controller is %du. " 6399 "Unable to return all names of created bdevs\n", 6400 ctx->max_bdevs); 6401 ctx->reported_bdevs = 0; 6402 populate_namespaces_cb(ctx, -ERANGE); 6403 return; 6404 } 6405 6406 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 6407 } 6408 6409 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6410 6411 ctx->reported_bdevs = j; 6412 populate_namespaces_cb(ctx, 0); 6413 } 6414 6415 static int 6416 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6417 struct spdk_nvme_ctrlr *new_ctrlr, 6418 struct spdk_nvme_transport_id *trid) 6419 { 6420 struct nvme_path_id *tmp_trid; 6421 6422 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6423 NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n"); 6424 return -ENOTSUP; 6425 } 6426 6427 /* Currently we only support failover to the same transport type. 
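	 * For example, adding a TCP path to a controller whose active path is RDMA is
	 * rejected with -EINVAL.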
	 */
	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
		NVME_CTRLR_WARNLOG(nvme_ctrlr,
				   "Failover from trtype: %s to a different trtype: %s is not supported currently\n",
				   spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
				   spdk_nvme_transport_id_trtype_str(trid->trtype));
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		NVME_CTRLR_WARNLOG(nvme_ctrlr,
				   "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
				   nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n",
					   trid->traddr, trid->subnqn);
			return -EALREADY;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid so that it is not replaced until it fails. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is considered failed if its last failed time is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This handles the case where a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking that it can access the same
 * namespaces as the primary path, it is left disconnected until failover occurs.
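 * Note that only the transport ID of the secondary path is stored; the temporary
 * controller handle passed in as new_ctrlr is always detached below via spdk_nvme_detach().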
6519 */ 6520 static int 6521 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6522 struct spdk_nvme_ctrlr *new_ctrlr, 6523 struct spdk_nvme_transport_id *trid) 6524 { 6525 int rc; 6526 6527 assert(nvme_ctrlr != NULL); 6528 6529 pthread_mutex_lock(&nvme_ctrlr->mutex); 6530 6531 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 6532 if (rc != 0) { 6533 goto exit; 6534 } 6535 6536 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 6537 if (rc != 0) { 6538 goto exit; 6539 } 6540 6541 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 6542 6543 exit: 6544 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6545 6546 spdk_nvme_detach(new_ctrlr); 6547 6548 return rc; 6549 } 6550 6551 static void 6552 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6553 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6554 { 6555 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6556 struct nvme_async_probe_ctx *ctx; 6557 int rc; 6558 6559 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6560 ctx->ctrlr_attached = true; 6561 6562 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6563 if (rc != 0) { 6564 ctx->reported_bdevs = 0; 6565 populate_namespaces_cb(ctx, rc); 6566 } 6567 } 6568 6569 6570 static void 6571 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6572 struct spdk_nvme_ctrlr *ctrlr, 6573 const struct spdk_nvme_ctrlr_opts *opts) 6574 { 6575 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6576 struct nvme_ctrlr *nvme_ctrlr; 6577 struct nvme_async_probe_ctx *ctx; 6578 int rc; 6579 6580 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6581 ctx->ctrlr_attached = true; 6582 6583 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6584 if (nvme_ctrlr) { 6585 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6586 } else { 6587 rc = -ENODEV; 6588 } 6589 6590 ctx->reported_bdevs = 0; 6591 populate_namespaces_cb(ctx, rc); 6592 } 6593 6594 static int 6595 bdev_nvme_async_poll(void *arg) 6596 { 6597 struct nvme_async_probe_ctx *ctx = arg; 6598 int rc; 6599 6600 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6601 if (spdk_unlikely(rc != -EAGAIN)) { 6602 ctx->probe_done = true; 6603 spdk_poller_unregister(&ctx->poller); 6604 if (!ctx->ctrlr_attached) { 6605 /* The probe is done, but no controller was attached. 6606 * That means we had a failure, so report -EIO back to 6607 * the caller (usually the RPC). populate_namespaces_cb() 6608 * will take care of freeing the nvme_async_probe_ctx. 6609 */ 6610 ctx->reported_bdevs = 0; 6611 populate_namespaces_cb(ctx, -EIO); 6612 } else if (ctx->namespaces_populated) { 6613 /* The namespaces for the attached controller were all 6614 * populated and the response was already sent to the 6615 * caller (usually the RPC). So free the context here. 
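			 * Otherwise the namespaces are still being populated; since probe_done is now
			 * true, populate_namespaces_cb() will free the context once population
			 * completes.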
			 */
			free_nvme_async_probe_ctx(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		      const char *base_name,
		      const char **names,
		      uint32_t count,
		      spdk_bdev_nvme_create_cb cb_fn,
		      void *cb_ctx,
		      struct spdk_nvme_ctrlr_opts *drv_opts,
		      struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	struct nvme_ctrlr *nvme_ctrlr;
	int len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
6683 */ 6684 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6685 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6686 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6687 return -EEXIST; 6688 } 6689 6690 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6691 6692 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6693 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6694 return -EINVAL; 6695 } 6696 6697 if (bdev_opts != NULL && 6698 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6699 bdev_opts->reconnect_delay_sec, 6700 bdev_opts->fast_io_fail_timeout_sec)) { 6701 return -EINVAL; 6702 } 6703 6704 ctx = calloc(1, sizeof(*ctx)); 6705 if (!ctx) { 6706 return -ENOMEM; 6707 } 6708 ctx->base_name = strdup(base_name); 6709 if (!ctx->base_name) { 6710 free(ctx); 6711 return -ENOMEM; 6712 } 6713 ctx->names = names; 6714 ctx->max_bdevs = count; 6715 ctx->cb_fn = cb_fn; 6716 ctx->cb_ctx = cb_ctx; 6717 ctx->trid = *trid; 6718 6719 if (bdev_opts) { 6720 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6721 } else { 6722 spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6723 } 6724 6725 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6726 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6727 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6728 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6729 free(entry); 6730 break; 6731 } 6732 } 6733 } 6734 6735 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6736 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6737 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6738 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6739 ctx->drv_opts.disable_read_ana_log_page = true; 6740 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6741 6742 if (spdk_interrupt_mode_is_enabled()) { 6743 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6744 ctx->drv_opts.enable_interrupts = true; 6745 } else { 6746 SPDK_ERRLOG("Interrupt mode is only supported with PCIe transport\n"); 6747 free_nvme_async_probe_ctx(ctx); 6748 return -ENOTSUP; 6749 } 6750 } 6751 6752 if (ctx->bdev_opts.psk != NULL) { 6753 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6754 if (ctx->drv_opts.tls_psk == NULL) { 6755 SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk); 6756 free_nvme_async_probe_ctx(ctx); 6757 return -ENOKEY; 6758 } 6759 } 6760 6761 if (ctx->bdev_opts.dhchap_key != NULL) { 6762 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6763 if (ctx->drv_opts.dhchap_key == NULL) { 6764 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6765 ctx->bdev_opts.dhchap_key); 6766 free_nvme_async_probe_ctx(ctx); 6767 return -ENOKEY; 6768 } 6769 6770 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6771 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6772 } 6773 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6774 ctx->drv_opts.dhchap_ctrlr_key = 6775 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6776 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6777 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6778 ctx->bdev_opts.dhchap_ctrlr_key); 6779 free_nvme_async_probe_ctx(ctx); 6780 return -ENOKEY; 6781 } 6782 } 6783 6784 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) { 6785 attach_cb = connect_attach_cb; 6786 } else { 6787 attach_cb = connect_set_failover_cb; 
6788 } 6789 6790 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6791 if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) { 6792 /* All controllers with the same name must be configured the same 6793 * way, either for multipath or failover. If the configuration doesn't 6794 * match - report error. 6795 */ 6796 free_nvme_async_probe_ctx(ctx); 6797 return -EINVAL; 6798 } 6799 6800 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6801 if (ctx->probe_ctx == NULL) { 6802 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6803 free_nvme_async_probe_ctx(ctx); 6804 return -ENODEV; 6805 } 6806 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6807 6808 return 0; 6809 } 6810 6811 struct bdev_nvme_delete_ctx { 6812 char *name; 6813 struct nvme_path_id path_id; 6814 bdev_nvme_delete_done_fn delete_done; 6815 void *delete_done_ctx; 6816 uint64_t timeout_ticks; 6817 struct spdk_poller *poller; 6818 }; 6819 6820 static void 6821 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6822 { 6823 if (ctx != NULL) { 6824 free(ctx->name); 6825 free(ctx); 6826 } 6827 } 6828 6829 static bool 6830 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6831 { 6832 if (path_id->trid.trtype != 0) { 6833 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6834 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6835 return false; 6836 } 6837 } else { 6838 if (path_id->trid.trtype != p->trid.trtype) { 6839 return false; 6840 } 6841 } 6842 } 6843 6844 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6845 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6846 return false; 6847 } 6848 } 6849 6850 if (path_id->trid.adrfam != 0) { 6851 if (path_id->trid.adrfam != p->trid.adrfam) { 6852 return false; 6853 } 6854 } 6855 6856 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6857 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6858 return false; 6859 } 6860 } 6861 6862 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6863 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6864 return false; 6865 } 6866 } 6867 6868 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6869 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6870 return false; 6871 } 6872 } 6873 6874 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6875 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6876 return false; 6877 } 6878 } 6879 6880 return true; 6881 } 6882 6883 static bool 6884 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6885 { 6886 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6887 struct nvme_ctrlr *ctrlr; 6888 struct nvme_path_id *p; 6889 6890 pthread_mutex_lock(&g_bdev_nvme_mutex); 6891 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6892 if (!nbdev_ctrlr) { 6893 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6894 return false; 6895 } 6896 6897 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6898 pthread_mutex_lock(&ctrlr->mutex); 6899 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6900 if (nvme_path_id_compare(p, path_id)) { 6901 pthread_mutex_unlock(&ctrlr->mutex); 6902 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6903 return true; 6904 } 6905 } 6906 pthread_mutex_unlock(&ctrlr->mutex); 6907 } 6908 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6909 6910 return 
false; 6911 } 6912 6913 static int 6914 bdev_nvme_delete_complete_poll(void *arg) 6915 { 6916 struct bdev_nvme_delete_ctx *ctx = arg; 6917 int rc = 0; 6918 6919 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6920 if (ctx->timeout_ticks > spdk_get_ticks()) { 6921 return SPDK_POLLER_BUSY; 6922 } 6923 6924 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6925 rc = -ETIMEDOUT; 6926 } 6927 6928 spdk_poller_unregister(&ctx->poller); 6929 6930 ctx->delete_done(ctx->delete_done_ctx, rc); 6931 free_bdev_nvme_delete_ctx(ctx); 6932 6933 return SPDK_POLLER_BUSY; 6934 } 6935 6936 static int 6937 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6938 { 6939 struct nvme_path_id *p, *t; 6940 spdk_msg_fn msg_fn; 6941 int rc = -ENXIO; 6942 6943 pthread_mutex_lock(&nvme_ctrlr->mutex); 6944 6945 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6946 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6947 break; 6948 } 6949 6950 if (!nvme_path_id_compare(p, path_id)) { 6951 continue; 6952 } 6953 6954 /* We are not using the specified path. */ 6955 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6956 free(p); 6957 rc = 0; 6958 } 6959 6960 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6961 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6962 return rc; 6963 } 6964 6965 /* If we made it here, then this path is a match! Now we need to remove it. */ 6966 6967 /* This is the active path in use right now. The active path is always the first in the list. */ 6968 assert(p == nvme_ctrlr->active_path_id); 6969 6970 if (!TAILQ_NEXT(p, link)) { 6971 /* The current path is the only path. */ 6972 msg_fn = _nvme_ctrlr_destruct; 6973 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6974 } else { 6975 /* There is an alternative path. */ 6976 msg_fn = _bdev_nvme_reset_ctrlr; 6977 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6978 } 6979 6980 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6981 6982 if (rc == 0) { 6983 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6984 } else if (rc == -EALREADY) { 6985 rc = 0; 6986 } 6987 6988 return rc; 6989 } 6990 6991 int 6992 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6993 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6994 { 6995 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6996 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6997 struct bdev_nvme_delete_ctx *ctx = NULL; 6998 int rc = -ENXIO, _rc; 6999 7000 if (name == NULL || path_id == NULL) { 7001 rc = -EINVAL; 7002 goto exit; 7003 } 7004 7005 pthread_mutex_lock(&g_bdev_nvme_mutex); 7006 7007 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 7008 if (nbdev_ctrlr == NULL) { 7009 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7010 7011 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 7012 rc = -ENODEV; 7013 goto exit; 7014 } 7015 7016 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 7017 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 7018 if (_rc < 0 && _rc != -ENXIO) { 7019 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7020 rc = _rc; 7021 goto exit; 7022 } else if (_rc == 0) { 7023 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 7024 * was deleted successfully. To remember the successful deletion, 7025 * overwrite rc only if _rc is zero. 
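			 * For example, if only the second of two ctrlrs holds the given path, the first
			 * iteration leaves rc at -ENXIO and the second sets it to 0, so the overall
			 * deletion is reported as successful.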
7026 */ 7027 rc = 0; 7028 } 7029 } 7030 7031 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7032 7033 if (rc != 0 || delete_done == NULL) { 7034 goto exit; 7035 } 7036 7037 ctx = calloc(1, sizeof(*ctx)); 7038 if (ctx == NULL) { 7039 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 7040 rc = -ENOMEM; 7041 goto exit; 7042 } 7043 7044 ctx->name = strdup(name); 7045 if (ctx->name == NULL) { 7046 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 7047 rc = -ENOMEM; 7048 goto exit; 7049 } 7050 7051 ctx->delete_done = delete_done; 7052 ctx->delete_done_ctx = delete_done_ctx; 7053 ctx->path_id = *path_id; 7054 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 7055 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 7056 if (ctx->poller == NULL) { 7057 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 7058 rc = -ENOMEM; 7059 goto exit; 7060 } 7061 7062 exit: 7063 if (rc != 0) { 7064 free_bdev_nvme_delete_ctx(ctx); 7065 } 7066 7067 return rc; 7068 } 7069 7070 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 7071 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 7072 7073 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 7074 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 7075 7076 struct discovery_entry_ctx { 7077 char name[128]; 7078 struct spdk_nvme_transport_id trid; 7079 struct spdk_nvme_ctrlr_opts drv_opts; 7080 struct spdk_nvmf_discovery_log_page_entry entry; 7081 TAILQ_ENTRY(discovery_entry_ctx) tailq; 7082 struct discovery_ctx *ctx; 7083 }; 7084 7085 struct discovery_ctx { 7086 char *name; 7087 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 7088 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 7089 void *cb_ctx; 7090 struct spdk_nvme_probe_ctx *probe_ctx; 7091 struct spdk_nvme_detach_ctx *detach_ctx; 7092 struct spdk_nvme_ctrlr *ctrlr; 7093 struct spdk_nvme_transport_id trid; 7094 struct discovery_entry_ctx *entry_ctx_in_use; 7095 struct spdk_poller *poller; 7096 struct spdk_nvme_ctrlr_opts drv_opts; 7097 struct spdk_bdev_nvme_ctrlr_opts bdev_opts; 7098 struct spdk_nvmf_discovery_log_page *log_page; 7099 TAILQ_ENTRY(discovery_ctx) tailq; 7100 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 7101 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 7102 int rc; 7103 bool wait_for_attach; 7104 uint64_t timeout_ticks; 7105 /* Denotes that the discovery service is being started. We're waiting 7106 * for the initial connection to the discovery controller to be 7107 * established and attach discovered NVM ctrlrs. 7108 */ 7109 bool initializing; 7110 /* Denotes if a discovery is currently in progress for this context. 7111 * That includes connecting to newly discovered subsystems. Used to 7112 * ensure we do not start a new discovery until an existing one is 7113 * complete. 7114 */ 7115 bool in_progress; 7116 7117 /* Denotes if another discovery is needed after the one in progress 7118 * completes. Set when we receive an AER completion while a discovery 7119 * is already in progress. 7120 */ 7121 bool pending; 7122 7123 /* Signal to the discovery context poller that it should stop the 7124 * discovery service, including detaching from the current discovery 7125 * controller. 7126 */ 7127 bool stop; 7128 7129 struct spdk_thread *calling_thread; 7130 uint32_t index; 7131 uint32_t attach_in_progress; 7132 char *hostnqn; 7133 7134 /* Denotes if the discovery service was started by the mdns discovery. 
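	 * The flag is copied from the from_mdns argument of bdev_nvme_start_discovery().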
7135 */ 7136 bool from_mdns_discovery_service; 7137 }; 7138 7139 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 7140 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 7141 7142 static void get_discovery_log_page(struct discovery_ctx *ctx); 7143 7144 static void 7145 free_discovery_ctx(struct discovery_ctx *ctx) 7146 { 7147 free(ctx->log_page); 7148 free(ctx->hostnqn); 7149 free(ctx->name); 7150 free(ctx); 7151 } 7152 7153 static void 7154 discovery_complete(struct discovery_ctx *ctx) 7155 { 7156 ctx->initializing = false; 7157 ctx->in_progress = false; 7158 if (ctx->pending) { 7159 ctx->pending = false; 7160 get_discovery_log_page(ctx); 7161 } 7162 } 7163 7164 static void 7165 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 7166 struct spdk_nvmf_discovery_log_page_entry *entry) 7167 { 7168 char *space; 7169 7170 trid->trtype = entry->trtype; 7171 trid->adrfam = entry->adrfam; 7172 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 7173 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 7174 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 7175 * before call to this function trid->subnqn is zeroed out, we need 7176 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 7177 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 7178 */ 7179 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 7180 7181 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 7182 * But the log page entries typically pad them with spaces, not zeroes. 7183 * So add a NULL terminator to each of these fields at the appropriate 7184 * location. 7185 */ 7186 space = strchr(trid->traddr, ' '); 7187 if (space) { 7188 *space = 0; 7189 } 7190 space = strchr(trid->trsvcid, ' '); 7191 if (space) { 7192 *space = 0; 7193 } 7194 space = strchr(trid->subnqn, ' '); 7195 if (space) { 7196 *space = 0; 7197 } 7198 } 7199 7200 static void 7201 _stop_discovery(void *_ctx) 7202 { 7203 struct discovery_ctx *ctx = _ctx; 7204 7205 if (ctx->attach_in_progress > 0) { 7206 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 7207 return; 7208 } 7209 7210 ctx->stop = true; 7211 7212 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 7213 struct discovery_entry_ctx *entry_ctx; 7214 struct nvme_path_id path = {}; 7215 7216 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 7217 path.trid = entry_ctx->trid; 7218 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7219 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7220 free(entry_ctx); 7221 } 7222 7223 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 7224 struct discovery_entry_ctx *entry_ctx; 7225 7226 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7227 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7228 free(entry_ctx); 7229 } 7230 7231 free(ctx->entry_ctx_in_use); 7232 ctx->entry_ctx_in_use = NULL; 7233 } 7234 7235 static void 7236 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7237 { 7238 ctx->stop_cb_fn = cb_fn; 7239 ctx->cb_ctx = cb_ctx; 7240 7241 if (ctx->attach_in_progress > 0) { 7242 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 7243 ctx->attach_in_progress); 7244 } 7245 7246 _stop_discovery(ctx); 7247 } 7248 7249 static void 7250 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 7251 { 7252 struct discovery_ctx *d_ctx; 7253 struct nvme_path_id *path_id; 7254 struct spdk_nvme_transport_id 
trid = {}; 7255 struct discovery_entry_ctx *entry_ctx, *tmp; 7256 7257 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 7258 7259 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7260 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 7261 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 7262 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 7263 continue; 7264 } 7265 7266 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 7267 free(entry_ctx); 7268 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 7269 trid.subnqn, trid.traddr, trid.trsvcid); 7270 7271 /* Fail discovery ctrlr to force reattach attempt */ 7272 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 7273 } 7274 } 7275 } 7276 7277 static void 7278 discovery_remove_controllers(struct discovery_ctx *ctx) 7279 { 7280 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 7281 struct discovery_entry_ctx *entry_ctx, *tmp; 7282 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7283 struct spdk_nvme_transport_id old_trid = {}; 7284 uint64_t numrec, i; 7285 bool found; 7286 7287 numrec = from_le64(&log_page->numrec); 7288 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 7289 found = false; 7290 old_entry = &entry_ctx->entry; 7291 build_trid_from_log_page_entry(&old_trid, old_entry); 7292 for (i = 0; i < numrec; i++) { 7293 new_entry = &log_page->entries[i]; 7294 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 7295 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 7296 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7297 found = true; 7298 break; 7299 } 7300 } 7301 if (!found) { 7302 struct nvme_path_id path = {}; 7303 7304 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 7305 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7306 7307 path.trid = entry_ctx->trid; 7308 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7309 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7310 free(entry_ctx); 7311 } 7312 } 7313 free(log_page); 7314 ctx->log_page = NULL; 7315 discovery_complete(ctx); 7316 } 7317 7318 static void 7319 complete_discovery_start(struct discovery_ctx *ctx, int status) 7320 { 7321 ctx->timeout_ticks = 0; 7322 ctx->rc = status; 7323 if (ctx->start_cb_fn) { 7324 ctx->start_cb_fn(ctx->cb_ctx, status); 7325 ctx->start_cb_fn = NULL; 7326 ctx->cb_ctx = NULL; 7327 } 7328 } 7329 7330 static void 7331 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 7332 { 7333 struct discovery_entry_ctx *entry_ctx = cb_ctx; 7334 struct discovery_ctx *ctx = entry_ctx->ctx; 7335 7336 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 7337 ctx->attach_in_progress--; 7338 if (ctx->attach_in_progress == 0) { 7339 complete_discovery_start(ctx, ctx->rc); 7340 if (ctx->initializing && ctx->rc != 0) { 7341 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 7342 stop_discovery(ctx, NULL, ctx->cb_ctx); 7343 } else { 7344 discovery_remove_controllers(ctx); 7345 } 7346 } 7347 } 7348 7349 static struct discovery_entry_ctx * 7350 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 7351 { 7352 struct discovery_entry_ctx *new_ctx; 7353 7354 new_ctx = calloc(1, sizeof(*new_ctx)); 7355 if (new_ctx == NULL) { 7356 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7357 return NULL; 7358 } 7359 7360 new_ctx->ctx = ctx; 7361 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 7362 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
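	/* Start from the driver defaults, then inherit the hostnqn from the discovery context so
	 * that every subsystem attached through this discovery service uses the same host identity.
	 */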
7363 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7364 return new_ctx; 7365 } 7366 7367 static void 7368 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 7369 struct spdk_nvmf_discovery_log_page *log_page) 7370 { 7371 struct discovery_ctx *ctx = cb_arg; 7372 struct discovery_entry_ctx *entry_ctx, *tmp; 7373 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7374 uint64_t numrec, i; 7375 bool found; 7376 7377 if (rc || spdk_nvme_cpl_is_error(cpl)) { 7378 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7379 return; 7380 } 7381 7382 ctx->log_page = log_page; 7383 assert(ctx->attach_in_progress == 0); 7384 numrec = from_le64(&log_page->numrec); 7385 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 7386 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7387 free(entry_ctx); 7388 } 7389 for (i = 0; i < numrec; i++) { 7390 found = false; 7391 new_entry = &log_page->entries[i]; 7392 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 7393 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 7394 struct discovery_entry_ctx *new_ctx; 7395 struct spdk_nvme_transport_id trid = {}; 7396 7397 build_trid_from_log_page_entry(&trid, new_entry); 7398 new_ctx = create_discovery_entry_ctx(ctx, &trid); 7399 if (new_ctx == NULL) { 7400 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7401 break; 7402 } 7403 7404 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 7405 continue; 7406 } 7407 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 7408 old_entry = &entry_ctx->entry; 7409 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 7410 found = true; 7411 break; 7412 } 7413 } 7414 if (!found) { 7415 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 7416 struct discovery_ctx *d_ctx; 7417 7418 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7419 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 7420 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 7421 sizeof(new_entry->subnqn))) { 7422 break; 7423 } 7424 } 7425 if (subnqn_ctx) { 7426 break; 7427 } 7428 } 7429 7430 new_ctx = calloc(1, sizeof(*new_ctx)); 7431 if (new_ctx == NULL) { 7432 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7433 break; 7434 } 7435 7436 new_ctx->ctx = ctx; 7437 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 7438 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 7439 if (subnqn_ctx) { 7440 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 7441 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 7442 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7443 new_ctx->name); 7444 } else { 7445 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 7446 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 7447 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7448 new_ctx->name); 7449 } 7450 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 7451 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7452 rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 7453 discovery_attach_controller_done, new_ctx, 7454 &new_ctx->drv_opts, &ctx->bdev_opts); 7455 if (rc == 0) { 7456 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 7457 ctx->attach_in_progress++; 7458 } else { 7459 DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 7460 } 7461 } 7462 } 7463 7464 if (ctx->attach_in_progress == 0) { 7465 discovery_remove_controllers(ctx); 7466 } 7467 } 7468 7469 static void 7470 get_discovery_log_page(struct discovery_ctx *ctx) 7471 { 7472 int rc; 7473 7474 assert(ctx->in_progress == false); 7475 ctx->in_progress = true; 7476 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 7477 if (rc != 0) { 7478 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7479 } 7480 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 7481 } 7482 7483 static void 7484 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 7485 { 7486 struct discovery_ctx *ctx = arg; 7487 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 7488 7489 if (spdk_nvme_cpl_is_error(cpl)) { 7490 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 7491 return; 7492 } 7493 7494 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 7495 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 7496 return; 7497 } 7498 7499 DISCOVERY_INFOLOG(ctx, "got aer\n"); 7500 if (ctx->in_progress) { 7501 ctx->pending = true; 7502 return; 7503 } 7504 7505 get_discovery_log_page(ctx); 7506 } 7507 7508 static void 7509 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 7510 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 7511 { 7512 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 7513 struct discovery_ctx *ctx; 7514 7515 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 7516 7517 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 7518 ctx->probe_ctx = NULL; 7519 ctx->ctrlr = ctrlr; 7520 7521 if (ctx->rc != 0) { 7522 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 7523 ctx->rc); 7524 return; 7525 } 7526 7527 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 7528 } 7529 7530 static int 7531 discovery_poller(void *arg) 7532 { 7533 struct discovery_ctx *ctx = arg; 7534 struct spdk_nvme_transport_id *trid; 7535 int rc; 7536 7537 if (ctx->detach_ctx) { 7538 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7539 if (rc != -EAGAIN) { 7540 ctx->detach_ctx = NULL; 7541 ctx->ctrlr = NULL; 7542 } 7543 } else if (ctx->stop) { 7544 if (ctx->ctrlr != NULL) { 7545 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7546 if (rc == 0) { 7547 return SPDK_POLLER_BUSY; 7548 } 7549 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7550 } 7551 spdk_poller_unregister(&ctx->poller); 7552 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7553 assert(ctx->start_cb_fn == NULL); 7554 if (ctx->stop_cb_fn != NULL) { 7555 ctx->stop_cb_fn(ctx->cb_ctx); 7556 } 7557 free_discovery_ctx(ctx); 7558 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7559 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7560 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7561 assert(ctx->initializing); 7562 spdk_poller_unregister(&ctx->poller); 7563 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7564 complete_discovery_start(ctx, -ETIMEDOUT); 7565 stop_discovery(ctx, NULL, NULL); 7566 free_discovery_ctx(ctx); 7567 return SPDK_POLLER_BUSY; 7568 } 7569 7570 assert(ctx->entry_ctx_in_use == NULL); 7571 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7572 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7573 trid = &ctx->entry_ctx_in_use->trid; 7574 7575 /* All controllers must be configured explicitely either for multipath or failover. 
7576 * While discovery use multipath mode, we need to set this in bdev options as well. 7577 */ 7578 ctx->bdev_opts.multipath = true; 7579 7580 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7581 if (ctx->probe_ctx) { 7582 spdk_poller_unregister(&ctx->poller); 7583 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7584 } else { 7585 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7586 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7587 ctx->entry_ctx_in_use = NULL; 7588 } 7589 } else if (ctx->probe_ctx) { 7590 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7591 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7592 complete_discovery_start(ctx, -ETIMEDOUT); 7593 return SPDK_POLLER_BUSY; 7594 } 7595 7596 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7597 if (rc != -EAGAIN) { 7598 if (ctx->rc != 0) { 7599 assert(ctx->initializing); 7600 stop_discovery(ctx, NULL, ctx->cb_ctx); 7601 } else { 7602 assert(rc == 0); 7603 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7604 ctx->rc = rc; 7605 get_discovery_log_page(ctx); 7606 } 7607 } 7608 } else { 7609 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7610 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7611 complete_discovery_start(ctx, -ETIMEDOUT); 7612 /* We need to wait until all NVM ctrlrs are attached before we stop the 7613 * discovery service to make sure we don't detach a ctrlr that is still 7614 * being attached. 7615 */ 7616 if (ctx->attach_in_progress == 0) { 7617 stop_discovery(ctx, NULL, ctx->cb_ctx); 7618 return SPDK_POLLER_BUSY; 7619 } 7620 } 7621 7622 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7623 if (rc < 0) { 7624 spdk_poller_unregister(&ctx->poller); 7625 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7626 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7627 ctx->entry_ctx_in_use = NULL; 7628 7629 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7630 if (rc != 0) { 7631 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7632 ctx->ctrlr = NULL; 7633 } 7634 } 7635 } 7636 7637 return SPDK_POLLER_BUSY; 7638 } 7639 7640 static void 7641 start_discovery_poller(void *arg) 7642 { 7643 struct discovery_ctx *ctx = arg; 7644 7645 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7646 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7647 } 7648 7649 int 7650 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7651 const char *base_name, 7652 struct spdk_nvme_ctrlr_opts *drv_opts, 7653 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 7654 uint64_t attach_timeout, 7655 bool from_mdns, 7656 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7657 { 7658 struct discovery_ctx *ctx; 7659 struct discovery_entry_ctx *discovery_entry_ctx; 7660 7661 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7662 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7663 if (strcmp(ctx->name, base_name) == 0) { 7664 return -EEXIST; 7665 } 7666 7667 if (ctx->entry_ctx_in_use != NULL) { 7668 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7669 return -EEXIST; 7670 } 7671 } 7672 7673 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7674 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7675 return -EEXIST; 7676 } 7677 } 7678 } 7679 7680 ctx = calloc(1, 
sizeof(*ctx)); 7681 if (ctx == NULL) { 7682 return -ENOMEM; 7683 } 7684 7685 ctx->name = strdup(base_name); 7686 if (ctx->name == NULL) { 7687 free_discovery_ctx(ctx); 7688 return -ENOMEM; 7689 } 7690 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 7691 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7692 ctx->from_mdns_discovery_service = from_mdns; 7693 ctx->bdev_opts.from_discovery_service = true; 7694 ctx->calling_thread = spdk_get_thread(); 7695 ctx->start_cb_fn = cb_fn; 7696 ctx->cb_ctx = cb_ctx; 7697 ctx->initializing = true; 7698 if (ctx->start_cb_fn) { 7699 /* We can use this when dumping json to denote if this RPC parameter 7700 * was specified or not. 7701 */ 7702 ctx->wait_for_attach = true; 7703 } 7704 if (attach_timeout != 0) { 7705 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7706 spdk_get_ticks_hz() / 1000ull; 7707 } 7708 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7709 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7710 memcpy(&ctx->trid, trid, sizeof(*trid)); 7711 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7712 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7713 if (ctx->hostnqn == NULL) { 7714 free_discovery_ctx(ctx); 7715 return -ENOMEM; 7716 } 7717 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7718 if (discovery_entry_ctx == NULL) { 7719 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7720 free_discovery_ctx(ctx); 7721 return -ENOMEM; 7722 } 7723 7724 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7725 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7726 return 0; 7727 } 7728 7729 int 7730 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7731 { 7732 struct discovery_ctx *ctx; 7733 7734 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7735 if (strcmp(name, ctx->name) == 0) { 7736 if (ctx->stop) { 7737 return -EALREADY; 7738 } 7739 /* If we're still starting the discovery service and ->rc is non-zero, we're 7740 * going to stop it as soon as we can 7741 */ 7742 if (ctx->initializing && ctx->rc != 0) { 7743 return -EALREADY; 7744 } 7745 stop_discovery(ctx, cb_fn, cb_ctx); 7746 return 0; 7747 } 7748 } 7749 7750 return -ENOENT; 7751 } 7752 7753 static int 7754 bdev_nvme_library_init(void) 7755 { 7756 g_bdev_nvme_init_thread = spdk_get_thread(); 7757 7758 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7759 bdev_nvme_destroy_poll_group_cb, 7760 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7761 7762 return 0; 7763 } 7764 7765 static void 7766 bdev_nvme_fini_destruct_ctrlrs(void) 7767 { 7768 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7769 struct nvme_ctrlr *nvme_ctrlr; 7770 7771 pthread_mutex_lock(&g_bdev_nvme_mutex); 7772 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7773 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7774 pthread_mutex_lock(&nvme_ctrlr->mutex); 7775 if (nvme_ctrlr->destruct) { 7776 /* This controller's destruction was already started 7777 * before the application started shutting down 7778 */ 7779 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7780 continue; 7781 } 7782 nvme_ctrlr->destruct = true; 7783 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7784 7785 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7786 nvme_ctrlr); 7787 } 7788 } 7789 7790 g_bdev_nvme_module_finish = true; 7791 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7792 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7793 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7794 
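		/* Nothing is left to destruct, so module finish can complete right away.
		 * Otherwise this is expected to happen later, when the last nvme_ctrlr is
		 * destructed and observes g_bdev_nvme_module_finish.
		 */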
spdk_bdev_module_fini_done(); 7795 return; 7796 } 7797 7798 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7799 } 7800 7801 static void 7802 check_discovery_fini(void *arg) 7803 { 7804 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7805 bdev_nvme_fini_destruct_ctrlrs(); 7806 } 7807 } 7808 7809 static void 7810 bdev_nvme_library_fini(void) 7811 { 7812 struct nvme_probe_skip_entry *entry, *entry_tmp; 7813 struct discovery_ctx *ctx; 7814 7815 spdk_poller_unregister(&g_hotplug_poller); 7816 free(g_hotplug_probe_ctx); 7817 g_hotplug_probe_ctx = NULL; 7818 7819 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7820 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7821 free(entry); 7822 } 7823 7824 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7825 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7826 bdev_nvme_fini_destruct_ctrlrs(); 7827 } else { 7828 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7829 stop_discovery(ctx, check_discovery_fini, NULL); 7830 } 7831 } 7832 } 7833 7834 static void 7835 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7836 { 7837 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7838 struct spdk_bdev *bdev = bdev_io->bdev; 7839 struct spdk_dif_ctx dif_ctx; 7840 struct spdk_dif_error err_blk = {}; 7841 int rc; 7842 struct spdk_dif_ctx_init_ext_opts dif_opts; 7843 7844 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7845 dif_opts.dif_pi_format = bdev->dif_pi_format; 7846 rc = spdk_dif_ctx_init(&dif_ctx, 7847 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7848 bdev->dif_is_head_of_md, bdev->dif_type, 7849 bdev_io->u.bdev.dif_check_flags, 7850 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7851 if (rc != 0) { 7852 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7853 return; 7854 } 7855 7856 if (bdev->md_interleave) { 7857 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7858 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7859 } else { 7860 struct iovec md_iov = { 7861 .iov_base = bdev_io->u.bdev.md_buf, 7862 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7863 }; 7864 7865 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7866 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7867 } 7868 7869 if (rc != 0) { 7870 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7871 err_blk.err_type, err_blk.err_offset); 7872 } else { 7873 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7874 } 7875 } 7876 7877 static void 7878 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7879 { 7880 struct nvme_bdev_io *bio = ref; 7881 7882 if (spdk_nvme_cpl_is_success(cpl)) { 7883 /* Run PI verification for read data buffer. */ 7884 bdev_nvme_verify_pi_error(bio); 7885 } 7886 7887 /* Return original completion status */ 7888 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7889 } 7890 7891 static void 7892 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7893 { 7894 struct nvme_bdev_io *bio = ref; 7895 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7896 int ret; 7897 7898 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7899 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7900 cpl->status.sct, cpl->status.sc); 7901 7902 /* Save completion status to use after verifying PI error. */ 7903 bio->cpl = *cpl; 7904 7905 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7906 /* Read without PI checking to verify PI error. 
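			 * Re-reading the same LBAs with PI checks disabled lets
			 * bdev_nvme_verify_pi_error() locate the failing block; the original
			 * completion status saved in bio->cpl is returned to the bdev layer.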
*/ 7907 ret = bdev_nvme_no_pi_readv(bio, 7908 bdev_io->u.bdev.iovs, 7909 bdev_io->u.bdev.iovcnt, 7910 bdev_io->u.bdev.md_buf, 7911 bdev_io->u.bdev.num_blocks, 7912 bdev_io->u.bdev.offset_blocks); 7913 if (ret == 0) { 7914 return; 7915 } 7916 } 7917 } 7918 7919 bdev_nvme_io_complete_nvme_status(bio, cpl); 7920 } 7921 7922 static void 7923 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7924 { 7925 struct nvme_bdev_io *bio = ref; 7926 7927 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7928 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7929 cpl->status.sct, cpl->status.sc); 7930 /* Run PI verification for write data buffer if PI error is detected. */ 7931 bdev_nvme_verify_pi_error(bio); 7932 } 7933 7934 bdev_nvme_io_complete_nvme_status(bio, cpl); 7935 } 7936 7937 static void 7938 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7939 { 7940 struct nvme_bdev_io *bio = ref; 7941 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7942 7943 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7944 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7945 */ 7946 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7947 7948 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7949 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7950 cpl->status.sct, cpl->status.sc); 7951 /* Run PI verification for zone append data buffer if PI error is detected. */ 7952 bdev_nvme_verify_pi_error(bio); 7953 } 7954 7955 bdev_nvme_io_complete_nvme_status(bio, cpl); 7956 } 7957 7958 static void 7959 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7960 { 7961 struct nvme_bdev_io *bio = ref; 7962 7963 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7964 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7965 cpl->status.sct, cpl->status.sc); 7966 /* Run PI verification for compare data buffer if PI error is detected. */ 7967 bdev_nvme_verify_pi_error(bio); 7968 } 7969 7970 bdev_nvme_io_complete_nvme_status(bio, cpl); 7971 } 7972 7973 static void 7974 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7975 { 7976 struct nvme_bdev_io *bio = ref; 7977 7978 /* Compare operation completion */ 7979 if (!bio->first_fused_completed) { 7980 /* Save compare result for write callback */ 7981 bio->cpl = *cpl; 7982 bio->first_fused_completed = true; 7983 return; 7984 } 7985 7986 /* Write operation completion */ 7987 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7988 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7989 * complete the IO with the compare operation's status. 
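		 * The fused write is expected to be aborted when its compare fails, so a
		 * successful write completion here is unexpected and only logged.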
7990 */ 7991 if (!spdk_nvme_cpl_is_error(cpl)) { 7992 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7993 } 7994 7995 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7996 } else { 7997 bdev_nvme_io_complete_nvme_status(bio, cpl); 7998 } 7999 } 8000 8001 static void 8002 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 8003 { 8004 struct nvme_bdev_io *bio = ref; 8005 8006 bdev_nvme_io_complete_nvme_status(bio, cpl); 8007 } 8008 8009 static int 8010 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 8011 { 8012 switch (desc->zt) { 8013 case SPDK_NVME_ZONE_TYPE_SEQWR: 8014 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 8015 break; 8016 default: 8017 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 8018 return -EIO; 8019 } 8020 8021 switch (desc->zs) { 8022 case SPDK_NVME_ZONE_STATE_EMPTY: 8023 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 8024 break; 8025 case SPDK_NVME_ZONE_STATE_IOPEN: 8026 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 8027 break; 8028 case SPDK_NVME_ZONE_STATE_EOPEN: 8029 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 8030 break; 8031 case SPDK_NVME_ZONE_STATE_CLOSED: 8032 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 8033 break; 8034 case SPDK_NVME_ZONE_STATE_RONLY: 8035 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 8036 break; 8037 case SPDK_NVME_ZONE_STATE_FULL: 8038 info->state = SPDK_BDEV_ZONE_STATE_FULL; 8039 break; 8040 case SPDK_NVME_ZONE_STATE_OFFLINE: 8041 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 8042 break; 8043 default: 8044 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 8045 return -EIO; 8046 } 8047 8048 info->zone_id = desc->zslba; 8049 info->write_pointer = desc->wp; 8050 info->capacity = desc->zcap; 8051 8052 return 0; 8053 } 8054 8055 static void 8056 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 8057 { 8058 struct nvme_bdev_io *bio = ref; 8059 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8060 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 8061 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 8062 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 8063 uint64_t max_zones_per_buf, i; 8064 uint32_t zone_report_bufsize; 8065 struct spdk_nvme_ns *ns; 8066 struct spdk_nvme_qpair *qpair; 8067 int ret; 8068 8069 if (spdk_nvme_cpl_is_error(cpl)) { 8070 goto out_complete_io_nvme_cpl; 8071 } 8072 8073 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 8074 ret = -ENXIO; 8075 goto out_complete_io_ret; 8076 } 8077 8078 ns = bio->io_path->nvme_ns->ns; 8079 qpair = bio->io_path->qpair->qpair; 8080 8081 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8082 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 8083 sizeof(bio->zone_report_buf->descs[0]); 8084 8085 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 8086 ret = -EINVAL; 8087 goto out_complete_io_ret; 8088 } 8089 8090 if (!bio->zone_report_buf->nr_zones) { 8091 ret = -EINVAL; 8092 goto out_complete_io_ret; 8093 } 8094 8095 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 8096 ret = fill_zone_from_report(&info[bio->handled_zones], 8097 &bio->zone_report_buf->descs[i]); 8098 if (ret) { 8099 goto out_complete_io_ret; 8100 } 8101 bio->handled_zones++; 8102 } 8103 8104 if (bio->handled_zones < zones_to_copy) { 8105 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8106 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 8107 
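		/* Only part of the requested range fit into this report. Reuse the report
		 * buffer and issue another Report Zones command starting at the next
		 * unhandled zone; this callback then continues filling the remaining entries.
		 */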
8108 memset(bio->zone_report_buf, 0, zone_report_bufsize); 8109 ret = spdk_nvme_zns_report_zones(ns, qpair, 8110 bio->zone_report_buf, zone_report_bufsize, 8111 slba, SPDK_NVME_ZRA_LIST_ALL, true, 8112 bdev_nvme_get_zone_info_done, bio); 8113 if (!ret) { 8114 return; 8115 } else { 8116 goto out_complete_io_ret; 8117 } 8118 } 8119 8120 out_complete_io_nvme_cpl: 8121 free(bio->zone_report_buf); 8122 bio->zone_report_buf = NULL; 8123 bdev_nvme_io_complete_nvme_status(bio, cpl); 8124 return; 8125 8126 out_complete_io_ret: 8127 free(bio->zone_report_buf); 8128 bio->zone_report_buf = NULL; 8129 bdev_nvme_io_complete(bio, ret); 8130 } 8131 8132 static void 8133 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 8134 { 8135 struct nvme_bdev_io *bio = ref; 8136 8137 bdev_nvme_io_complete_nvme_status(bio, cpl); 8138 } 8139 8140 static void 8141 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 8142 { 8143 struct nvme_bdev_io *bio = ctx; 8144 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8145 const struct spdk_nvme_cpl *cpl = &bio->cpl; 8146 8147 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 8148 8149 __bdev_nvme_io_complete(bdev_io, 0, cpl); 8150 } 8151 8152 static void 8153 bdev_nvme_abort_complete(void *ctx) 8154 { 8155 struct nvme_bdev_io *bio = ctx; 8156 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8157 8158 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 8159 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 8160 } else { 8161 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 8162 } 8163 } 8164 8165 static void 8166 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 8167 { 8168 struct nvme_bdev_io *bio = ref; 8169 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8170 8171 bio->cpl = *cpl; 8172 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 8173 } 8174 8175 static void 8176 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 8177 { 8178 struct nvme_bdev_io *bio = ref; 8179 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8180 8181 bio->cpl = *cpl; 8182 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8183 bdev_nvme_admin_passthru_complete_nvme_status, bio); 8184 } 8185 8186 static void 8187 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 8188 { 8189 struct nvme_bdev_io *bio = ref; 8190 struct iovec *iov; 8191 8192 bio->iov_offset = sgl_offset; 8193 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 8194 iov = &bio->iovs[bio->iovpos]; 8195 if (bio->iov_offset < iov->iov_len) { 8196 break; 8197 } 8198 8199 bio->iov_offset -= iov->iov_len; 8200 } 8201 } 8202 8203 static int 8204 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 8205 { 8206 struct nvme_bdev_io *bio = ref; 8207 struct iovec *iov; 8208 8209 assert(bio->iovpos < bio->iovcnt); 8210 8211 iov = &bio->iovs[bio->iovpos]; 8212 8213 *address = iov->iov_base; 8214 *length = iov->iov_len; 8215 8216 if (bio->iov_offset) { 8217 assert(bio->iov_offset <= iov->iov_len); 8218 *address += bio->iov_offset; 8219 *length -= bio->iov_offset; 8220 } 8221 8222 bio->iov_offset += *length; 8223 if (bio->iov_offset == iov->iov_len) { 8224 bio->iovpos++; 8225 bio->iov_offset = 0; 8226 } 8227 8228 return 0; 8229 } 8230 8231 static void 8232 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 8233 { 8234 struct nvme_bdev_io *bio = ref; 8235 struct iovec *iov; 8236 8237 bio->fused_iov_offset = sgl_offset; 
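	/* Walk the fused (write) iovec array until the iovec that contains sgl_offset
	 * is found, leaving fused_iovpos/fused_iov_offset at the restart position.
	 */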
8238 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 8239 iov = &bio->fused_iovs[bio->fused_iovpos]; 8240 if (bio->fused_iov_offset < iov->iov_len) { 8241 break; 8242 } 8243 8244 bio->fused_iov_offset -= iov->iov_len; 8245 } 8246 } 8247 8248 static int 8249 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 8250 { 8251 struct nvme_bdev_io *bio = ref; 8252 struct iovec *iov; 8253 8254 assert(bio->fused_iovpos < bio->fused_iovcnt); 8255 8256 iov = &bio->fused_iovs[bio->fused_iovpos]; 8257 8258 *address = iov->iov_base; 8259 *length = iov->iov_len; 8260 8261 if (bio->fused_iov_offset) { 8262 assert(bio->fused_iov_offset <= iov->iov_len); 8263 *address += bio->fused_iov_offset; 8264 *length -= bio->fused_iov_offset; 8265 } 8266 8267 bio->fused_iov_offset += *length; 8268 if (bio->fused_iov_offset == iov->iov_len) { 8269 bio->fused_iovpos++; 8270 bio->fused_iov_offset = 0; 8271 } 8272 8273 return 0; 8274 } 8275 8276 static int 8277 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8278 void *md, uint64_t lba_count, uint64_t lba) 8279 { 8280 int rc; 8281 8282 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 8283 lba_count, lba); 8284 8285 bio->iovs = iov; 8286 bio->iovcnt = iovcnt; 8287 bio->iovpos = 0; 8288 bio->iov_offset = 0; 8289 8290 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 8291 bio->io_path->qpair->qpair, 8292 lba, lba_count, 8293 bdev_nvme_no_pi_readv_done, bio, 0, 8294 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8295 md, 0, 0); 8296 8297 if (rc != 0 && rc != -ENOMEM) { 8298 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 8299 } 8300 return rc; 8301 } 8302 8303 static int 8304 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8305 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8306 struct spdk_memory_domain *domain, void *domain_ctx, 8307 struct spdk_accel_sequence *seq) 8308 { 8309 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8310 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8311 int rc; 8312 8313 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8314 lba_count, lba); 8315 8316 bio->iovs = iov; 8317 bio->iovcnt = iovcnt; 8318 bio->iovpos = 0; 8319 bio->iov_offset = 0; 8320 8321 if (domain != NULL || seq != NULL) { 8322 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8323 bio->ext_opts.memory_domain = domain; 8324 bio->ext_opts.memory_domain_ctx = domain_ctx; 8325 bio->ext_opts.io_flags = flags; 8326 bio->ext_opts.metadata = md; 8327 bio->ext_opts.accel_sequence = seq; 8328 8329 if (iovcnt == 1) { 8330 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 8331 bio, &bio->ext_opts); 8332 } else { 8333 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 8334 bdev_nvme_readv_done, bio, 8335 bdev_nvme_queued_reset_sgl, 8336 bdev_nvme_queued_next_sge, 8337 &bio->ext_opts); 8338 } 8339 } else if (iovcnt == 1) { 8340 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 8341 md, lba, lba_count, bdev_nvme_readv_done, 8342 bio, flags, 0, 0); 8343 } else { 8344 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 8345 bdev_nvme_readv_done, bio, flags, 8346 bdev_nvme_queued_reset_sgl, 8347 bdev_nvme_queued_next_sge, md, 0, 0); 8348 } 8349 8350 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8351 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 8352 } 8353 
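	/* -ENOMEM is intentionally not logged; it typically means no free requests were
	 * available on the qpair, and the caller is expected to retry the I/O as NOMEM.
	 */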
return rc; 8354 } 8355 8356 static int 8357 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8358 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8359 struct spdk_memory_domain *domain, void *domain_ctx, 8360 struct spdk_accel_sequence *seq, 8361 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 8362 { 8363 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8364 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8365 int rc; 8366 8367 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8368 lba_count, lba); 8369 8370 bio->iovs = iov; 8371 bio->iovcnt = iovcnt; 8372 bio->iovpos = 0; 8373 bio->iov_offset = 0; 8374 8375 if (domain != NULL || seq != NULL) { 8376 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8377 bio->ext_opts.memory_domain = domain; 8378 bio->ext_opts.memory_domain_ctx = domain_ctx; 8379 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 8380 bio->ext_opts.cdw13 = cdw13.raw; 8381 bio->ext_opts.metadata = md; 8382 bio->ext_opts.accel_sequence = seq; 8383 8384 if (iovcnt == 1) { 8385 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 8386 bio, &bio->ext_opts); 8387 } else { 8388 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 8389 bdev_nvme_writev_done, bio, 8390 bdev_nvme_queued_reset_sgl, 8391 bdev_nvme_queued_next_sge, 8392 &bio->ext_opts); 8393 } 8394 } else if (iovcnt == 1) { 8395 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 8396 md, lba, lba_count, bdev_nvme_writev_done, 8397 bio, flags, 0, 0); 8398 } else { 8399 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8400 bdev_nvme_writev_done, bio, flags, 8401 bdev_nvme_queued_reset_sgl, 8402 bdev_nvme_queued_next_sge, md, 0, 0); 8403 } 8404 8405 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8406 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 8407 } 8408 return rc; 8409 } 8410 8411 static int 8412 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8413 void *md, uint64_t lba_count, uint64_t zslba, 8414 uint32_t flags) 8415 { 8416 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8417 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8418 int rc; 8419 8420 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 8421 lba_count, zslba); 8422 8423 bio->iovs = iov; 8424 bio->iovcnt = iovcnt; 8425 bio->iovpos = 0; 8426 bio->iov_offset = 0; 8427 8428 if (iovcnt == 1) { 8429 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 8430 lba_count, 8431 bdev_nvme_zone_appendv_done, bio, 8432 flags, 8433 0, 0); 8434 } else { 8435 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 8436 bdev_nvme_zone_appendv_done, bio, flags, 8437 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8438 md, 0, 0); 8439 } 8440 8441 if (rc != 0 && rc != -ENOMEM) { 8442 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 8443 } 8444 return rc; 8445 } 8446 8447 static int 8448 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8449 void *md, uint64_t lba_count, uint64_t lba, 8450 uint32_t flags) 8451 { 8452 int rc; 8453 8454 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8455 lba_count, lba); 8456 8457 bio->iovs = iov; 8458 bio->iovcnt = iovcnt; 8459 bio->iovpos = 0; 8460 bio->iov_offset = 0; 8461 8462 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 8463 bio->io_path->qpair->qpair, 8464 lba, lba_count, 8465 bdev_nvme_comparev_done, bio, flags, 8466 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8467 md, 0, 0); 8468 8469 if (rc != 0 && rc != -ENOMEM) { 8470 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 8471 } 8472 return rc; 8473 } 8474 8475 static int 8476 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 8477 struct iovec *write_iov, int write_iovcnt, 8478 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 8479 { 8480 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8481 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8482 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8483 int rc; 8484 8485 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8486 lba_count, lba); 8487 8488 bio->iovs = cmp_iov; 8489 bio->iovcnt = cmp_iovcnt; 8490 bio->iovpos = 0; 8491 bio->iov_offset = 0; 8492 bio->fused_iovs = write_iov; 8493 bio->fused_iovcnt = write_iovcnt; 8494 bio->fused_iovpos = 0; 8495 bio->fused_iov_offset = 0; 8496 8497 if (bdev_io->num_retries == 0) { 8498 bio->first_fused_submitted = false; 8499 bio->first_fused_completed = false; 8500 } 8501 8502 if (!bio->first_fused_submitted) { 8503 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8504 memset(&bio->cpl, 0, sizeof(bio->cpl)); 8505 8506 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 8507 bdev_nvme_comparev_and_writev_done, bio, flags, 8508 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 8509 if (rc == 0) { 8510 bio->first_fused_submitted = true; 8511 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8512 } else { 8513 if (rc != -ENOMEM) { 8514 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 8515 } 8516 return rc; 8517 } 8518 } 8519 8520 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 8521 8522 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8523 bdev_nvme_comparev_and_writev_done, bio, flags, 8524 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 8525 if (rc != 0 && rc != -ENOMEM) { 8526 SPDK_ERRLOG("write failed: rc = %d\n", rc); 8527 rc = 0; 8528 } 8529 8530 return rc; 8531 } 8532 8533 static int 8534 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8535 { 8536 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 8537 struct spdk_nvme_dsm_range *range; 8538 uint64_t offset, remaining; 8539 uint64_t num_ranges_u64; 8540 uint16_t num_ranges; 8541 int rc; 8542 8543 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8544 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8545 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8546 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8547 return -EINVAL; 8548 } 8549 num_ranges = (uint16_t)num_ranges_u64; 8550 8551 offset = offset_blocks; 8552 remaining = num_blocks; 8553 range = &dsm_ranges[0]; 8554 8555 /* Fill max-size ranges until the remaining blocks fit into one range */ 8556 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8557 range->attributes.raw = 0; 8558 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8559 range->starting_lba = offset; 8560 8561 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8562 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8563 range++; 8564 } 8565 8566 /* Final range describes the remaining 
blocks */ 8567 range->attributes.raw = 0; 8568 range->length = remaining; 8569 range->starting_lba = offset; 8570 8571 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8572 bio->io_path->qpair->qpair, 8573 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8574 dsm_ranges, num_ranges, 8575 bdev_nvme_queued_done, bio); 8576 8577 return rc; 8578 } 8579 8580 static int 8581 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8582 { 8583 if (num_blocks > UINT16_MAX + 1) { 8584 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8585 return -EINVAL; 8586 } 8587 8588 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8589 bio->io_path->qpair->qpair, 8590 offset_blocks, num_blocks, 8591 bdev_nvme_queued_done, bio, 8592 0); 8593 } 8594 8595 static int 8596 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8597 struct spdk_bdev_zone_info *info) 8598 { 8599 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8600 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8601 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8602 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8603 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8604 8605 if (zone_id % zone_size != 0) { 8606 return -EINVAL; 8607 } 8608 8609 if (num_zones > total_zones || !num_zones) { 8610 return -EINVAL; 8611 } 8612 8613 assert(!bio->zone_report_buf); 8614 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8615 if (!bio->zone_report_buf) { 8616 return -ENOMEM; 8617 } 8618 8619 bio->handled_zones = 0; 8620 8621 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8622 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8623 bdev_nvme_get_zone_info_done, bio); 8624 } 8625 8626 static int 8627 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8628 enum spdk_bdev_zone_action action) 8629 { 8630 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8631 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8632 8633 switch (action) { 8634 case SPDK_BDEV_ZONE_CLOSE: 8635 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8636 bdev_nvme_zone_management_done, bio); 8637 case SPDK_BDEV_ZONE_FINISH: 8638 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8639 bdev_nvme_zone_management_done, bio); 8640 case SPDK_BDEV_ZONE_OPEN: 8641 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8642 bdev_nvme_zone_management_done, bio); 8643 case SPDK_BDEV_ZONE_RESET: 8644 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8645 bdev_nvme_zone_management_done, bio); 8646 case SPDK_BDEV_ZONE_OFFLINE: 8647 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8648 bdev_nvme_zone_management_done, bio); 8649 default: 8650 return -EINVAL; 8651 } 8652 } 8653 8654 static void 8655 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8656 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8657 { 8658 struct nvme_io_path *io_path; 8659 struct nvme_ctrlr *nvme_ctrlr; 8660 uint32_t max_xfer_size; 8661 int rc = -ENXIO; 8662 8663 /* Choose the first ctrlr which is not failed. */ 8664 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8665 nvme_ctrlr = io_path->qpair->ctrlr; 8666 8667 /* We should skip any unavailable nvme_ctrlr rather than checking 8668 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
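		 * If every controller is skipped or submission fails, rc (initialized to
		 * -ENXIO) is used to fail the admin request below.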
8669 */ 8670 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8671 continue; 8672 } 8673 8674 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8675 8676 if (nbytes > max_xfer_size) { 8677 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8678 rc = -EINVAL; 8679 goto err; 8680 } 8681 8682 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8683 bdev_nvme_admin_passthru_done, bio); 8684 if (rc == 0) { 8685 return; 8686 } 8687 } 8688 8689 err: 8690 bdev_nvme_admin_complete(bio, rc); 8691 } 8692 8693 static int 8694 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8695 void *buf, size_t nbytes) 8696 { 8697 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8698 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8699 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8700 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8701 8702 if (nbytes > max_xfer_size) { 8703 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8704 return -EINVAL; 8705 } 8706 8707 /* 8708 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8709 * so fill it out automatically. 8710 */ 8711 cmd->nsid = spdk_nvme_ns_get_id(ns); 8712 8713 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8714 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8715 } 8716 8717 static int 8718 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8719 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8720 { 8721 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8722 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8723 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8724 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8725 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8726 8727 if (nbytes > max_xfer_size) { 8728 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8729 return -EINVAL; 8730 } 8731 8732 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8733 SPDK_ERRLOG("invalid meta data buffer size\n"); 8734 return -EINVAL; 8735 } 8736 8737 /* 8738 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8739 * so fill it out automatically. 
8740 */ 8741 cmd->nsid = spdk_nvme_ns_get_id(ns); 8742 8743 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8744 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8745 } 8746 8747 static int 8748 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8749 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8750 size_t nbytes, void *md_buf, size_t md_len) 8751 { 8752 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8753 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8754 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8755 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8756 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8757 8758 bio->iovs = iov; 8759 bio->iovcnt = iovcnt; 8760 bio->iovpos = 0; 8761 bio->iov_offset = 0; 8762 8763 if (nbytes > max_xfer_size) { 8764 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8765 return -EINVAL; 8766 } 8767 8768 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8769 SPDK_ERRLOG("invalid meta data buffer size\n"); 8770 return -EINVAL; 8771 } 8772 8773 /* 8774 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8775 * require a nsid, so fill it out automatically. 8776 */ 8777 cmd->nsid = spdk_nvme_ns_get_id(ns); 8778 8779 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8780 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8781 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8782 } 8783 8784 static void 8785 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8786 struct nvme_bdev_io *bio_to_abort) 8787 { 8788 struct nvme_io_path *io_path; 8789 int rc = 0; 8790 8791 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8792 if (rc == 0) { 8793 bdev_nvme_admin_complete(bio, 0); 8794 return; 8795 } 8796 8797 io_path = bio_to_abort->io_path; 8798 if (io_path != NULL) { 8799 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8800 io_path->qpair->qpair, 8801 bio_to_abort, 8802 bdev_nvme_abort_done, bio); 8803 } else { 8804 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8805 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8806 NULL, 8807 bio_to_abort, 8808 bdev_nvme_abort_done, bio); 8809 8810 if (rc != -ENOENT) { 8811 break; 8812 } 8813 } 8814 } 8815 8816 if (rc != 0) { 8817 /* If no command was found or there was any error, complete the abort 8818 * request with failure. 
8819 */ 8820 bdev_nvme_admin_complete(bio, rc); 8821 } 8822 } 8823 8824 static int 8825 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8826 uint64_t num_blocks) 8827 { 8828 struct spdk_nvme_scc_source_range range = { 8829 .slba = src_offset_blocks, 8830 .nlb = num_blocks - 1 8831 }; 8832 8833 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8834 bio->io_path->qpair->qpair, 8835 &range, 1, dst_offset_blocks, 8836 bdev_nvme_queued_done, bio); 8837 } 8838 8839 static void 8840 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8841 { 8842 const char *action; 8843 uint32_t i; 8844 8845 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8846 action = "reset"; 8847 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8848 action = "abort"; 8849 } else { 8850 action = "none"; 8851 } 8852 8853 spdk_json_write_object_begin(w); 8854 8855 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8856 8857 spdk_json_write_named_object_begin(w, "params"); 8858 spdk_json_write_named_string(w, "action_on_timeout", action); 8859 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8860 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8861 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8862 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8863 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8864 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8865 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8866 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8867 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8868 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8869 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8870 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8871 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8872 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8873 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8874 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8875 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8876 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8877 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8878 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8879 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8880 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8881 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8882 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8883 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8884 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8885 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8886 for (i = 0; i < 32; ++i) { 8887 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8888 
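			/* This bit is enabled; emit the corresponding digest name. */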
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8889 } 8890 } 8891 spdk_json_write_array_end(w); 8892 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8893 for (i = 0; i < 32; ++i) { 8894 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8895 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8896 } 8897 } 8898 8899 spdk_json_write_array_end(w); 8900 spdk_json_write_object_end(w); 8901 8902 spdk_json_write_object_end(w); 8903 } 8904 8905 static void 8906 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8907 { 8908 struct spdk_nvme_transport_id trid; 8909 8910 spdk_json_write_object_begin(w); 8911 8912 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8913 8914 spdk_json_write_named_object_begin(w, "params"); 8915 spdk_json_write_named_string(w, "name", ctx->name); 8916 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8917 8918 trid = ctx->trid; 8919 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8920 nvme_bdev_dump_trid_json(&trid, w); 8921 8922 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8923 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8924 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8925 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8926 ctx->bdev_opts.fast_io_fail_timeout_sec); 8927 spdk_json_write_object_end(w); 8928 8929 spdk_json_write_object_end(w); 8930 } 8931 8932 #ifdef SPDK_CONFIG_NVME_CUSE 8933 static void 8934 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8935 struct nvme_ctrlr *nvme_ctrlr) 8936 { 8937 size_t cuse_name_size = 128; 8938 char cuse_name[cuse_name_size]; 8939 8940 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8941 cuse_name, &cuse_name_size) != 0) { 8942 return; 8943 } 8944 8945 spdk_json_write_object_begin(w); 8946 8947 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8948 8949 spdk_json_write_named_object_begin(w, "params"); 8950 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8951 spdk_json_write_object_end(w); 8952 8953 spdk_json_write_object_end(w); 8954 } 8955 #endif 8956 8957 static void 8958 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8959 struct nvme_ctrlr *nvme_ctrlr, 8960 struct nvme_path_id *path_id) 8961 { 8962 struct spdk_nvme_transport_id *trid; 8963 const struct spdk_nvme_ctrlr_opts *opts; 8964 8965 if (nvme_ctrlr->opts.from_discovery_service) { 8966 /* Do not emit an RPC for this - it will be implicitly 8967 * covered by a separate bdev_nvme_start_discovery or 8968 * bdev_nvme_start_mdns_discovery RPC. 
8969 */ 8970 return; 8971 } 8972 8973 trid = &path_id->trid; 8974 8975 spdk_json_write_object_begin(w); 8976 8977 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8978 8979 spdk_json_write_named_object_begin(w, "params"); 8980 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8981 nvme_bdev_dump_trid_json(trid, w); 8982 spdk_json_write_named_bool(w, "prchk_reftag", 8983 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8984 spdk_json_write_named_bool(w, "prchk_guard", 8985 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8986 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8987 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8988 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8989 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8990 if (nvme_ctrlr->psk != NULL) { 8991 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8992 } 8993 if (nvme_ctrlr->dhchap_key != NULL) { 8994 spdk_json_write_named_string(w, "dhchap_key", 8995 spdk_key_get_name(nvme_ctrlr->dhchap_key)); 8996 } 8997 if (nvme_ctrlr->dhchap_ctrlr_key != NULL) { 8998 spdk_json_write_named_string(w, "dhchap_ctrlr_key", 8999 spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key)); 9000 } 9001 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 9002 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 9003 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 9004 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 9005 if (opts->src_addr[0] != '\0') { 9006 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 9007 } 9008 if (opts->src_svcid[0] != '\0') { 9009 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 9010 } 9011 9012 if (nvme_ctrlr->opts.multipath) { 9013 spdk_json_write_named_string(w, "multipath", "multipath"); 9014 } 9015 spdk_json_write_object_end(w); 9016 9017 spdk_json_write_object_end(w); 9018 } 9019 9020 static void 9021 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 9022 { 9023 spdk_json_write_object_begin(w); 9024 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 9025 9026 spdk_json_write_named_object_begin(w, "params"); 9027 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 9028 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 9029 spdk_json_write_object_end(w); 9030 9031 spdk_json_write_object_end(w); 9032 } 9033 9034 static int 9035 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 9036 { 9037 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9038 struct nvme_ctrlr *nvme_ctrlr; 9039 struct discovery_ctx *ctx; 9040 struct nvme_path_id *path_id; 9041 9042 bdev_nvme_opts_config_json(w); 9043 9044 pthread_mutex_lock(&g_bdev_nvme_mutex); 9045 9046 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 9047 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 9048 path_id = nvme_ctrlr->active_path_id; 9049 assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 9050 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 9051 9052 path_id = TAILQ_NEXT(path_id, link); 9053 while (path_id != NULL) { 9054 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 9055 path_id = TAILQ_NEXT(path_id, link); 9056 } 9057 9058 #ifdef SPDK_CONFIG_NVME_CUSE 9059 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 9060 #endif 9061 } 9062 } 9063 9064 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9065 if (!ctx->from_mdns_discovery_service) { 
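			/* Contexts owned by the mDNS discovery service are dumped separately by
			 * bdev_nvme_mdns_discovery_config_json() below.
			 */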
9066 bdev_nvme_discovery_config_json(w, ctx); 9067 } 9068 } 9069 9070 bdev_nvme_mdns_discovery_config_json(w); 9071 9072 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 9073 * before enabling hotplug poller. 9074 */ 9075 bdev_nvme_hotplug_config_json(w); 9076 9077 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9078 return 0; 9079 } 9080 9081 struct spdk_nvme_ctrlr * 9082 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 9083 { 9084 struct nvme_bdev *nbdev; 9085 struct nvme_ns *nvme_ns; 9086 9087 if (!bdev || bdev->module != &nvme_if) { 9088 return NULL; 9089 } 9090 9091 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 9092 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 9093 assert(nvme_ns != NULL); 9094 9095 return nvme_ns->ctrlr->ctrlr; 9096 } 9097 9098 static bool 9099 nvme_io_path_is_current(struct nvme_io_path *io_path) 9100 { 9101 const struct nvme_bdev_channel *nbdev_ch; 9102 bool current; 9103 9104 if (!nvme_io_path_is_available(io_path)) { 9105 return false; 9106 } 9107 9108 nbdev_ch = io_path->nbdev_ch; 9109 if (nbdev_ch == NULL) { 9110 current = false; 9111 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 9112 struct nvme_io_path *optimized_io_path = NULL; 9113 9114 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 9115 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 9116 break; 9117 } 9118 } 9119 9120 /* A non-optimized path is only current if there are no optimized paths. */ 9121 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 9122 (optimized_io_path == NULL); 9123 } else { 9124 if (nbdev_ch->current_io_path) { 9125 current = (io_path == nbdev_ch->current_io_path); 9126 } else { 9127 struct nvme_io_path *first_path; 9128 9129 /* We arrived here as there are no optimized paths for active-passive 9130 * mode. Check if this io_path is the first one available on the list. 9131 */ 9132 current = false; 9133 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 9134 if (nvme_io_path_is_available(first_path)) { 9135 current = (io_path == first_path); 9136 break; 9137 } 9138 } 9139 } 9140 } 9141 9142 return current; 9143 } 9144 9145 static struct nvme_ctrlr * 9146 bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev) 9147 { 9148 struct nvme_ctrlr *next; 9149 9150 /* Must be called under g_bdev_nvme_mutex */ 9151 next = prev != NULL ? 
TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 9152 while (next != NULL) { 9153 /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */ 9154 pthread_mutex_lock(&next->mutex); 9155 if (next->ref > 0) { 9156 next->ref++; 9157 pthread_mutex_unlock(&next->mutex); 9158 return next; 9159 } 9160 9161 pthread_mutex_unlock(&next->mutex); 9162 next = TAILQ_NEXT(next, tailq); 9163 } 9164 9165 return NULL; 9166 } 9167 9168 struct bdev_nvme_set_keys_ctx { 9169 struct nvme_ctrlr *nctrlr; 9170 struct spdk_key *dhchap_key; 9171 struct spdk_key *dhchap_ctrlr_key; 9172 struct spdk_thread *thread; 9173 bdev_nvme_set_keys_cb cb_fn; 9174 void *cb_ctx; 9175 int status; 9176 }; 9177 9178 static void 9179 bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx) 9180 { 9181 if (ctx == NULL) { 9182 return; 9183 } 9184 9185 spdk_keyring_put_key(ctx->dhchap_key); 9186 spdk_keyring_put_key(ctx->dhchap_ctrlr_key); 9187 free(ctx); 9188 } 9189 9190 static void 9191 _bdev_nvme_set_keys_done(void *_ctx) 9192 { 9193 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9194 9195 ctx->cb_fn(ctx->cb_ctx, ctx->status); 9196 9197 if (ctx->nctrlr != NULL) { 9198 nvme_ctrlr_put_ref(ctx->nctrlr); 9199 } 9200 bdev_nvme_free_set_keys_ctx(ctx); 9201 } 9202 9203 static void 9204 bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status) 9205 { 9206 ctx->status = status; 9207 spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx); 9208 } 9209 9210 static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx); 9211 9212 static void 9213 bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx) 9214 { 9215 struct nvme_ctrlr *next; 9216 9217 pthread_mutex_lock(&g_bdev_nvme_mutex); 9218 next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr); 9219 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9220 9221 nvme_ctrlr_put_ref(ctx->nctrlr); 9222 ctx->nctrlr = next; 9223 9224 if (next == NULL) { 9225 bdev_nvme_set_keys_done(ctx, 0); 9226 } else { 9227 bdev_nvme_authenticate_ctrlr(ctx); 9228 } 9229 } 9230 9231 static void 9232 bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status) 9233 { 9234 struct bdev_nvme_set_keys_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 9235 9236 if (status != 0) { 9237 bdev_nvme_set_keys_done(ctx, status); 9238 return; 9239 } 9240 bdev_nvme_authenticate_ctrlr_continue(ctx); 9241 } 9242 9243 static void 9244 bdev_nvme_authenticate_qpair_done(void *ctx, int status) 9245 { 9246 spdk_for_each_channel_continue(ctx, status); 9247 } 9248 9249 static void 9250 bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i) 9251 { 9252 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9253 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 9254 struct nvme_qpair *qpair = ctrlr_ch->qpair; 9255 int rc; 9256 9257 if (!nvme_qpair_is_connected(qpair)) { 9258 spdk_for_each_channel_continue(i, 0); 9259 return; 9260 } 9261 9262 rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i); 9263 if (rc != 0) { 9264 spdk_for_each_channel_continue(i, rc); 9265 } 9266 } 9267 9268 static void 9269 bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status) 9270 { 9271 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9272 9273 if (status != 0) { 9274 bdev_nvme_set_keys_done(ctx, status); 9275 return; 9276 } 9277 9278 spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx, 9279 bdev_nvme_authenticate_qpairs_done); 9280 } 9281 9282 static void 9283 bdev_nvme_authenticate_ctrlr(struct 
bdev_nvme_set_keys_ctx *ctx) 9284 { 9285 struct spdk_nvme_ctrlr_key_opts opts = {}; 9286 struct nvme_ctrlr *nctrlr = ctx->nctrlr; 9287 int rc; 9288 9289 opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key); 9290 opts.dhchap_key = ctx->dhchap_key; 9291 opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key; 9292 rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts); 9293 if (rc != 0) { 9294 bdev_nvme_set_keys_done(ctx, rc); 9295 return; 9296 } 9297 9298 if (ctx->dhchap_key != NULL) { 9299 rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr, 9300 bdev_nvme_authenticate_ctrlr_done, ctx); 9301 if (rc != 0) { 9302 bdev_nvme_set_keys_done(ctx, rc); 9303 } 9304 } else { 9305 bdev_nvme_authenticate_ctrlr_continue(ctx); 9306 } 9307 } 9308 9309 int 9310 bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key, 9311 bdev_nvme_set_keys_cb cb_fn, void *cb_ctx) 9312 { 9313 struct bdev_nvme_set_keys_ctx *ctx; 9314 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9315 struct nvme_ctrlr *nctrlr; 9316 9317 ctx = calloc(1, sizeof(*ctx)); 9318 if (ctx == NULL) { 9319 return -ENOMEM; 9320 } 9321 9322 if (dhchap_key != NULL) { 9323 ctx->dhchap_key = spdk_keyring_get_key(dhchap_key); 9324 if (ctx->dhchap_key == NULL) { 9325 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name); 9326 bdev_nvme_free_set_keys_ctx(ctx); 9327 return -ENOKEY; 9328 } 9329 } 9330 if (dhchap_ctrlr_key != NULL) { 9331 ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key); 9332 if (ctx->dhchap_ctrlr_key == NULL) { 9333 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name); 9334 bdev_nvme_free_set_keys_ctx(ctx); 9335 return -ENOKEY; 9336 } 9337 } 9338 9339 pthread_mutex_lock(&g_bdev_nvme_mutex); 9340 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 9341 if (nbdev_ctrlr == NULL) { 9342 SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name); 9343 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9344 bdev_nvme_free_set_keys_ctx(ctx); 9345 return -ENODEV; 9346 } 9347 nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL); 9348 if (nctrlr == NULL) { 9349 SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name); 9350 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9351 bdev_nvme_free_set_keys_ctx(ctx); 9352 return -ENODEV; 9353 } 9354 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9355 9356 ctx->nctrlr = nctrlr; 9357 ctx->cb_fn = cb_fn; 9358 ctx->cb_ctx = cb_ctx; 9359 ctx->thread = spdk_get_thread(); 9360 9361 bdev_nvme_authenticate_ctrlr(ctx); 9362 9363 return 0; 9364 } 9365 9366 void 9367 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 9368 { 9369 struct nvme_ns *nvme_ns = io_path->nvme_ns; 9370 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 9371 const struct spdk_nvme_ctrlr_data *cdata; 9372 const struct spdk_nvme_transport_id *trid; 9373 const char *adrfam_str; 9374 9375 spdk_json_write_object_begin(w); 9376 9377 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 9378 9379 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 9380 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 9381 9382 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 9383 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 9384 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 9385 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 9386 9387 spdk_json_write_named_object_begin(w, "transport"); 9388 spdk_json_write_named_string(w, "trtype", trid->trstring); 9389 
spdk_json_write_named_string(w, "traddr", trid->traddr); 9390 if (trid->trsvcid[0] != '\0') { 9391 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 9392 } 9393 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 9394 if (adrfam_str) { 9395 spdk_json_write_named_string(w, "adrfam", adrfam_str); 9396 } 9397 spdk_json_write_object_end(w); 9398 9399 spdk_json_write_object_end(w); 9400 } 9401 9402 void 9403 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 9404 { 9405 struct discovery_ctx *ctx; 9406 struct discovery_entry_ctx *entry_ctx; 9407 9408 spdk_json_write_array_begin(w); 9409 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9410 spdk_json_write_object_begin(w); 9411 spdk_json_write_named_string(w, "name", ctx->name); 9412 9413 spdk_json_write_named_object_begin(w, "trid"); 9414 nvme_bdev_dump_trid_json(&ctx->trid, w); 9415 spdk_json_write_object_end(w); 9416 9417 spdk_json_write_named_array_begin(w, "referrals"); 9418 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 9419 spdk_json_write_object_begin(w); 9420 spdk_json_write_named_object_begin(w, "trid"); 9421 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 9422 spdk_json_write_object_end(w); 9423 spdk_json_write_object_end(w); 9424 } 9425 spdk_json_write_array_end(w); 9426 9427 spdk_json_write_object_end(w); 9428 } 9429 spdk_json_write_array_end(w); 9430 } 9431 9432 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 9433 9434 static void 9435 bdev_nvme_trace(void) 9436 { 9437 struct spdk_trace_tpoint_opts opts[] = { 9438 { 9439 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 9440 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 9441 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9442 }, 9443 { 9444 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 9445 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 9446 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9447 } 9448 }; 9449 9450 9451 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 9452 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9453 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9454 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9455 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9456 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9457 } 9458 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 9459