/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/keyring.h"
#include "spdk/likely.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/opal.h"
#include "spdk/thread.h"
#include "spdk/trace.h"
#include "spdk/string.h"
#include "spdk/util.h"
#include "spdk/uuid.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#include "spdk_internal/usdt.h"
#include "spdk_internal/trace_defs.h"

#define CTRLR_STRING(nvme_ctrlr) \
	(spdk_nvme_trtype_is_fabrics(nvme_ctrlr->active_path_id->trid.trtype) ? \
	nvme_ctrlr->active_path_id->trid.subnqn : nvme_ctrlr->active_path_id->trid.traddr)

#define CTRLR_ID(nvme_ctrlr)	(spdk_nvme_ctrlr_get_id(nvme_ctrlr->ctrlr))

#define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \
	SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \
	SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \
	SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#ifdef DEBUG
#define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \
	SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);
#else
#define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0)
#endif

#define BDEV_STRING(nbdev) (nbdev->disk.name)

#define NVME_BDEV_ERRLOG(nbdev, format, ...) \
	SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_WARNLOG(nbdev, format, ...) \
	SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_NOTICELOG(nbdev, format, ...) \
	SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_INFOLOG(nbdev, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

#define NSID_STR_LEN 10

#define SPDK_CONTROLLER_NAME_MAX 512

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** Offset in the current iovec of the fused command. */
	uint32_t fused_iov_offset;

	/** array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position for the fused command. */
	int fused_iovpos;

	/** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
	 * being reset in a reset I/O.
	 */
	struct nvme_io_path *io_path;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Keeps track if first of fused commands was completed */
	bool first_fused_completed;

	/* How many times the current I/O was retried. */
	int32_t retry_count;

	/** Expiration value in ticks to retry the current I/O. */
	uint64_t retry_ticks;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;

	/* Current tsc at submit time. */
	uint64_t submit_tsc;

	/* Used to put nvme_bdev_io into the list */
	TAILQ_ENTRY(nvme_bdev_io) retry_link;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

#define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
				   SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))

#define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
				    SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.transport_retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.io_queue_requests = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
	.bdev_retry_count = 3,
	.ctrlr_loss_timeout_sec = 0,
	.reconnect_delay_sec = 0,
	.fast_io_fail_timeout_sec = 0,
	.transport_ack_timeout = 0,
	.disable_auto_failback = false,
	.generate_uuids = false,
	.transport_tos = 0,
	.nvme_error_stat = false,
	.io_path_stat = false,
	.allow_accel_sequence = false,
	.dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
	.dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
				      struct spdk_bdev_io *bdev_io);
static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
				     struct spdk_bdev_io *bdev_io);
static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			   void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			   struct spdk_accel_sequence *seq);
static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				 void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			    void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
			    struct spdk_accel_sequence *seq,
			    union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
				  void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
		struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
				   uint32_t num_zones, struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				     struct nvme_bdev_io *bio,
				     struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				 void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				    void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
				     struct iovec *iov, int iovcnt, size_t nbytes,
				     void *md_buf, size_t md_len);
static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			    struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio);
static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);

static struct nvme_ns *nvme_ns_alloc(void);
static void nvme_ns_free(struct nvme_ns *ns);

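/* Namespaces are tracked per controller in the nvme_ns_tree red-black tree generated
 * below, keyed by namespace ID. This comparator defines that ordering, so
 * nvme_ctrlr_get_ns() is an O(log n) lookup and the first/next active-namespace
 * helpers iterate namespaces in ascending nsid order.
 */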
static int
nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
{
	return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
}

RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
bool g_bdev_nvme_module_finish;

struct nvme_bdev_ctrlr *
nvme_bdev_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		if (strcmp(name, nbdev_ctrlr->name) == 0) {
			break;
		}
	}

	return nbdev_ctrlr;
}

static struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
			  const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	const struct spdk_nvme_ctrlr_opts *opts;
	struct nvme_ctrlr *nvme_ctrlr;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
		if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
		    strcmp(hostnqn, opts->hostnqn) == 0) {
			break;
		}
	}

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
				uint16_t cntlid)
{
	struct nvme_ctrlr *nvme_ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
		if (cdata->cntlid == cntlid) {
			break;
		}
	}

	return nvme_ctrlr;
}

static struct nvme_bdev *
nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
{
	struct nvme_bdev *bdev;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
		if (bdev->nsid == nsid) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return bdev;
}

struct nvme_ns *
nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
{
	struct nvme_ns ns;

	assert(nsid > 0);

	ns.id = nsid;
	return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
}

struct nvme_ns *
nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
{
	return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
}

struct nvme_ns *
nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
{
	if (ns == NULL) {
		return NULL;
	}

	return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
}

static struct nvme_ctrlr *
nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
		if (nvme_ctrlr != NULL) {
			break;
		}
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

struct nvme_ctrlr *
nvme_ctrlr_get_by_name(const char *name)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;
	struct nvme_ctrlr *nvme_ctrlr = NULL;

	if (name == NULL) {
		return NULL;
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
	if (nbdev_ctrlr != NULL) {
		nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	return nvme_ctrlr;
}

void
nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
{
	struct nvme_bdev_ctrlr *nbdev_ctrlr;

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
		fn(nbdev_ctrlr, ctx);
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

struct nvme_ctrlr_channel_iter {
	nvme_ctrlr_for_each_channel_msg fn;
	nvme_ctrlr_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

void
nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static void
nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);

	iter->i = i;
	iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx);
}

static void
nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	iter->i = i;
	iter->cpl(nvme_ctrlr, iter->ctx, status);

	free(iter);
}

void
nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
			    nvme_ctrlr_for_each_channel_msg fn, void *ctx,
			    nvme_ctrlr_for_each_channel_done cpl)
{
	struct nvme_ctrlr_channel_iter *iter;

	assert(nvme_ctrlr != NULL && fn != NULL);

	iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg,
			      iter, nvme_ctrlr_each_channel_cpl);
}

struct nvme_bdev_channel_iter {
	nvme_bdev_for_each_channel_msg fn;
	nvme_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

void
nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static void
nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);

	iter->i = i;
	iter->fn(iter, nbdev, nbdev_ch, iter->ctx);
}

static void
nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);

	iter->i = i;
	iter->cpl(nbdev, iter->ctx, status);

	free(iter);
}

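/* Run fn for every I/O channel of nbdev on the thread that owns that channel. Each
 * handler must finish by calling nvme_bdev_for_each_channel_continue(); a non-zero
 * status ends the iteration early. When the iteration completes (or is cut short),
 * cpl is invoked with the final status on the thread that started the iteration.
 */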
void
nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
			   nvme_bdev_for_each_channel_msg fn, void *ctx,
			   nvme_bdev_for_each_channel_done cpl)
{
	struct nvme_bdev_channel_iter *iter;

	assert(nbdev != NULL && fn != NULL);

	iter = calloc(1, sizeof(struct nvme_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter,
			      nvme_bdev_each_channel_cpl);
}

void
nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
{
	const char *trtype_str;
	const char *adrfam_str;

	trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
	if (trtype_str) {
		spdk_json_write_named_string(w, "trtype", trtype_str);
	}

	adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
	if (adrfam_str) {
		spdk_json_write_named_string(w, "adrfam", adrfam_str);
	}

	if (trid->traddr[0] != '\0') {
		spdk_json_write_named_string(w, "traddr", trid->traddr);
	}

	if (trid->trsvcid[0] != '\0') {
		spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
	}

	if (trid->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", trid->subnqn);
	}
}

static void
nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
		       struct nvme_ctrlr *nvme_ctrlr)
{
	SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
	pthread_mutex_lock(&g_bdev_nvme_mutex);

	TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
	if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);

		return;
	}
	TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));

	free(nbdev_ctrlr->name);
	free(nbdev_ctrlr);
}

static void
_nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	struct nvme_path_id *path_id, *tmp_path;
	struct nvme_ns *ns, *tmp_ns;

	free(nvme_ctrlr->copied_ana_desc);
	spdk_free(nvme_ctrlr->ana_log_page);

	if (nvme_ctrlr->opal_dev) {
		spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
		nvme_ctrlr->opal_dev = NULL;
	}

	if (nvme_ctrlr->nbdev_ctrlr) {
		nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
	}

	RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
		RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
		nvme_ns_free(ns);
	}

	TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
		TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
		free(path_id);
	}

	pthread_mutex_destroy(&nvme_ctrlr->mutex);
	spdk_keyring_put_key(nvme_ctrlr->psk);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
	spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
	free(nvme_ctrlr);

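	/* If the module is being finalized and this was the last registered controller,
	 * finish the deferred module teardown: unregister the io_device and signal
	 * spdk_bdev_module_fini_done().
	 */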
	pthread_mutex_lock(&g_bdev_nvme_mutex);
	if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
		pthread_mutex_unlock(&g_bdev_nvme_mutex);
		spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
		spdk_bdev_module_fini_done();
		return;
	}
	pthread_mutex_unlock(&g_bdev_nvme_mutex);
}

static int
nvme_detach_poller(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
	if (rc != -EAGAIN) {
		spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
		_nvme_ctrlr_delete(nvme_ctrlr);
	}

	return SPDK_POLLER_BUSY;
}

static void
nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);

	if (spdk_interrupt_mode_is_enabled()) {
		spdk_interrupt_unregister(&nvme_ctrlr->intr);
	}

	/* First, unregister the adminq poller, as the driver will poll adminq if necessary */
	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	/* If we got here, the reset/detach poller cannot be active */
	assert(nvme_ctrlr->reset_detach_poller == NULL);
	nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
					  nvme_ctrlr, 1000);
	if (nvme_ctrlr->reset_detach_poller == NULL) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n");
		goto error;
	}

	rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
	if (rc != 0) {
		NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n");
		goto error;
	}

	return;
error:
	/* We don't have a good way to handle errors here, so just do what we can and delete the
	 * controller without detaching the underlying NVMe device.
	 */
	spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
	_nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;

	nvme_ctrlr_delete(nvme_ctrlr);
}

static void
nvme_ctrlr_unregister(void *ctx)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
}

static bool
nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
{
	if (!nvme_ctrlr->destruct) {
		return false;
	}

	if (nvme_ctrlr->ref > 0) {
		return false;
	}

	if (nvme_ctrlr->resetting) {
		return false;
	}

	if (nvme_ctrlr->ana_log_page_updating) {
		return false;
	}

	if (nvme_ctrlr->io_path_cache_clearing) {
		return false;
	}

	return true;
}

static void
nvme_ctrlr_put_ref(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);

	assert(nvme_ctrlr->ref > 0);
	nvme_ctrlr->ref--;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
}

static void
nvme_ctrlr_get_ref(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->ref++;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);
}

static void
bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	nbdev_ch->current_io_path = NULL;
	nbdev_ch->rr_counter = 0;
}

static struct nvme_io_path *
_bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns == nvme_ns) {
			break;
		}
	}

	return io_path;
}

static struct nvme_io_path *
nvme_io_path_alloc(void)
{
	struct nvme_io_path *io_path;

	io_path = calloc(1, sizeof(*io_path));
	if (io_path == NULL) {
		SPDK_ERRLOG("Failed to alloc io_path.\n");
		return NULL;
	}

	if (g_opts.io_path_stat) {
		io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
		if (io_path->stat == NULL) {
			free(io_path);
			SPDK_ERRLOG("Failed to alloc io_path stat.\n");
			return NULL;
		}
		spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
	}

	return io_path;
}

static void
nvme_io_path_free(struct nvme_io_path *io_path)
{
	free(io_path->stat);
	free(io_path);
}

static int
_bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
{
	struct nvme_io_path *io_path;
	struct spdk_io_channel *ch;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_qpair *nvme_qpair;

	io_path = nvme_io_path_alloc();
	if (io_path == NULL) {
		return -ENOMEM;
	}

	io_path->nvme_ns = nvme_ns;

	ch = spdk_get_io_channel(nvme_ns->ctrlr);
	if (ch == NULL) {
		nvme_io_path_free(io_path);
		SPDK_ERRLOG("Failed to alloc io_channel.\n");
		return -ENOMEM;
	}

	ctrlr_ch = spdk_io_channel_get_ctx(ch);

	nvme_qpair = ctrlr_ch->qpair;
	assert(nvme_qpair != NULL);

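	/* Link the new io_path to both of its owners: the per-controller nvme_qpair, so
	 * qpair-level events (disconnect, cache clearing) can reach every path built on
	 * it, and the nvme_bdev_channel, so path selection can consider it for I/O.
	 */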
	io_path->qpair = nvme_qpair;
	TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);

	io_path->nbdev_ch = nbdev_ch;
	STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);

	bdev_nvme_clear_current_io_path(nbdev_ch);

	return 0;
}

static void
bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
			      struct nvme_io_path *io_path)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio->io_path == io_path) {
			bio->io_path = NULL;
		}
	}
}

static void
_bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
{
	struct spdk_io_channel *ch;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr_channel *ctrlr_ch;
	struct nvme_bdev *nbdev;

	nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));

	/* Add the statistics to nvme_ns before this path is destroyed. */
	pthread_mutex_lock(&nbdev->mutex);
	if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
		spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
	}
	pthread_mutex_unlock(&nbdev->mutex);

	bdev_nvme_clear_current_io_path(nbdev_ch);
	bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);

	STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
	io_path->nbdev_ch = NULL;

	nvme_qpair = io_path->qpair;
	assert(nvme_qpair != NULL);

	ctrlr_ch = nvme_qpair->ctrlr_ch;
	assert(ctrlr_ch != NULL);

	ch = spdk_io_channel_from_ctx(ctrlr_ch);
	spdk_put_io_channel(ch);

	/* After an io_path is removed, I/Os submitted to it may complete and update statistics
	 * of the io_path. To avoid heap-use-after-free error from this case, do not free the
	 * io_path here but free the io_path when the associated qpair is freed. It is ensured
	 * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
	 */
}

static void
_bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *tmp_io_path;

	STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
		_bdev_nvme_delete_io_path(nbdev_ch, io_path);
	}
}

static int
bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;
	struct nvme_bdev *nbdev = io_device;
	struct nvme_ns *nvme_ns;
	int rc;

	STAILQ_INIT(&nbdev_ch->io_path_list);
	TAILQ_INIT(&nbdev_ch->retry_io_list);

	pthread_mutex_lock(&nbdev->mutex);

	nbdev_ch->mp_policy = nbdev->mp_policy;
	nbdev_ch->mp_selector = nbdev->mp_selector;
	nbdev_ch->rr_min_io = nbdev->rr_min_io;

	TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
		rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
		if (rc != 0) {
			pthread_mutex_unlock(&nbdev->mutex);

			_bdev_nvme_delete_io_paths(nbdev_ch);
			return rc;
		}
	}
	pthread_mutex_unlock(&nbdev->mutex);

	return 0;
}

/* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
 * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
 */
static inline void
__bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
			const struct spdk_nvme_cpl *cpl)
{
	spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
			  (uintptr_t)bdev_io);
	if (cpl) {
		spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
	} else {
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);

static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *nbdev_ch = ctx_buf;

	bdev_nvme_abort_retry_ios(nbdev_ch);
	_bdev_nvme_delete_io_paths(nbdev_ch);
}

static inline bool
bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
{
	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_ns_is_active(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(nvme_ns->ana_state_updating)) {
		return false;
	}

	if (spdk_unlikely(nvme_ns->ns == NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
{
	if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
		return false;
	}

	switch (nvme_ns->ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return true;
	default:
		break;
	}

	return false;
}

static inline bool
nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
{
	if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
		return false;
	}

	if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
			  SPDK_NVME_QPAIR_FAILURE_NONE)) {
		return false;
	}

	if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
		return false;
	}

	return true;
}

static inline bool
nvme_io_path_is_available(struct nvme_io_path *io_path)
{
	if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
		return false;
	}

	if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
		return false;
	}

	return true;
}

static inline bool
nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return true;
	}

	if (nvme_ctrlr->fast_io_fail_timedout) {
		return true;
	}

	if (nvme_ctrlr->resetting) {
		if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
			return false;
		} else {
			return true;
		}
	}

	if (nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return true;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return true;
	} else {
		return false;
	}
}

static bool
nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
{
	if (nvme_ctrlr->destruct) {
		return false;
	}

	if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
		return false;
	}

	if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
		return false;
	}

	if (nvme_ctrlr->disabled) {
		return false;
	}

	return true;
}

/* Simulate circular linked list. */
static inline struct nvme_io_path *
nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
{
	struct nvme_io_path *next_path;

	if (prev_path != NULL) {
		next_path = STAILQ_NEXT(prev_path, stailq);
		if (next_path != NULL) {
			return next_path;
		}
	}

	return STAILQ_FIRST(&nbdev_ch->io_path_list);
}

static struct nvme_io_path *
_bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path, *start, *non_optimized = NULL;

	start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);

	io_path = start;
	do {
		if (spdk_likely(nvme_io_path_is_available(io_path))) {
			switch (io_path->nvme_ns->ana_state) {
			case SPDK_NVME_ANA_OPTIMIZED_STATE:
				nbdev_ch->current_io_path = io_path;
				return io_path;
			case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
				if (non_optimized == NULL) {
					non_optimized = io_path;
				}
				break;
			default:
				assert(false);
				break;
			}
		}
		io_path = nvme_io_path_get_next(nbdev_ch, io_path);
	} while (io_path != start);

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
		/* We come here only if there is no optimized path. Cache the non_optimized
		 * path as well, to balance load across multiple non-optimized paths.
		 */
		nbdev_ch->current_io_path = non_optimized;
	}

	return non_optimized;
}

static struct nvme_io_path *
_bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;
	struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
	uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
	uint32_t num_outstanding_reqs;

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
			/* The device is currently resetting. */
			continue;
		}

		if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
			continue;
		}

		num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
		switch (io_path->nvme_ns->ana_state) {
		case SPDK_NVME_ANA_OPTIMIZED_STATE:
			if (num_outstanding_reqs < opt_min_qd) {
				opt_min_qd = num_outstanding_reqs;
				optimized = io_path;
			}
			break;
		case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
			if (num_outstanding_reqs < non_opt_min_qd) {
				non_opt_min_qd = num_outstanding_reqs;
				non_optimized = io_path;
			}
			break;
		default:
			break;
		}
	}

	/* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
	if (optimized != NULL) {
		return optimized;
	}

	return non_optimized;
}

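/* Select the io_path for the next I/O according to the channel's multipath policy.
 * active_passive, and active_active with the round-robin selector, reuse the cached
 * current_io_path (round-robin moves to a new path every rr_min_io I/Os); the
 * queue-depth selector re-evaluates on every call and picks the least busy path.
 */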
static inline struct nvme_io_path *
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
{
	if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
		if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
			return nbdev_ch->current_io_path;
		} else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
			if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
				return nbdev_ch->current_io_path;
			}
			nbdev_ch->rr_counter = 0;
		}
	}

	if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
	    nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
		return _bdev_nvme_find_io_path(nbdev_ch);
	} else {
		return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
	}
}

/* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
 * or false otherwise.
 *
 * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
 * is likely to be non-accessible now but may become accessible.
 *
 * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
 * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
 * when starting to reset it but it is set to failed when the reset failed. Hence, if
 * a ctrlr is unfailed, it is likely that it works fine or is resetting.
 */
static bool
any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_io_path *io_path;

	if (nbdev_ch->resetting) {
		return false;
	}

	STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
		if (io_path->nvme_ns->ana_transition_timedout) {
			continue;
		}

		if (nvme_qpair_is_connected(io_path->qpair) ||
		    !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
			return true;
		}
	}

	return false;
}

static void
bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_io_channel *ch;

	if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
		_bdev_nvme_submit_request(nbdev_ch, bdev_io);
	} else {
		ch = spdk_io_channel_from_ctx(nbdev_ch);
		bdev_nvme_submit_request(ch, bdev_io);
	}
}

static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct nvme_bdev_io *bio, *tmp_bio;
	uint64_t now, delay_us;

	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		if (bio->retry_ticks > now) {
			break;
		}

		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);

		bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bio != NULL) {
		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio, uint64_t delay_ms)
{
	struct nvme_bdev_io *tmp_bio;

	bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;

	TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
		if (tmp_bio->retry_ticks <= bio->retry_ticks) {
			TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
					   retry_link);
			return;
		}
	}

	/* No earlier I/Os were found. This I/O must be the new head. */
	TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
				    delay_ms * 1000ULL);
}

static void
bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
{
	struct nvme_bdev_io *bio, *tmp_bio;

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
		__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);
}

static int
bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
			 struct nvme_bdev_io *bio_to_abort)
{
	struct nvme_bdev_io *bio;

	TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
		if (bio == bio_to_abort) {
			TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
			__bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
			return 0;
		}
	}

	return -ENOENT;
}

static void
bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev *nbdev;
	uint16_t sct, sc;

	assert(spdk_nvme_cpl_is_error(cpl));

	nbdev = bdev_io->bdev->ctxt;

	if (nbdev->err_stat == NULL) {
		return;
	}

	sct = cpl->status.sct;
	sc = cpl->status.sc;

	pthread_mutex_lock(&nbdev->mutex);

	nbdev->err_stat->status_type[sct]++;
	switch (sct) {
	case SPDK_NVME_SCT_GENERIC:
	case SPDK_NVME_SCT_COMMAND_SPECIFIC:
	case SPDK_NVME_SCT_MEDIA_ERROR:
	case SPDK_NVME_SCT_PATH:
		nbdev->err_stat->status[sct][sc]++;
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&nbdev->mutex);
}

static inline void
bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
	uint32_t blocklen = bdev_io->bdev->blocklen;
	struct spdk_bdev_io_stat *stat;
	uint64_t tsc_diff;

	if (bio->io_path->stat == NULL) {
		return;
	}

	tsc_diff = spdk_get_ticks() - bio->submit_tsc;
	stat = bio->io_path->stat;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		stat->bytes_read += num_blocks * blocklen;
		stat->num_read_ops++;
		stat->read_latency_ticks += tsc_diff;
		if (stat->max_read_latency_ticks < tsc_diff) {
			stat->max_read_latency_ticks = tsc_diff;
		}
		if (stat->min_read_latency_ticks > tsc_diff) {
			stat->min_read_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		stat->bytes_written += num_blocks * blocklen;
		stat->num_write_ops++;
		stat->write_latency_ticks += tsc_diff;
		if (stat->max_write_latency_ticks < tsc_diff) {
			stat->max_write_latency_ticks = tsc_diff;
		}
		if (stat->min_write_latency_ticks > tsc_diff) {
			stat->min_write_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		stat->bytes_unmapped += num_blocks * blocklen;
		stat->num_unmap_ops++;
		stat->unmap_latency_ticks += tsc_diff;
		if (stat->max_unmap_latency_ticks < tsc_diff) {
			stat->max_unmap_latency_ticks = tsc_diff;
		}
		if (stat->min_unmap_latency_ticks > tsc_diff) {
			stat->min_unmap_latency_ticks = tsc_diff;
		}
		break;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (!bdev_io->u.bdev.zcopy.start) {
			break;
		}
		if (bdev_io->u.bdev.zcopy.populate) {
			stat->bytes_read += num_blocks * blocklen;
			stat->num_read_ops++;
			stat->read_latency_ticks += tsc_diff;
			if (stat->max_read_latency_ticks < tsc_diff) {
				stat->max_read_latency_ticks = tsc_diff;
			}
			if (stat->min_read_latency_ticks > tsc_diff) {
				stat->min_read_latency_ticks = tsc_diff;
			}
		} else {
			stat->bytes_written += num_blocks * blocklen;
			stat->num_write_ops++;
			stat->write_latency_ticks += tsc_diff;
			if (stat->max_write_latency_ticks < tsc_diff) {
				stat->max_write_latency_ticks = tsc_diff;
			}
			if (stat->min_write_latency_ticks > tsc_diff) {
				stat->min_write_latency_ticks = tsc_diff;
			}
		}
		break;
	case SPDK_BDEV_IO_TYPE_COPY:
		stat->bytes_copied += num_blocks * blocklen;
		stat->num_copy_ops++;
		stat->copy_latency_ticks += tsc_diff;
		if (stat->max_copy_latency_ticks < tsc_diff) {
			stat->max_copy_latency_ticks = tsc_diff;
		}
		if (stat->min_copy_latency_ticks > tsc_diff) {
			stat->min_copy_latency_ticks = tsc_diff;
		}
		break;
	default:
		break;
	}
}

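/* Decide whether a failed I/O should be requeued and, if so, after what delay.
 * Path-related failures (path or aborted-SQ-deletion status, or an io_path/ctrlr that
 * is no longer available) drop the cached path and retry immediately, provided some
 * path may still become available; an ANA error additionally kicks off an ANA log page
 * refresh. Any other failure is retried after the delay the controller requested via
 * the CRD field (cdata->crdt[] is expressed in units of 100 milliseconds).
 */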
static bool
bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
			 const struct spdk_nvme_cpl *cpl,
			 struct nvme_bdev_channel *nbdev_ch,
			 uint64_t *_delay_ms)
{
	struct nvme_io_path *io_path = bio->io_path;
	struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	if (spdk_nvme_cpl_is_path_error(cpl) ||
	    spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
	    !nvme_io_path_is_available(io_path) ||
	    !nvme_ctrlr_is_available(nvme_ctrlr)) {
		bdev_nvme_clear_current_io_path(nbdev_ch);
		bio->io_path = NULL;
		if (spdk_nvme_cpl_is_ana_error(cpl)) {
			if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
				io_path->nvme_ns->ana_state_updating = true;
			}
		}
		if (!any_io_path_may_become_available(nbdev_ch)) {
			return false;
		}
		*_delay_ms = 0;
	} else {
		bio->retry_count++;

		cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);

		if (cpl->status.crd != 0) {
			*_delay_ms = cdata->crdt[cpl->status.crd] * 100;
		} else {
			*_delay_ms = 0;
		}
	}

	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	uint64_t delay_ms;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
		bdev_nvme_update_io_path_stat(bio);
		goto complete;
	}

	/* Update error counts before deciding if retry is needed.
	 * Hence, error counts may be more than the number of I/O errors.
	 */
	bdev_nvme_update_nvme_error_stat(bdev_io, cpl);

	if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
	    (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
		goto complete;
	}

	/* At this point we don't know whether the sequence was successfully executed or not, so we
	 * cannot retry the IO */
	if (bdev_io->u.bdev.accel_sequence != NULL) {
		goto complete;
	}

	nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

	if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
		bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
		return;
	}

complete:
	bio->retry_count = 0;
	bio->submit_tsc = 0;
	bdev_io->u.bdev.accel_sequence = NULL;
	__bdev_nvme_io_complete(bdev_io, 0, cpl);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	struct nvme_bdev_channel *nbdev_ch;
	enum spdk_bdev_io_status io_status;

	assert(!bdev_nvme_io_type_is_admin(bdev_io->type));

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
		if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
			nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));

			bdev_nvme_clear_current_io_path(nbdev_ch);
			bio->io_path = NULL;

			if (any_io_path_may_become_available(nbdev_ch)) {
				bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
				return;
			}
		}

	/* fallthrough */
	default:
		spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
		bdev_io->u.bdev.accel_sequence = NULL;
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	bio->retry_count = 0;
	bio->submit_tsc = 0;
	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static inline void
bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	enum spdk_bdev_io_status io_status;

	switch (rc) {
	case 0:
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
		break;
	case -ENOMEM:
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
		break;
	case -ENXIO:
	/* fallthrough */
	default:
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
		break;
	}

	__bdev_nvme_io_complete(bdev_io, io_status, NULL);
}

static void
bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr,
				    void *ctx, int status)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);

	assert(nvme_ctrlr->io_path_cache_clearing == true);
	nvme_ctrlr->io_path_cache_clearing = false;

	if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_unregister(nvme_ctrlr);
}

static void
_bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
{
	struct nvme_io_path *io_path;

	TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
		if (io_path->nbdev_ch == NULL) {
			continue;
		}
		bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
	}
}

static void
bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i,
			      struct nvme_ctrlr *nvme_ctrlr,
			      struct nvme_ctrlr_channel *ctrlr_ch,
			      void *ctx)
{
	assert(ctrlr_ch->qpair != NULL);

	_bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);

	nvme_ctrlr_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
	    nvme_ctrlr->io_path_cache_clearing) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}

	nvme_ctrlr->io_path_cache_clearing = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_for_each_channel(nvme_ctrlr,
				    bdev_nvme_clear_io_path_cache,
				    NULL,
				    bdev_nvme_clear_io_path_caches_done);
}

static struct nvme_qpair *
nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == qpair) {
			break;
		}
	}

	return nvme_qpair;
}

static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	struct nvme_poll_group *group = poll_group_ctx;
	struct nvme_qpair *nvme_qpair;
	struct nvme_ctrlr *nvme_ctrlr;
	struct nvme_ctrlr_channel *ctrlr_ch;
	int status;

	nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
	if (nvme_qpair == NULL) {
		return;
	}

	if (nvme_qpair->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
		nvme_qpair->qpair = NULL;
	}

	_bdev_nvme_clear_io_path_cache(nvme_qpair);

	nvme_ctrlr = nvme_qpair->ctrlr;
	ctrlr_ch = nvme_qpair->ctrlr_ch;

	if (ctrlr_ch != NULL) {
		if (ctrlr_ch->reset_iter != NULL) {
			/* We are in a full reset sequence. */
			if (ctrlr_ch->connect_poller != NULL) {
				/* The qpair failed to connect. Abort the reset sequence. */
				NVME_CTRLR_INFOLOG(nvme_ctrlr,
						   "qpair %p failed to connect. Abort the reset ctrlr sequence.\n",
						   qpair);
				spdk_poller_unregister(&ctrlr_ch->connect_poller);
				status = -1;
			} else {
				/* The qpair finished disconnecting. Just move to the next ctrlr_channel. */
				NVME_CTRLR_INFOLOG(nvme_ctrlr,
						   "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
						   qpair);
				status = 0;
			}
			nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status);
			ctrlr_ch->reset_iter = NULL;
		} else {
			/* The qpair was disconnected unexpectedly. Reset the controller for recovery. */
			NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. Reset the controller.\n",
					   qpair);
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else {
		/* In this case, the ctrlr_channel is already deleted. */
		NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. Delete the nvme_qpair.\n",
				   qpair);
		nvme_qpair_delete(nvme_qpair);
	}
}

static void
bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
{
	struct nvme_qpair *nvme_qpair;

	TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
		if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
			continue;
		}

		if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
		    SPDK_NVME_QPAIR_FAILURE_NONE) {
			_bdev_nvme_clear_io_path_cache(nvme_qpair);
		}
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	if (spdk_unlikely(num_completions < 0)) {
		bdev_nvme_check_io_qpairs(group);
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int bdev_nvme_poll_adminq(void *arg);

static void
bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
{
	if (spdk_interrupt_mode_is_enabled()) {
		return;
	}

	spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);

	nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
					  nvme_ctrlr, new_period_us);
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;
	nvme_ctrlr_disconnected_cb disconnected_cb;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		disconnected_cb = nvme_ctrlr->disconnected_cb;
		nvme_ctrlr->disconnected_cb = NULL;

		if (disconnected_cb != NULL) {
			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
							    g_opts.nvme_adminq_poll_period_us);
			disconnected_cb(nvme_ctrlr);
		} else {
			bdev_nvme_failover_ctrlr(nvme_ctrlr);
		}
	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
		   SPDK_NVME_QPAIR_FAILURE_NONE) {
		bdev_nvme_clear_io_path_caches(nvme_ctrlr);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
nvme_bdev_free(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	pthread_mutex_destroy(&nvme_disk->mutex);
	free(nvme_disk->disk.name);
	free(nvme_disk->err_stat);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns, *tmp_nvme_ns;

	SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);

	TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
		pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

		nvme_ns->bdev = NULL;

		assert(nvme_ns->id > 0);

		if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

			nvme_ctrlr_put_ref(nvme_ns->ctrlr);
			nvme_ns_free(nvme_ns);
		} else {
			pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
		}
	}

	pthread_mutex_lock(&g_bdev_nvme_mutex);
	TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
	pthread_mutex_unlock(&g_bdev_nvme_mutex);

	spdk_io_device_unregister(nvme_disk, nvme_bdev_free);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
{
	struct nvme_ctrlr *nvme_ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	nvme_ctrlr = nvme_qpair->ctrlr;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
	opts.create_only = true;
	/* In interrupt mode, qpairs must be created in sync mode, otherwise they will never be
	 * connected. delay_cmd_submit must also be false, as in interrupt mode requests cannot be
	 * submitted from the completion context.
	 */
1949 */ 1950 if (!spdk_interrupt_mode_is_enabled()) { 1951 opts.async_mode = true; 1952 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 1953 } 1954 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 1955 g_opts.io_queue_requests = opts.io_queue_requests; 1956 1957 qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts)); 1958 if (qpair == NULL) { 1959 return -1; 1960 } 1961 1962 SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name, 1963 spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread)); 1964 1965 assert(nvme_qpair->group != NULL); 1966 1967 rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair); 1968 if (rc != 0) { 1969 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n"); 1970 goto err; 1971 } 1972 1973 rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair); 1974 if (rc != 0) { 1975 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n"); 1976 goto err; 1977 } 1978 1979 nvme_qpair->qpair = qpair; 1980 1981 if (!g_opts.disable_auto_failback) { 1982 _bdev_nvme_clear_io_path_cache(nvme_qpair); 1983 } 1984 1985 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n", 1986 qpair, spdk_nvme_qpair_get_id(qpair)); 1987 1988 return 0; 1989 1990 err: 1991 spdk_nvme_ctrlr_free_io_qpair(qpair); 1992 1993 return rc; 1994 } 1995 1996 static void bdev_nvme_reset_io_continue(void *cb_arg, int rc); 1997 1998 static void 1999 bdev_nvme_complete_pending_resets(struct nvme_ctrlr *nvme_ctrlr, bool success) 2000 { 2001 int rc = 0; 2002 struct nvme_bdev_io *bio; 2003 2004 if (!success) { 2005 rc = -1; 2006 } 2007 2008 while (!TAILQ_EMPTY(&nvme_ctrlr->pending_resets)) { 2009 bio = TAILQ_FIRST(&nvme_ctrlr->pending_resets); 2010 TAILQ_REMOVE(&nvme_ctrlr->pending_resets, bio, retry_link); 2011 2012 bdev_nvme_reset_io_continue(bio, rc); 2013 } 2014 } 2015 2016 /* This function marks the current trid as failed by storing the current ticks 2017 * and then sets the next trid to the active trid within a controller if exists. 2018 * 2019 * The purpose of the boolean return value is to request the caller to disconnect 2020 * the current trid now to try connecting the next trid. 2021 */ 2022 static bool 2023 bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start) 2024 { 2025 struct nvme_path_id *path_id, *next_path; 2026 int rc __attribute__((unused)); 2027 2028 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 2029 assert(path_id); 2030 assert(path_id == nvme_ctrlr->active_path_id); 2031 next_path = TAILQ_NEXT(path_id, link); 2032 2033 /* Update the last failed time. It means the trid is failed if its last 2034 * failed time is non-zero. 2035 */ 2036 path_id->last_failed_tsc = spdk_get_ticks(); 2037 2038 if (next_path == NULL) { 2039 /* There is no alternate trid within a controller. */ 2040 return false; 2041 } 2042 2043 if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2044 /* Connect is not retried in a controller reset sequence. Connecting 2045 * the next trid will be done by the next bdev_nvme_failover_ctrlr() call. 
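 *
 * Concretely: a plain controller reset (start == false) on a controller with
 * reconnect_delay_sec == 0 keeps reconnecting to the current trid; only an
 * explicit failover request (start == true), or a reset on a controller with a
 * non-zero reconnect_delay_sec, proceeds to the trid switch below.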
2046 */ 2047 return false; 2048 } 2049 2050 assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 2051 2052 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n", 2053 path_id->trid.traddr, path_id->trid.trsvcid, 2054 next_path->trid.traddr, next_path->trid.trsvcid); 2055 2056 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2057 nvme_ctrlr->active_path_id = next_path; 2058 rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid); 2059 assert(rc == 0); 2060 TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link); 2061 if (!remove) { 2062 /** Shuffle the old trid to the end of the list and use the new one. 2063 * Allows for round robin through multiple connections. 2064 */ 2065 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link); 2066 } else { 2067 free(path_id); 2068 } 2069 2070 if (start || next_path->last_failed_tsc == 0) { 2071 /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed 2072 * or used yet. Try the next trid now. 2073 */ 2074 return true; 2075 } 2076 2077 if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() * 2078 nvme_ctrlr->opts.reconnect_delay_sec) { 2079 /* Enough backoff passed since the next trid failed. Try the next trid now. */ 2080 return true; 2081 } 2082 2083 /* The next trid will be tried after reconnect_delay_sec seconds. */ 2084 return false; 2085 } 2086 2087 static bool 2088 bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr) 2089 { 2090 int32_t elapsed; 2091 2092 if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 || 2093 nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) { 2094 return false; 2095 } 2096 2097 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2098 if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) { 2099 return true; 2100 } else { 2101 return false; 2102 } 2103 } 2104 2105 static bool 2106 bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr) 2107 { 2108 uint32_t elapsed; 2109 2110 if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) { 2111 return false; 2112 } 2113 2114 elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz(); 2115 if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) { 2116 return true; 2117 } else { 2118 return false; 2119 } 2120 } 2121 2122 static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success); 2123 2124 static void 2125 nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn) 2126 { 2127 int rc; 2128 2129 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n"); 2130 2131 rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr); 2132 if (rc != 0) { 2133 NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n"); 2134 2135 /* Disconnect fails if ctrlr is already resetting or removed. In this case, 2136 * fail the reset sequence immediately. 2137 */ 2138 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2139 return; 2140 } 2141 2142 /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq. 2143 * Set callback here to execute the specified operation after ctrlr is really disconnected. 2144 */ 2145 assert(nvme_ctrlr->disconnected_cb == NULL); 2146 nvme_ctrlr->disconnected_cb = cb_fn; 2147 2148 /* During disconnection, reduce the period to poll adminq more often. 
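 * Passing 0 re-registers the adminq poller to run on every thread iteration;
 * bdev_nvme_poll_adminq() restores g_opts.nvme_adminq_poll_period_us once the
 * disconnect completes and the saved disconnected_cb is invoked.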
*/ 2149 bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0); 2150 } 2151 2152 enum bdev_nvme_op_after_reset { 2153 OP_NONE, 2154 OP_COMPLETE_PENDING_DESTRUCT, 2155 OP_DESTRUCT, 2156 OP_DELAYED_RECONNECT, 2157 OP_FAILOVER, 2158 }; 2159 2160 typedef enum bdev_nvme_op_after_reset _bdev_nvme_op_after_reset; 2161 2162 static _bdev_nvme_op_after_reset 2163 bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success) 2164 { 2165 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 2166 /* Complete pending destruct after reset completes. */ 2167 return OP_COMPLETE_PENDING_DESTRUCT; 2168 } else if (nvme_ctrlr->pending_failover) { 2169 nvme_ctrlr->pending_failover = false; 2170 nvme_ctrlr->reset_start_tsc = 0; 2171 return OP_FAILOVER; 2172 } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) { 2173 nvme_ctrlr->reset_start_tsc = 0; 2174 return OP_NONE; 2175 } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2176 return OP_DESTRUCT; 2177 } else { 2178 if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) { 2179 nvme_ctrlr->fast_io_fail_timedout = true; 2180 } 2181 return OP_DELAYED_RECONNECT; 2182 } 2183 } 2184 2185 static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug); 2186 static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr); 2187 2188 static int 2189 bdev_nvme_reconnect_delay_timer_expired(void *ctx) 2190 { 2191 struct nvme_ctrlr *nvme_ctrlr = ctx; 2192 2193 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name); 2194 pthread_mutex_lock(&nvme_ctrlr->mutex); 2195 2196 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2197 2198 if (!nvme_ctrlr->reconnect_is_delayed) { 2199 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2200 return SPDK_POLLER_BUSY; 2201 } 2202 2203 nvme_ctrlr->reconnect_is_delayed = false; 2204 2205 if (nvme_ctrlr->destruct) { 2206 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2207 return SPDK_POLLER_BUSY; 2208 } 2209 2210 assert(nvme_ctrlr->resetting == false); 2211 nvme_ctrlr->resetting = true; 2212 2213 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2214 2215 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2216 2217 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2218 return SPDK_POLLER_BUSY; 2219 } 2220 2221 static void 2222 bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr) 2223 { 2224 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2225 2226 assert(nvme_ctrlr->reconnect_is_delayed == false); 2227 nvme_ctrlr->reconnect_is_delayed = true; 2228 2229 assert(nvme_ctrlr->reconnect_delay_timer == NULL); 2230 nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired, 2231 nvme_ctrlr, 2232 nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC); 2233 } 2234 2235 static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr); 2236 2237 static void 2238 bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success) 2239 { 2240 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2241 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2242 enum bdev_nvme_op_after_reset op_after_reset; 2243 2244 assert(nvme_ctrlr->thread == spdk_get_thread()); 2245 2246 pthread_mutex_lock(&nvme_ctrlr->mutex); 2247 if (!success) { 2248 /* Connecting the active trid failed. Set the next alternate trid to the 2249 * active trid if it exists. 2250 */ 2251 if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) { 2252 /* The next alternate trid exists and is ready to try. Try it now. 
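 * Note that the reset sequence does not complete here: the controller is
 * disconnected again and bdev_nvme_reconnect_ctrlr() re-runs the reconnect
 * path against the trid that bdev_nvme_failover_trid() just made active.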
*/ 2253 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2254 2255 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n", 2256 nvme_ctrlr->active_path_id->trid.traddr, 2257 nvme_ctrlr->active_path_id->trid.trsvcid); 2258 2259 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2260 return; 2261 } 2262 2263 /* We came here if there is no alternate trid or if the next trid exists but 2264 * is not ready to try. We will try the active trid after reconnect_delay_sec 2265 * seconds if it is non-zero or at the next reset call otherwise. 2266 */ 2267 } else { 2268 /* Connecting the active trid succeeded. Clear the last failed time because it 2269 * means the trid is failed if its last failed time is non-zero. 2270 */ 2271 nvme_ctrlr->active_path_id->last_failed_tsc = 0; 2272 } 2273 2274 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n"); 2275 2276 /* Make sure we clear any pending resets before returning. */ 2277 bdev_nvme_complete_pending_resets(nvme_ctrlr, success); 2278 2279 if (!success) { 2280 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n"); 2281 } else { 2282 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n"); 2283 } 2284 2285 nvme_ctrlr->resetting = false; 2286 nvme_ctrlr->dont_retry = false; 2287 nvme_ctrlr->in_failover = false; 2288 2289 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2290 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2291 2292 op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success); 2293 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2294 2295 /* Delay callbacks when the next operation is a failover. */ 2296 if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) { 2297 ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1); 2298 } 2299 2300 switch (op_after_reset) { 2301 case OP_COMPLETE_PENDING_DESTRUCT: 2302 nvme_ctrlr_unregister(nvme_ctrlr); 2303 break; 2304 case OP_DESTRUCT: 2305 bdev_nvme_delete_ctrlr(nvme_ctrlr, false); 2306 remove_discovery_entry(nvme_ctrlr); 2307 break; 2308 case OP_DELAYED_RECONNECT: 2309 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer); 2310 break; 2311 case OP_FAILOVER: 2312 nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn; 2313 nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg; 2314 bdev_nvme_failover_ctrlr(nvme_ctrlr); 2315 break; 2316 default: 2317 break; 2318 } 2319 } 2320 2321 static void 2322 bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2323 { 2324 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2325 } 2326 2327 static void 2328 bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i, 2329 struct nvme_ctrlr *nvme_ctrlr, 2330 struct nvme_ctrlr_channel *ctrlr_ch, void *ctx) 2331 { 2332 struct nvme_qpair *nvme_qpair; 2333 struct spdk_nvme_qpair *qpair; 2334 2335 nvme_qpair = ctrlr_ch->qpair; 2336 assert(nvme_qpair != NULL); 2337 2338 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2339 2340 qpair = nvme_qpair->qpair; 2341 if (qpair != NULL) { 2342 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n", 2343 qpair, spdk_nvme_qpair_get_id(qpair)); 2344 2345 if (nvme_qpair->ctrlr->dont_retry) { 2346 spdk_nvme_qpair_set_abort_dnr(qpair, true); 2347 } 2348 spdk_nvme_ctrlr_disconnect_io_qpair(qpair); 2349 2350 /* The current full reset sequence will move to the next 2351 * ctrlr_channel after the qpair is actually disconnected. 
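 * The channel iterator is parked in ctrlr_ch->reset_iter and continued later,
 * typically from the poll group's disconnected-qpair handling, or from
 * bdev_nvme_destroy_ctrlr_channel_cb() if the channel goes away first.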
2352 */ 2353 assert(ctrlr_ch->reset_iter == NULL); 2354 ctrlr_ch->reset_iter = i; 2355 } else { 2356 nvme_ctrlr_for_each_channel_continue(i, 0); 2357 } 2358 } 2359 2360 static void 2361 bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2362 { 2363 if (status == 0) { 2364 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n"); 2365 2366 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true); 2367 } else { 2368 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were failed to create after ctrlr reset.\n"); 2369 2370 /* Delete the added qpairs and quiesce ctrlr to make the states clean. */ 2371 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2372 bdev_nvme_reset_destroy_qpair, 2373 NULL, 2374 bdev_nvme_reset_create_qpairs_failed); 2375 } 2376 } 2377 2378 static int 2379 bdev_nvme_reset_check_qpair_connected(void *ctx) 2380 { 2381 struct nvme_ctrlr_channel *ctrlr_ch = ctx; 2382 struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair; 2383 struct spdk_nvme_qpair *qpair; 2384 2385 if (ctrlr_ch->reset_iter == NULL) { 2386 /* qpair was already failed to connect and the reset sequence is being aborted. */ 2387 assert(ctrlr_ch->connect_poller == NULL); 2388 assert(nvme_qpair->qpair == NULL); 2389 2390 NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, 2391 "qpair was already failed to connect. reset is being aborted.\n"); 2392 return SPDK_POLLER_BUSY; 2393 } 2394 2395 qpair = nvme_qpair->qpair; 2396 assert(qpair != NULL); 2397 2398 if (!spdk_nvme_qpair_is_connected(qpair)) { 2399 return SPDK_POLLER_BUSY; 2400 } 2401 2402 NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n", 2403 qpair, spdk_nvme_qpair_get_id(qpair)); 2404 2405 spdk_poller_unregister(&ctrlr_ch->connect_poller); 2406 2407 /* qpair was completed to connect. Move to the next ctrlr_channel */ 2408 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 2409 ctrlr_ch->reset_iter = NULL; 2410 2411 if (!g_opts.disable_auto_failback) { 2412 _bdev_nvme_clear_io_path_cache(nvme_qpair); 2413 } 2414 2415 return SPDK_POLLER_BUSY; 2416 } 2417 2418 static void 2419 bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i, 2420 struct nvme_ctrlr *nvme_ctrlr, 2421 struct nvme_ctrlr_channel *ctrlr_ch, 2422 void *ctx) 2423 { 2424 struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair; 2425 struct spdk_nvme_qpair *qpair; 2426 int rc = 0; 2427 2428 if (nvme_qpair->qpair == NULL) { 2429 rc = bdev_nvme_create_qpair(nvme_qpair); 2430 } 2431 if (rc == 0) { 2432 ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected, 2433 ctrlr_ch, 0); 2434 2435 qpair = nvme_qpair->qpair; 2436 2437 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking qpair %p:%u to be connected.\n", 2438 qpair, spdk_nvme_qpair_get_id(qpair)); 2439 2440 /* The current full reset sequence will move to the next 2441 * ctrlr_channel after the qpair is actually connected. 2442 */ 2443 assert(ctrlr_ch->reset_iter == NULL); 2444 ctrlr_ch->reset_iter = i; 2445 } else { 2446 nvme_ctrlr_for_each_channel_continue(i, rc); 2447 } 2448 } 2449 2450 static void 2451 nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr) 2452 { 2453 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2454 struct nvme_ns *nvme_ns; 2455 2456 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 2457 nvme_ns != NULL; 2458 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 2459 if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 2460 SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id); 2461 /* NS can be added again. 
Just nullify nvme_ns->ns. */ 2462 nvme_ns->ns = NULL; 2463 } 2464 } 2465 } 2466 2467 2468 static int 2469 bdev_nvme_reconnect_ctrlr_poll(void *arg) 2470 { 2471 struct nvme_ctrlr *nvme_ctrlr = arg; 2472 struct spdk_nvme_transport_id *trid; 2473 int rc = -ETIMEDOUT; 2474 2475 if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) { 2476 /* Mark the ctrlr as failed. The next call to 2477 * spdk_nvme_ctrlr_reconnect_poll_async() will then 2478 * do the necessary cleanup and return failure. 2479 */ 2480 spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr); 2481 } 2482 2483 rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr); 2484 if (rc == -EAGAIN) { 2485 return SPDK_POLLER_BUSY; 2486 } 2487 2488 spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller); 2489 if (rc == 0) { 2490 trid = &nvme_ctrlr->active_path_id->trid; 2491 2492 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 2493 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n", 2494 trid->traddr, trid->trsvcid); 2495 } else { 2496 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n"); 2497 } 2498 2499 nvme_ctrlr_check_namespaces(nvme_ctrlr); 2500 2501 /* Recreate all of the I/O queue pairs */ 2502 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2503 bdev_nvme_reset_create_qpair, 2504 NULL, 2505 bdev_nvme_reset_create_qpairs_done); 2506 } else { 2507 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n"); 2508 2509 bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false); 2510 } 2511 return SPDK_POLLER_BUSY; 2512 } 2513 2514 static void 2515 bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2516 { 2517 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n"); 2518 2519 spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr); 2520 2521 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name); 2522 assert(nvme_ctrlr->reset_detach_poller == NULL); 2523 nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll, 2524 nvme_ctrlr, 0); 2525 } 2526 2527 static void 2528 bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2529 { 2530 SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name); 2531 assert(status == 0); 2532 2533 NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n"); 2534 2535 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2536 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2537 } else { 2538 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr); 2539 } 2540 } 2541 2542 static void 2543 bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2544 { 2545 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n"); 2546 2547 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2548 bdev_nvme_reset_destroy_qpair, 2549 NULL, 2550 bdev_nvme_reset_destroy_qpair_done); 2551 } 2552 2553 static void 2554 bdev_nvme_reconnect_ctrlr_now(void *ctx) 2555 { 2556 struct nvme_ctrlr *nvme_ctrlr = ctx; 2557 2558 assert(nvme_ctrlr->resetting == true); 2559 assert(nvme_ctrlr->thread == spdk_get_thread()); 2560 2561 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2562 2563 spdk_poller_resume(nvme_ctrlr->adminq_timer_poller); 2564 2565 bdev_nvme_reconnect_ctrlr(nvme_ctrlr); 2566 } 2567 2568 static void 2569 _bdev_nvme_reset_ctrlr(void *ctx) 2570 { 2571 struct nvme_ctrlr *nvme_ctrlr = ctx; 2572 2573 assert(nvme_ctrlr->resetting == true); 2574 assert(nvme_ctrlr->thread == spdk_get_thread()); 2575 2576 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2577 nvme_ctrlr_disconnect(nvme_ctrlr, 
bdev_nvme_reset_destroy_qpairs); 2578 } else { 2579 bdev_nvme_reset_destroy_qpairs(nvme_ctrlr); 2580 } 2581 } 2582 2583 static int 2584 bdev_nvme_reset_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, spdk_msg_fn *msg_fn) 2585 { 2586 if (nvme_ctrlr->destruct) { 2587 return -ENXIO; 2588 } 2589 2590 if (nvme_ctrlr->resetting) { 2591 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n"); 2592 return -EBUSY; 2593 } 2594 2595 if (nvme_ctrlr->disabled) { 2596 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n"); 2597 return -EALREADY; 2598 } 2599 2600 nvme_ctrlr->resetting = true; 2601 nvme_ctrlr->dont_retry = true; 2602 2603 if (nvme_ctrlr->reconnect_is_delayed) { 2604 NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 2605 *msg_fn = bdev_nvme_reconnect_ctrlr_now; 2606 nvme_ctrlr->reconnect_is_delayed = false; 2607 } else { 2608 *msg_fn = _bdev_nvme_reset_ctrlr; 2609 assert(nvme_ctrlr->reset_start_tsc == 0); 2610 } 2611 2612 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2613 2614 return 0; 2615 } 2616 2617 static int 2618 bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2619 { 2620 spdk_msg_fn msg_fn; 2621 int rc; 2622 2623 pthread_mutex_lock(&nvme_ctrlr->mutex); 2624 rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn); 2625 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2626 2627 if (rc == 0) { 2628 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2629 } 2630 2631 return rc; 2632 } 2633 2634 static int 2635 bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2636 { 2637 pthread_mutex_lock(&nvme_ctrlr->mutex); 2638 if (nvme_ctrlr->destruct) { 2639 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2640 return -ENXIO; 2641 } 2642 2643 if (nvme_ctrlr->resetting) { 2644 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2645 return -EBUSY; 2646 } 2647 2648 if (!nvme_ctrlr->disabled) { 2649 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2650 return -EALREADY; 2651 } 2652 2653 nvme_ctrlr->disabled = false; 2654 nvme_ctrlr->resetting = true; 2655 2656 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2657 2658 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2659 2660 spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr); 2661 return 0; 2662 } 2663 2664 static void 2665 bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr) 2666 { 2667 bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn; 2668 void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg; 2669 enum bdev_nvme_op_after_reset op_after_disable; 2670 2671 assert(nvme_ctrlr->thread == spdk_get_thread()); 2672 2673 nvme_ctrlr->ctrlr_op_cb_fn = NULL; 2674 nvme_ctrlr->ctrlr_op_cb_arg = NULL; 2675 2676 pthread_mutex_lock(&nvme_ctrlr->mutex); 2677 2678 nvme_ctrlr->resetting = false; 2679 nvme_ctrlr->dont_retry = false; 2680 2681 op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true); 2682 2683 nvme_ctrlr->disabled = true; 2684 spdk_poller_pause(nvme_ctrlr->adminq_timer_poller); 2685 2686 /* Make sure we clear any pending resets before returning. 
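 * These are reset_io requests that _bdev_nvme_reset_io() queued while this
 * controller was busy; they are completed with success here because the
 * controller has reached the intentionally disabled state.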
*/ 2687 bdev_nvme_complete_pending_resets(nvme_ctrlr, true); 2688 2689 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2690 2691 if (ctrlr_op_cb_fn) { 2692 ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0); 2693 } 2694 2695 switch (op_after_disable) { 2696 case OP_COMPLETE_PENDING_DESTRUCT: 2697 nvme_ctrlr_unregister(nvme_ctrlr); 2698 break; 2699 default: 2700 break; 2701 } 2702 } 2703 2704 static void 2705 bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status) 2706 { 2707 assert(status == 0); 2708 2709 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2710 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2711 } else { 2712 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete); 2713 } 2714 } 2715 2716 static void 2717 bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr) 2718 { 2719 nvme_ctrlr_for_each_channel(nvme_ctrlr, 2720 bdev_nvme_reset_destroy_qpair, 2721 NULL, 2722 bdev_nvme_disable_destroy_qpairs_done); 2723 } 2724 2725 static void 2726 _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx) 2727 { 2728 struct nvme_ctrlr *nvme_ctrlr = ctx; 2729 2730 assert(nvme_ctrlr->resetting == true); 2731 assert(nvme_ctrlr->thread == spdk_get_thread()); 2732 2733 spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer); 2734 2735 bdev_nvme_disable_ctrlr_complete(nvme_ctrlr); 2736 } 2737 2738 static void 2739 _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx) 2740 { 2741 struct nvme_ctrlr *nvme_ctrlr = ctx; 2742 2743 assert(nvme_ctrlr->resetting == true); 2744 assert(nvme_ctrlr->thread == spdk_get_thread()); 2745 2746 if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) { 2747 nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs); 2748 } else { 2749 bdev_nvme_disable_destroy_qpairs(nvme_ctrlr); 2750 } 2751 } 2752 2753 static int 2754 bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 2755 { 2756 spdk_msg_fn msg_fn; 2757 2758 pthread_mutex_lock(&nvme_ctrlr->mutex); 2759 if (nvme_ctrlr->destruct) { 2760 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2761 return -ENXIO; 2762 } 2763 2764 if (nvme_ctrlr->resetting) { 2765 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2766 return -EBUSY; 2767 } 2768 2769 if (nvme_ctrlr->disabled) { 2770 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2771 return -EALREADY; 2772 } 2773 2774 nvme_ctrlr->resetting = true; 2775 nvme_ctrlr->dont_retry = true; 2776 2777 if (nvme_ctrlr->reconnect_is_delayed) { 2778 msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr; 2779 nvme_ctrlr->reconnect_is_delayed = false; 2780 } else { 2781 msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr; 2782 } 2783 2784 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 2785 2786 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2787 2788 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 2789 return 0; 2790 } 2791 2792 static int 2793 nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2794 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2795 { 2796 int rc; 2797 2798 switch (op) { 2799 case NVME_CTRLR_OP_RESET: 2800 rc = bdev_nvme_reset_ctrlr(nvme_ctrlr); 2801 break; 2802 case NVME_CTRLR_OP_ENABLE: 2803 rc = bdev_nvme_enable_ctrlr(nvme_ctrlr); 2804 break; 2805 case NVME_CTRLR_OP_DISABLE: 2806 rc = bdev_nvme_disable_ctrlr(nvme_ctrlr); 2807 break; 2808 default: 2809 rc = -EINVAL; 2810 break; 2811 } 2812 2813 if (rc == 0) { 2814 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 2815 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 2816 nvme_ctrlr->ctrlr_op_cb_fn = cb_fn; 2817 nvme_ctrlr->ctrlr_op_cb_arg = cb_arg; 2818 } 2819 return rc; 2820 } 2821 
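/* Illustrative usage sketch for nvme_ctrlr_op(); it is not part of this module's
 * API surface and "example_op_done" is a hypothetical callback that exists only
 * in this comment. The error handling mirrors nvme_ctrlr_op_rpc() below.
 *
 *   static void
 *   example_op_done(void *cb_arg, int rc)
 *   {
 *           SPDK_NOTICELOG("ctrlr op finished, rc=%d\n", rc);
 *   }
 *
 *   rc = nvme_ctrlr_op(nvme_ctrlr, NVME_CTRLR_OP_RESET, example_op_done, NULL);
 *   if (rc == 0) {
 *           return;                   (callback fires when the reset completes)
 *   } else if (rc == -EALREADY) {
 *           rc = 0;                   (controller is disabled; treated as success)
 *   }
 *   example_op_done(NULL, rc);        (complete immediately with rc)
 */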
2822 struct nvme_ctrlr_op_rpc_ctx { 2823 struct nvme_ctrlr *nvme_ctrlr; 2824 struct spdk_thread *orig_thread; 2825 enum nvme_ctrlr_op op; 2826 int rc; 2827 bdev_nvme_ctrlr_op_cb cb_fn; 2828 void *cb_arg; 2829 }; 2830 2831 static void 2832 _nvme_ctrlr_op_rpc_complete(void *_ctx) 2833 { 2834 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2835 2836 assert(ctx != NULL); 2837 assert(ctx->cb_fn != NULL); 2838 2839 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2840 2841 free(ctx); 2842 } 2843 2844 static void 2845 nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc) 2846 { 2847 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2848 2849 ctx->rc = rc; 2850 2851 spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx); 2852 } 2853 2854 void 2855 nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op, 2856 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2857 { 2858 struct nvme_ctrlr_op_rpc_ctx *ctx; 2859 int rc; 2860 2861 assert(cb_fn != NULL); 2862 2863 ctx = calloc(1, sizeof(*ctx)); 2864 if (ctx == NULL) { 2865 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2866 cb_fn(cb_arg, -ENOMEM); 2867 return; 2868 } 2869 2870 ctx->orig_thread = spdk_get_thread(); 2871 ctx->cb_fn = cb_fn; 2872 ctx->cb_arg = cb_arg; 2873 2874 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx); 2875 if (rc == 0) { 2876 return; 2877 } else if (rc == -EALREADY) { 2878 rc = 0; 2879 } 2880 2881 nvme_ctrlr_op_rpc_complete(ctx, rc); 2882 } 2883 2884 static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc); 2885 2886 static void 2887 _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx) 2888 { 2889 struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx; 2890 struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr; 2891 int rc; 2892 2893 prev_nvme_ctrlr = ctx->nvme_ctrlr; 2894 ctx->nvme_ctrlr = NULL; 2895 2896 if (ctx->rc != 0) { 2897 goto complete; 2898 } 2899 2900 next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq); 2901 if (next_nvme_ctrlr == NULL) { 2902 goto complete; 2903 } 2904 2905 rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2906 if (rc == 0) { 2907 ctx->nvme_ctrlr = next_nvme_ctrlr; 2908 return; 2909 } else if (rc == -EALREADY) { 2910 ctx->nvme_ctrlr = next_nvme_ctrlr; 2911 rc = 0; 2912 } 2913 2914 ctx->rc = rc; 2915 2916 complete: 2917 ctx->cb_fn(ctx->cb_arg, ctx->rc); 2918 free(ctx); 2919 } 2920 2921 static void 2922 nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc) 2923 { 2924 struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg; 2925 2926 ctx->rc = rc; 2927 2928 spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx); 2929 } 2930 2931 void 2932 nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op, 2933 bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg) 2934 { 2935 struct nvme_ctrlr_op_rpc_ctx *ctx; 2936 struct nvme_ctrlr *nvme_ctrlr; 2937 int rc; 2938 2939 assert(cb_fn != NULL); 2940 2941 ctx = calloc(1, sizeof(*ctx)); 2942 if (ctx == NULL) { 2943 SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n"); 2944 cb_fn(cb_arg, -ENOMEM); 2945 return; 2946 } 2947 2948 ctx->orig_thread = spdk_get_thread(); 2949 ctx->op = op; 2950 ctx->cb_fn = cb_fn; 2951 ctx->cb_arg = cb_arg; 2952 2953 nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 2954 assert(nvme_ctrlr != NULL); 2955 2956 rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx); 2957 if (rc == 0) { 2958 ctx->nvme_ctrlr = nvme_ctrlr; 2959 return; 2960 } else if (rc == -EALREADY) { 2961 ctx->nvme_ctrlr = nvme_ctrlr; 2962 rc = 0; 2963 } 2964 2965 
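/* Starting the op on the first nvme_ctrlr failed immediately (or returned
 * -EALREADY, remapped to 0 above). Invoke the continue helper directly so the
 * iteration either advances to the next nvme_ctrlr or completes the RPC with
 * the error on the original thread.
 */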
nvme_bdev_ctrlr_op_rpc_continue(ctx, rc); 2966 } 2967 2968 static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio); 2969 2970 static void 2971 bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 2972 { 2973 struct nvme_bdev_io *bio = ctx; 2974 enum spdk_bdev_io_status io_status; 2975 2976 if (bio->cpl.cdw0 == 0) { 2977 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 2978 } else { 2979 io_status = SPDK_BDEV_IO_STATUS_FAILED; 2980 } 2981 2982 NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status); 2983 2984 __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL); 2985 } 2986 2987 static void 2988 bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i, 2989 struct nvme_bdev *nbdev, 2990 struct nvme_bdev_channel *nbdev_ch, void *ctx) 2991 { 2992 bdev_nvme_abort_retry_ios(nbdev_ch); 2993 nbdev_ch->resetting = false; 2994 2995 nvme_bdev_for_each_channel_continue(i, 0); 2996 } 2997 2998 static void 2999 bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio) 3000 { 3001 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3002 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3003 3004 /* Abort all queued I/Os for retry. */ 3005 nvme_bdev_for_each_channel(nbdev, 3006 bdev_nvme_unfreeze_bdev_channel, 3007 bio, 3008 bdev_nvme_unfreeze_bdev_channel_done); 3009 } 3010 3011 static void 3012 _bdev_nvme_reset_io_continue(void *ctx) 3013 { 3014 struct nvme_bdev_io *bio = ctx; 3015 struct nvme_io_path *prev_io_path, *next_io_path; 3016 int rc; 3017 3018 prev_io_path = bio->io_path; 3019 bio->io_path = NULL; 3020 3021 next_io_path = STAILQ_NEXT(prev_io_path, stailq); 3022 if (next_io_path == NULL) { 3023 goto complete; 3024 } 3025 3026 rc = _bdev_nvme_reset_io(next_io_path, bio); 3027 if (rc == 0) { 3028 return; 3029 } 3030 3031 complete: 3032 bdev_nvme_reset_io_complete(bio); 3033 } 3034 3035 static void 3036 bdev_nvme_reset_io_continue(void *cb_arg, int rc) 3037 { 3038 struct nvme_bdev_io *bio = cb_arg; 3039 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3040 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3041 3042 NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc); 3043 3044 /* Reset status is initialized as "failed". Set to "success" once we have at least one 3045 * successfully reset nvme_ctrlr. 3046 */ 3047 if (rc == 0) { 3048 bio->cpl.cdw0 = 0; 3049 } 3050 3051 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio); 3052 } 3053 3054 static int 3055 _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio) 3056 { 3057 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3058 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 3059 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 3060 spdk_msg_fn msg_fn; 3061 int rc; 3062 3063 assert(bio->io_path == NULL); 3064 bio->io_path = io_path; 3065 3066 pthread_mutex_lock(&nvme_ctrlr->mutex); 3067 rc = bdev_nvme_reset_ctrlr_unsafe(nvme_ctrlr, &msg_fn); 3068 if (rc == -EBUSY) { 3069 /* 3070 * Reset call is queued only if it is from the app framework. This is on purpose so that 3071 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 3072 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
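 * The queued bio is drained later by bdev_nvme_complete_pending_resets(), which
 * forwards the outcome of the reset that is already in progress to
 * bdev_nvme_reset_io_continue().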
3073 */ 3074 TAILQ_INSERT_TAIL(&nvme_ctrlr->pending_resets, bio, retry_link); 3075 } 3076 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3077 3078 if (rc == 0) { 3079 assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL); 3080 assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL); 3081 nvme_ctrlr->ctrlr_op_cb_fn = bdev_nvme_reset_io_continue; 3082 nvme_ctrlr->ctrlr_op_cb_arg = bio; 3083 3084 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 3085 3086 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n", 3087 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3088 } else if (rc == -EBUSY) { 3089 rc = 0; 3090 3091 NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n", 3092 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr)); 3093 } else { 3094 NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n", 3095 bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc); 3096 } 3097 3098 return rc; 3099 } 3100 3101 static void 3102 bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status) 3103 { 3104 struct nvme_bdev_io *bio = ctx; 3105 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3106 struct nvme_bdev_channel *nbdev_ch; 3107 struct nvme_io_path *io_path; 3108 int rc; 3109 3110 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 3111 3112 /* Initialize with failed status. With multipath it is enough to have at least one successful 3113 * nvme_ctrlr reset. If there is none, reset status will remain failed. 3114 */ 3115 bio->cpl.cdw0 = 1; 3116 3117 /* Reset all nvme_ctrlrs of a bdev controller sequentially. */ 3118 io_path = STAILQ_FIRST(&nbdev_ch->io_path_list); 3119 assert(io_path != NULL); 3120 3121 rc = _bdev_nvme_reset_io(io_path, bio); 3122 if (rc != 0) { 3123 /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */ 3124 rc = (rc == -EALREADY) ? 0 : rc; 3125 3126 bdev_nvme_reset_io_continue(bio, rc); 3127 } 3128 } 3129 3130 static void 3131 bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i, 3132 struct nvme_bdev *nbdev, 3133 struct nvme_bdev_channel *nbdev_ch, void *ctx) 3134 { 3135 nbdev_ch->resetting = true; 3136 3137 nvme_bdev_for_each_channel_continue(i, 0); 3138 } 3139 3140 static void 3141 bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio) 3142 { 3143 NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio); 3144 3145 nvme_bdev_for_each_channel(nbdev, 3146 bdev_nvme_freeze_bdev_channel, 3147 bio, 3148 bdev_nvme_freeze_bdev_channel_done); 3149 } 3150 3151 static int 3152 bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove) 3153 { 3154 if (nvme_ctrlr->destruct) { 3155 /* Don't bother resetting if the controller is in the process of being destructed. */ 3156 return -ENXIO; 3157 } 3158 3159 if (nvme_ctrlr->resetting) { 3160 if (!nvme_ctrlr->in_failover) { 3161 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 3162 "Reset is already in progress. Defer failover until reset completes.\n"); 3163 3164 /* Defer failover until reset completes. */ 3165 nvme_ctrlr->pending_failover = true; 3166 return -EINPROGRESS; 3167 } else { 3168 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n"); 3169 return -EBUSY; 3170 } 3171 } 3172 3173 bdev_nvme_failover_trid(nvme_ctrlr, remove, true); 3174 3175 if (nvme_ctrlr->reconnect_is_delayed) { 3176 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n"); 3177 3178 /* We rely on the next reconnect for the failover. 
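 * bdev_nvme_failover_trid() above has already advanced active_path_id when an
 * alternate trid exists, so the pending delayed reconnect will pick up the new
 * trid; returning -EALREADY tells callers there is nothing more to schedule.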
*/ 3179 return -EALREADY; 3180 } 3181 3182 if (nvme_ctrlr->disabled) { 3183 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n"); 3184 3185 /* We rely on the enablement for the failover. */ 3186 return -EALREADY; 3187 } 3188 3189 nvme_ctrlr->resetting = true; 3190 nvme_ctrlr->in_failover = true; 3191 3192 assert(nvme_ctrlr->reset_start_tsc == 0); 3193 nvme_ctrlr->reset_start_tsc = spdk_get_ticks(); 3194 3195 return 0; 3196 } 3197 3198 static int 3199 bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr) 3200 { 3201 int rc; 3202 3203 pthread_mutex_lock(&nvme_ctrlr->mutex); 3204 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false); 3205 pthread_mutex_unlock(&nvme_ctrlr->mutex); 3206 3207 if (rc == 0) { 3208 spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr); 3209 } else if (rc == -EALREADY) { 3210 rc = 0; 3211 } 3212 3213 return rc; 3214 } 3215 3216 static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3217 uint64_t num_blocks); 3218 3219 static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, 3220 uint64_t num_blocks); 3221 3222 static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, 3223 uint64_t src_offset_blocks, 3224 uint64_t num_blocks); 3225 3226 static void 3227 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3228 bool success) 3229 { 3230 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3231 int ret; 3232 3233 if (!success) { 3234 ret = -EINVAL; 3235 goto exit; 3236 } 3237 3238 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 3239 ret = -ENXIO; 3240 goto exit; 3241 } 3242 3243 ret = bdev_nvme_readv(bio, 3244 bdev_io->u.bdev.iovs, 3245 bdev_io->u.bdev.iovcnt, 3246 bdev_io->u.bdev.md_buf, 3247 bdev_io->u.bdev.num_blocks, 3248 bdev_io->u.bdev.offset_blocks, 3249 bdev_io->u.bdev.dif_check_flags, 3250 bdev_io->u.bdev.memory_domain, 3251 bdev_io->u.bdev.memory_domain_ctx, 3252 bdev_io->u.bdev.accel_sequence); 3253 3254 exit: 3255 if (spdk_unlikely(ret != 0)) { 3256 bdev_nvme_io_complete(bio, ret); 3257 } 3258 } 3259 3260 static inline void 3261 _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io) 3262 { 3263 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3264 struct spdk_bdev *bdev = bdev_io->bdev; 3265 struct nvme_bdev_io *nbdev_io_to_abort; 3266 int rc = 0; 3267 3268 switch (bdev_io->type) { 3269 case SPDK_BDEV_IO_TYPE_READ: 3270 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 3271 3272 rc = bdev_nvme_readv(nbdev_io, 3273 bdev_io->u.bdev.iovs, 3274 bdev_io->u.bdev.iovcnt, 3275 bdev_io->u.bdev.md_buf, 3276 bdev_io->u.bdev.num_blocks, 3277 bdev_io->u.bdev.offset_blocks, 3278 bdev_io->u.bdev.dif_check_flags, 3279 bdev_io->u.bdev.memory_domain, 3280 bdev_io->u.bdev.memory_domain_ctx, 3281 bdev_io->u.bdev.accel_sequence); 3282 } else { 3283 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 3284 bdev_io->u.bdev.num_blocks * bdev->blocklen); 3285 rc = 0; 3286 } 3287 break; 3288 case SPDK_BDEV_IO_TYPE_WRITE: 3289 rc = bdev_nvme_writev(nbdev_io, 3290 bdev_io->u.bdev.iovs, 3291 bdev_io->u.bdev.iovcnt, 3292 bdev_io->u.bdev.md_buf, 3293 bdev_io->u.bdev.num_blocks, 3294 bdev_io->u.bdev.offset_blocks, 3295 bdev_io->u.bdev.dif_check_flags, 3296 bdev_io->u.bdev.memory_domain, 3297 bdev_io->u.bdev.memory_domain_ctx, 3298 bdev_io->u.bdev.accel_sequence, 3299 bdev_io->u.bdev.nvme_cdw12, 3300 bdev_io->u.bdev.nvme_cdw13); 3301 break; 3302 case 
SPDK_BDEV_IO_TYPE_COMPARE: 3303 rc = bdev_nvme_comparev(nbdev_io, 3304 bdev_io->u.bdev.iovs, 3305 bdev_io->u.bdev.iovcnt, 3306 bdev_io->u.bdev.md_buf, 3307 bdev_io->u.bdev.num_blocks, 3308 bdev_io->u.bdev.offset_blocks, 3309 bdev_io->u.bdev.dif_check_flags); 3310 break; 3311 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3312 rc = bdev_nvme_comparev_and_writev(nbdev_io, 3313 bdev_io->u.bdev.iovs, 3314 bdev_io->u.bdev.iovcnt, 3315 bdev_io->u.bdev.fused_iovs, 3316 bdev_io->u.bdev.fused_iovcnt, 3317 bdev_io->u.bdev.md_buf, 3318 bdev_io->u.bdev.num_blocks, 3319 bdev_io->u.bdev.offset_blocks, 3320 bdev_io->u.bdev.dif_check_flags); 3321 break; 3322 case SPDK_BDEV_IO_TYPE_UNMAP: 3323 rc = bdev_nvme_unmap(nbdev_io, 3324 bdev_io->u.bdev.offset_blocks, 3325 bdev_io->u.bdev.num_blocks); 3326 break; 3327 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3328 rc = bdev_nvme_write_zeroes(nbdev_io, 3329 bdev_io->u.bdev.offset_blocks, 3330 bdev_io->u.bdev.num_blocks); 3331 break; 3332 case SPDK_BDEV_IO_TYPE_RESET: 3333 nbdev_io->io_path = NULL; 3334 bdev_nvme_reset_io(bdev->ctxt, nbdev_io); 3335 return; 3336 3337 case SPDK_BDEV_IO_TYPE_FLUSH: 3338 bdev_nvme_io_complete(nbdev_io, 0); 3339 return; 3340 3341 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3342 rc = bdev_nvme_zone_appendv(nbdev_io, 3343 bdev_io->u.bdev.iovs, 3344 bdev_io->u.bdev.iovcnt, 3345 bdev_io->u.bdev.md_buf, 3346 bdev_io->u.bdev.num_blocks, 3347 bdev_io->u.bdev.offset_blocks, 3348 bdev_io->u.bdev.dif_check_flags); 3349 break; 3350 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3351 rc = bdev_nvme_get_zone_info(nbdev_io, 3352 bdev_io->u.zone_mgmt.zone_id, 3353 bdev_io->u.zone_mgmt.num_zones, 3354 bdev_io->u.zone_mgmt.buf); 3355 break; 3356 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3357 rc = bdev_nvme_zone_management(nbdev_io, 3358 bdev_io->u.zone_mgmt.zone_id, 3359 bdev_io->u.zone_mgmt.zone_action); 3360 break; 3361 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3362 nbdev_io->io_path = NULL; 3363 bdev_nvme_admin_passthru(nbdev_ch, 3364 nbdev_io, 3365 &bdev_io->u.nvme_passthru.cmd, 3366 bdev_io->u.nvme_passthru.buf, 3367 bdev_io->u.nvme_passthru.nbytes); 3368 return; 3369 3370 case SPDK_BDEV_IO_TYPE_NVME_IO: 3371 rc = bdev_nvme_io_passthru(nbdev_io, 3372 &bdev_io->u.nvme_passthru.cmd, 3373 bdev_io->u.nvme_passthru.buf, 3374 bdev_io->u.nvme_passthru.nbytes); 3375 break; 3376 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3377 rc = bdev_nvme_io_passthru_md(nbdev_io, 3378 &bdev_io->u.nvme_passthru.cmd, 3379 bdev_io->u.nvme_passthru.buf, 3380 bdev_io->u.nvme_passthru.nbytes, 3381 bdev_io->u.nvme_passthru.md_buf, 3382 bdev_io->u.nvme_passthru.md_len); 3383 break; 3384 case SPDK_BDEV_IO_TYPE_NVME_IOV_MD: 3385 rc = bdev_nvme_iov_passthru_md(nbdev_io, 3386 &bdev_io->u.nvme_passthru.cmd, 3387 bdev_io->u.nvme_passthru.iovs, 3388 bdev_io->u.nvme_passthru.iovcnt, 3389 bdev_io->u.nvme_passthru.nbytes, 3390 bdev_io->u.nvme_passthru.md_buf, 3391 bdev_io->u.nvme_passthru.md_len); 3392 break; 3393 case SPDK_BDEV_IO_TYPE_ABORT: 3394 nbdev_io->io_path = NULL; 3395 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3396 bdev_nvme_abort(nbdev_ch, 3397 nbdev_io, 3398 nbdev_io_to_abort); 3399 return; 3400 3401 case SPDK_BDEV_IO_TYPE_COPY: 3402 rc = bdev_nvme_copy(nbdev_io, 3403 bdev_io->u.bdev.offset_blocks, 3404 bdev_io->u.bdev.copy.src_offset_blocks, 3405 bdev_io->u.bdev.num_blocks); 3406 break; 3407 default: 3408 rc = -EINVAL; 3409 break; 3410 } 3411 3412 if (spdk_unlikely(rc != 0)) { 3413 bdev_nvme_io_complete(nbdev_io, rc); 3414 } 3415 } 3416 3417 static void 3418 
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 3419 { 3420 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3421 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 3422 3423 if (spdk_likely(nbdev_io->submit_tsc == 0)) { 3424 nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io); 3425 } else { 3426 /* There are cases where submit_tsc != 0, i.e. retry I/O. 3427 * We need to update submit_tsc here. 3428 */ 3429 nbdev_io->submit_tsc = spdk_get_ticks(); 3430 } 3431 3432 spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io); 3433 nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch); 3434 if (spdk_unlikely(!nbdev_io->io_path)) { 3435 if (!bdev_nvme_io_type_is_admin(bdev_io->type)) { 3436 bdev_nvme_io_complete(nbdev_io, -ENXIO); 3437 return; 3438 } 3439 3440 /* Admin commands do not use the optimal I/O path. 3441 * Simply fall through even if it is not found. 3442 */ 3443 } 3444 3445 _bdev_nvme_submit_request(nbdev_ch, bdev_io); 3446 } 3447 3448 static bool 3449 bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi) 3450 { 3451 switch (csi) { 3452 case SPDK_NVME_CSI_NVM: 3453 return true; 3454 case SPDK_NVME_CSI_ZNS: 3455 return true; 3456 default: 3457 return false; 3458 } 3459 } 3460 3461 static bool 3462 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 3463 { 3464 struct nvme_bdev *nbdev = ctx; 3465 struct nvme_ns *nvme_ns; 3466 struct spdk_nvme_ns *ns; 3467 struct spdk_nvme_ctrlr *ctrlr; 3468 const struct spdk_nvme_ctrlr_data *cdata; 3469 3470 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 3471 assert(nvme_ns != NULL); 3472 ns = nvme_ns->ns; 3473 if (ns == NULL) { 3474 return false; 3475 } 3476 3477 if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) { 3478 switch (io_type) { 3479 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3480 case SPDK_BDEV_IO_TYPE_NVME_IO: 3481 return true; 3482 3483 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3484 return spdk_nvme_ns_get_md_size(ns) ? true : false; 3485 3486 default: 3487 return false; 3488 } 3489 } 3490 3491 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3492 3493 switch (io_type) { 3494 case SPDK_BDEV_IO_TYPE_READ: 3495 case SPDK_BDEV_IO_TYPE_WRITE: 3496 case SPDK_BDEV_IO_TYPE_RESET: 3497 case SPDK_BDEV_IO_TYPE_FLUSH: 3498 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 3499 case SPDK_BDEV_IO_TYPE_NVME_IO: 3500 case SPDK_BDEV_IO_TYPE_ABORT: 3501 return true; 3502 3503 case SPDK_BDEV_IO_TYPE_COMPARE: 3504 return spdk_nvme_ns_supports_compare(ns); 3505 3506 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3507 return spdk_nvme_ns_get_md_size(ns) ? 
true : false; 3508 3509 case SPDK_BDEV_IO_TYPE_UNMAP: 3510 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3511 return cdata->oncs.dsm; 3512 3513 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3514 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3515 return cdata->oncs.write_zeroes; 3516 3517 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 3518 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 3519 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 3520 return true; 3521 } 3522 return false; 3523 3524 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 3525 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 3526 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 3527 3528 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 3529 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 3530 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 3531 3532 case SPDK_BDEV_IO_TYPE_COPY: 3533 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3534 return cdata->oncs.copy; 3535 3536 default: 3537 return false; 3538 } 3539 } 3540 3541 static int 3542 nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch) 3543 { 3544 struct nvme_qpair *nvme_qpair; 3545 struct spdk_io_channel *pg_ch; 3546 int rc; 3547 3548 nvme_qpair = calloc(1, sizeof(*nvme_qpair)); 3549 if (!nvme_qpair) { 3550 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n"); 3551 return -1; 3552 } 3553 3554 TAILQ_INIT(&nvme_qpair->io_path_list); 3555 3556 nvme_qpair->ctrlr = nvme_ctrlr; 3557 nvme_qpair->ctrlr_ch = ctrlr_ch; 3558 3559 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 3560 if (!pg_ch) { 3561 free(nvme_qpair); 3562 return -1; 3563 } 3564 3565 nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch); 3566 3567 #ifdef SPDK_CONFIG_VTUNE 3568 nvme_qpair->group->collect_spin_stat = true; 3569 #else 3570 nvme_qpair->group->collect_spin_stat = false; 3571 #endif 3572 3573 if (!nvme_ctrlr->disabled) { 3574 /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will 3575 * be created when it's enabled. 3576 */ 3577 rc = bdev_nvme_create_qpair(nvme_qpair); 3578 if (rc != 0) { 3579 /* nvme_ctrlr can't create IO qpair if connection is down. 3580 * If reconnect_delay_sec is non-zero, creating IO qpair is retried 3581 * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero, 3582 * submitted IO will be queued until IO qpair is successfully created. 3583 * 3584 * Hence, if both are satisfied, ignore the failure. 
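 *
 * Sketch of the check below:
 *
 *   reconnect_delay_sec != 0 && bdev_retry_count != 0  ->  keep the channel; the
 *                                                          qpair is recreated on reconnect
 *   otherwise                                          ->  fail channel creation here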
3585 */ 3586 if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) { 3587 spdk_put_io_channel(pg_ch); 3588 free(nvme_qpair); 3589 return rc; 3590 } 3591 } 3592 } 3593 3594 TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3595 3596 ctrlr_ch->qpair = nvme_qpair; 3597 3598 nvme_ctrlr_get_ref(nvme_ctrlr); 3599 3600 return 0; 3601 } 3602 3603 static int 3604 bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3605 { 3606 struct nvme_ctrlr *nvme_ctrlr = io_device; 3607 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3608 3609 return nvme_qpair_create(nvme_ctrlr, ctrlr_ch); 3610 } 3611 3612 static void 3613 nvme_qpair_delete(struct nvme_qpair *nvme_qpair) 3614 { 3615 struct nvme_io_path *io_path, *next; 3616 3617 assert(nvme_qpair->group != NULL); 3618 3619 TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) { 3620 TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq); 3621 nvme_io_path_free(io_path); 3622 } 3623 3624 TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq); 3625 3626 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group)); 3627 3628 nvme_ctrlr_put_ref(nvme_qpair->ctrlr); 3629 3630 free(nvme_qpair); 3631 } 3632 3633 static void 3634 bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf) 3635 { 3636 struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf; 3637 struct nvme_qpair *nvme_qpair; 3638 3639 nvme_qpair = ctrlr_ch->qpair; 3640 assert(nvme_qpair != NULL); 3641 3642 _bdev_nvme_clear_io_path_cache(nvme_qpair); 3643 3644 if (nvme_qpair->qpair != NULL) { 3645 /* Always try to disconnect the qpair, even if a reset is in progress. 3646 * The qpair may have been created after the reset process started. 3647 */ 3648 spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair); 3649 if (ctrlr_ch->reset_iter) { 3650 /* Skip current ctrlr_channel in a full reset sequence because 3651 * it is being deleted now. 3652 */ 3653 nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0); 3654 } 3655 3656 /* We cannot release a reference to the poll group now. 3657 * The qpair may be disconnected asynchronously later. 3658 * We need to poll it until it is actually disconnected. 3659 * Just detach the qpair from the deleting ctrlr_channel. 
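 * The detached nvme_qpair keeps its poll group reference and is expected to be
 * freed later by nvme_qpair_delete() once the qpair is observed to be
 * disconnected (see the disconnected-qpair handling earlier in this file).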
3660 */ 3661 nvme_qpair->ctrlr_ch = NULL; 3662 } else { 3663 assert(ctrlr_ch->reset_iter == NULL); 3664 3665 nvme_qpair_delete(nvme_qpair); 3666 } 3667 } 3668 3669 static inline struct spdk_io_channel * 3670 bdev_nvme_get_accel_channel(struct nvme_poll_group *group) 3671 { 3672 if (spdk_unlikely(!group->accel_channel)) { 3673 group->accel_channel = spdk_accel_get_io_channel(); 3674 if (!group->accel_channel) { 3675 SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n", 3676 group); 3677 return NULL; 3678 } 3679 } 3680 3681 return group->accel_channel; 3682 } 3683 3684 static void 3685 bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 3686 { 3687 spdk_accel_sequence_finish(seq, cb_fn, cb_arg); 3688 } 3689 3690 static void 3691 bdev_nvme_abort_sequence(void *seq) 3692 { 3693 spdk_accel_sequence_abort(seq); 3694 } 3695 3696 static void 3697 bdev_nvme_reverse_sequence(void *seq) 3698 { 3699 spdk_accel_sequence_reverse(seq); 3700 } 3701 3702 static int 3703 bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt, 3704 struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed, 3705 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3706 { 3707 struct spdk_io_channel *ch; 3708 struct nvme_poll_group *group = ctx; 3709 3710 ch = bdev_nvme_get_accel_channel(group); 3711 if (spdk_unlikely(ch == NULL)) { 3712 return -ENOMEM; 3713 } 3714 3715 return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt, 3716 domain, domain_ctx, seed, cb_fn, cb_arg); 3717 } 3718 3719 static int 3720 bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt, 3721 struct spdk_memory_domain *dst_domain, void *dst_domain_ctx, 3722 struct iovec *src_iovs, uint32_t src_iovcnt, 3723 struct spdk_memory_domain *src_domain, void *src_domain_ctx, 3724 spdk_nvme_accel_step_cb cb_fn, void *cb_arg) 3725 { 3726 struct spdk_io_channel *ch; 3727 struct nvme_poll_group *group = ctx; 3728 3729 ch = bdev_nvme_get_accel_channel(group); 3730 if (spdk_unlikely(ch == NULL)) { 3731 return -ENOMEM; 3732 } 3733 3734 return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch, 3735 dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx, 3736 src_iovs, src_iovcnt, src_domain, src_domain_ctx, 3737 cb_fn, cb_arg); 3738 } 3739 3740 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 3741 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 3742 .append_crc32c = bdev_nvme_append_crc32c, 3743 .append_copy = bdev_nvme_append_copy, 3744 .finish_sequence = bdev_nvme_finish_sequence, 3745 .reverse_sequence = bdev_nvme_reverse_sequence, 3746 .abort_sequence = bdev_nvme_abort_sequence, 3747 }; 3748 3749 static int 3750 bdev_nvme_interrupt_wrapper(void *ctx) 3751 { 3752 int num_events; 3753 struct nvme_poll_group *group = ctx; 3754 3755 num_events = spdk_nvme_poll_group_wait(group->group, bdev_nvme_disconnected_qpair_cb); 3756 if (spdk_unlikely(num_events < 0)) { 3757 bdev_nvme_check_io_qpairs(group); 3758 } 3759 3760 return num_events; 3761 } 3762 3763 static int 3764 bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf) 3765 { 3766 struct nvme_poll_group *group = ctx_buf; 3767 uint64_t period; 3768 int fd; 3769 3770 TAILQ_INIT(&group->qpair_list); 3771 3772 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 3773 if (group->group == NULL) { 3774 return -1; 3775 } 3776 3777 period = spdk_interrupt_mode_is_enabled() ? 
0 : g_opts.nvme_ioq_poll_period_us; 3778 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, period); 3779 3780 if (group->poller == NULL) { 3781 spdk_nvme_poll_group_destroy(group->group); 3782 return -1; 3783 } 3784 3785 if (spdk_interrupt_mode_is_enabled()) { 3786 spdk_poller_register_interrupt(group->poller, NULL, NULL); 3787 3788 fd = spdk_nvme_poll_group_get_fd(group->group); 3789 if (fd < 0) { 3790 spdk_nvme_poll_group_destroy(group->group); 3791 return -1; 3792 } 3793 3794 group->intr = SPDK_INTERRUPT_REGISTER(fd, bdev_nvme_interrupt_wrapper, group); 3795 if (!group->intr) { 3796 spdk_nvme_poll_group_destroy(group->group); 3797 return -1; 3798 } 3799 } 3800 3801 return 0; 3802 } 3803 3804 static void 3805 bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf) 3806 { 3807 struct nvme_poll_group *group = ctx_buf; 3808 3809 assert(TAILQ_EMPTY(&group->qpair_list)); 3810 3811 if (group->accel_channel) { 3812 spdk_put_io_channel(group->accel_channel); 3813 } 3814 3815 if (spdk_interrupt_mode_is_enabled()) { 3816 spdk_interrupt_unregister(&group->intr); 3817 } 3818 3819 spdk_poller_unregister(&group->poller); 3820 if (spdk_nvme_poll_group_destroy(group->group)) { 3821 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 3822 assert(false); 3823 } 3824 } 3825 3826 static struct spdk_io_channel * 3827 bdev_nvme_get_io_channel(void *ctx) 3828 { 3829 struct nvme_bdev *nvme_bdev = ctx; 3830 3831 return spdk_get_io_channel(nvme_bdev); 3832 } 3833 3834 static void * 3835 bdev_nvme_get_module_ctx(void *ctx) 3836 { 3837 struct nvme_bdev *nvme_bdev = ctx; 3838 struct nvme_ns *nvme_ns; 3839 3840 if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) { 3841 return NULL; 3842 } 3843 3844 nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list); 3845 if (!nvme_ns) { 3846 return NULL; 3847 } 3848 3849 return nvme_ns->ns; 3850 } 3851 3852 static const char * 3853 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 3854 { 3855 switch (ana_state) { 3856 case SPDK_NVME_ANA_OPTIMIZED_STATE: 3857 return "optimized"; 3858 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 3859 return "non_optimized"; 3860 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 3861 return "inaccessible"; 3862 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 3863 return "persistent_loss"; 3864 case SPDK_NVME_ANA_CHANGE_STATE: 3865 return "change"; 3866 default: 3867 return NULL; 3868 } 3869 } 3870 3871 static int 3872 bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size) 3873 { 3874 struct spdk_memory_domain **_domains = NULL; 3875 struct nvme_bdev *nbdev = ctx; 3876 struct nvme_ns *nvme_ns; 3877 int i = 0, _array_size = array_size; 3878 int rc = 0; 3879 3880 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 3881 if (domains && array_size >= i) { 3882 _domains = &domains[i]; 3883 } else { 3884 _domains = NULL; 3885 } 3886 rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size); 3887 if (rc > 0) { 3888 i += rc; 3889 if (_array_size >= rc) { 3890 _array_size -= rc; 3891 } else { 3892 _array_size = 0; 3893 } 3894 } else if (rc < 0) { 3895 return rc; 3896 } 3897 } 3898 3899 return i; 3900 } 3901 3902 static const char * 3903 nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr) 3904 { 3905 if (nvme_ctrlr->destruct) { 3906 return "deleting"; 3907 } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) { 3908 return "failed"; 3909 } else if (nvme_ctrlr->resetting) { 3910 return "resetting"; 3911 } else if (nvme_ctrlr->reconnect_is_delayed > 0) { 3912 return 
"reconnect_is_delayed"; 3913 } else if (nvme_ctrlr->disabled) { 3914 return "disabled"; 3915 } else { 3916 return "enabled"; 3917 } 3918 } 3919 3920 void 3921 nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr) 3922 { 3923 struct spdk_nvme_transport_id *trid; 3924 const struct spdk_nvme_ctrlr_opts *opts; 3925 const struct spdk_nvme_ctrlr_data *cdata; 3926 struct nvme_path_id *path_id; 3927 int32_t numa_id; 3928 3929 spdk_json_write_object_begin(w); 3930 3931 spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr)); 3932 3933 #ifdef SPDK_CONFIG_NVME_CUSE 3934 size_t cuse_name_size = 128; 3935 char cuse_name[cuse_name_size]; 3936 3937 int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size); 3938 if (rc == 0) { 3939 spdk_json_write_named_string(w, "cuse_device", cuse_name); 3940 } 3941 #endif 3942 trid = &nvme_ctrlr->active_path_id->trid; 3943 spdk_json_write_named_object_begin(w, "trid"); 3944 nvme_bdev_dump_trid_json(trid, w); 3945 spdk_json_write_object_end(w); 3946 3947 path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link); 3948 if (path_id != NULL) { 3949 spdk_json_write_named_array_begin(w, "alternate_trids"); 3950 do { 3951 trid = &path_id->trid; 3952 spdk_json_write_object_begin(w); 3953 nvme_bdev_dump_trid_json(trid, w); 3954 spdk_json_write_object_end(w); 3955 3956 path_id = TAILQ_NEXT(path_id, link); 3957 } while (path_id != NULL); 3958 spdk_json_write_array_end(w); 3959 } 3960 3961 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 3962 spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 3963 3964 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 3965 spdk_json_write_named_object_begin(w, "host"); 3966 spdk_json_write_named_string(w, "nqn", opts->hostnqn); 3967 spdk_json_write_named_string(w, "addr", opts->src_addr); 3968 spdk_json_write_named_string(w, "svcid", opts->src_svcid); 3969 spdk_json_write_object_end(w); 3970 3971 numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr); 3972 if (numa_id != SPDK_ENV_NUMA_ID_ANY) { 3973 spdk_json_write_named_uint32(w, "numa_id", numa_id); 3974 } 3975 spdk_json_write_object_end(w); 3976 } 3977 3978 static void 3979 nvme_namespace_info_json(struct spdk_json_write_ctx *w, 3980 struct nvme_ns *nvme_ns) 3981 { 3982 struct spdk_nvme_ns *ns; 3983 struct spdk_nvme_ctrlr *ctrlr; 3984 const struct spdk_nvme_ctrlr_data *cdata; 3985 const struct spdk_nvme_transport_id *trid; 3986 union spdk_nvme_vs_register vs; 3987 const struct spdk_nvme_ns_data *nsdata; 3988 char buf[128]; 3989 3990 ns = nvme_ns->ns; 3991 if (ns == NULL) { 3992 return; 3993 } 3994 3995 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3996 3997 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 3998 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 3999 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 4000 4001 spdk_json_write_object_begin(w); 4002 4003 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 4004 spdk_json_write_named_string(w, "pci_address", trid->traddr); 4005 } 4006 4007 spdk_json_write_named_object_begin(w, "trid"); 4008 4009 nvme_bdev_dump_trid_json(trid, w); 4010 4011 spdk_json_write_object_end(w); 4012 4013 #ifdef SPDK_CONFIG_NVME_CUSE 4014 size_t cuse_name_size = 128; 4015 char cuse_name[cuse_name_size]; 4016 4017 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 4018 cuse_name, &cuse_name_size); 4019 if (rc == 0) { 4020 spdk_json_write_named_string(w, "cuse_device", cuse_name); 4021 } 4022 #endif 4023 4024 spdk_json_write_named_object_begin(w, "ctrlr_data"); 4025 4026 
spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid); 4027 4028 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 4029 4030 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 4031 spdk_str_trim(buf); 4032 spdk_json_write_named_string(w, "model_number", buf); 4033 4034 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 4035 spdk_str_trim(buf); 4036 spdk_json_write_named_string(w, "serial_number", buf); 4037 4038 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 4039 spdk_str_trim(buf); 4040 spdk_json_write_named_string(w, "firmware_revision", buf); 4041 4042 if (cdata->subnqn[0] != '\0') { 4043 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 4044 } 4045 4046 spdk_json_write_named_object_begin(w, "oacs"); 4047 4048 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 4049 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 4050 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 4051 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 4052 4053 spdk_json_write_object_end(w); 4054 4055 spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr); 4056 spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting); 4057 4058 spdk_json_write_object_end(w); 4059 4060 spdk_json_write_named_object_begin(w, "vs"); 4061 4062 spdk_json_write_name(w, "nvme_version"); 4063 if (vs.bits.ter) { 4064 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 4065 } else { 4066 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 4067 } 4068 4069 spdk_json_write_object_end(w); 4070 4071 nsdata = spdk_nvme_ns_get_data(ns); 4072 4073 spdk_json_write_named_object_begin(w, "ns_data"); 4074 4075 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 4076 4077 if (cdata->cmic.ana_reporting) { 4078 spdk_json_write_named_string(w, "ana_state", 4079 _nvme_ana_state_str(nvme_ns->ana_state)); 4080 } 4081 4082 spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share); 4083 4084 spdk_json_write_object_end(w); 4085 4086 if (cdata->oacs.security) { 4087 spdk_json_write_named_object_begin(w, "security"); 4088 4089 spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal); 4090 4091 spdk_json_write_object_end(w); 4092 } 4093 4094 spdk_json_write_object_end(w); 4095 } 4096 4097 static const char * 4098 nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev) 4099 { 4100 switch (nbdev->mp_policy) { 4101 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 4102 return "active_passive"; 4103 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 4104 return "active_active"; 4105 default: 4106 assert(false); 4107 return "invalid"; 4108 } 4109 } 4110 4111 static const char * 4112 nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev) 4113 { 4114 switch (nbdev->mp_selector) { 4115 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 4116 return "round_robin"; 4117 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 4118 return "queue_depth"; 4119 default: 4120 assert(false); 4121 return "invalid"; 4122 } 4123 } 4124 4125 static int 4126 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 4127 { 4128 struct nvme_bdev *nvme_bdev = ctx; 4129 struct nvme_ns *nvme_ns; 4130 4131 pthread_mutex_lock(&nvme_bdev->mutex); 4132 spdk_json_write_named_array_begin(w, "nvme"); 4133 TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) { 4134 nvme_namespace_info_json(w, nvme_ns); 4135 } 4136 spdk_json_write_array_end(w); 4137 spdk_json_write_named_string(w, "mp_policy", 
nvme_bdev_get_mp_policy_str(nvme_bdev)); 4138 if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 4139 spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev)); 4140 if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) { 4141 spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io); 4142 } 4143 } 4144 pthread_mutex_unlock(&nvme_bdev->mutex); 4145 4146 return 0; 4147 } 4148 4149 static void 4150 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 4151 { 4152 /* No config per bdev needed */ 4153 } 4154 4155 static uint64_t 4156 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 4157 { 4158 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 4159 struct nvme_io_path *io_path; 4160 struct nvme_poll_group *group; 4161 uint64_t spin_time = 0; 4162 4163 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 4164 group = io_path->qpair->group; 4165 4166 if (!group || !group->collect_spin_stat) { 4167 continue; 4168 } 4169 4170 if (group->end_ticks != 0) { 4171 group->spin_ticks += (group->end_ticks - group->start_ticks); 4172 group->end_ticks = 0; 4173 } 4174 4175 spin_time += group->spin_ticks; 4176 group->start_ticks = 0; 4177 group->spin_ticks = 0; 4178 } 4179 4180 return (spin_time * 1000000ULL) / spdk_get_ticks_hz(); 4181 } 4182 4183 static void 4184 bdev_nvme_reset_device_stat(void *ctx) 4185 { 4186 struct nvme_bdev *nbdev = ctx; 4187 4188 if (nbdev->err_stat != NULL) { 4189 memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat)); 4190 } 4191 } 4192 4193 /* JSON string should be lowercases and underscore delimited string. */ 4194 static void 4195 bdev_nvme_format_nvme_status(char *dst, const char *src) 4196 { 4197 char tmp[256]; 4198 4199 spdk_strcpy_replace(dst, 256, src, " - ", "_"); 4200 spdk_strcpy_replace(tmp, 256, dst, "-", "_"); 4201 spdk_strcpy_replace(dst, 256, tmp, " ", "_"); 4202 spdk_strlwr(dst); 4203 } 4204 4205 static void 4206 bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w) 4207 { 4208 struct nvme_bdev *nbdev = ctx; 4209 struct spdk_nvme_status status = {}; 4210 uint16_t sct, sc; 4211 char status_json[256]; 4212 const char *status_str; 4213 4214 if (nbdev->err_stat == NULL) { 4215 return; 4216 } 4217 4218 spdk_json_write_named_object_begin(w, "nvme_error"); 4219 4220 spdk_json_write_named_object_begin(w, "status_type"); 4221 for (sct = 0; sct < 8; sct++) { 4222 if (nbdev->err_stat->status_type[sct] == 0) { 4223 continue; 4224 } 4225 status.sct = sct; 4226 4227 status_str = spdk_nvme_cpl_get_status_type_string(&status); 4228 assert(status_str != NULL); 4229 bdev_nvme_format_nvme_status(status_json, status_str); 4230 4231 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]); 4232 } 4233 spdk_json_write_object_end(w); 4234 4235 spdk_json_write_named_object_begin(w, "status_code"); 4236 for (sct = 0; sct < 4; sct++) { 4237 status.sct = sct; 4238 for (sc = 0; sc < 256; sc++) { 4239 if (nbdev->err_stat->status[sct][sc] == 0) { 4240 continue; 4241 } 4242 status.sc = sc; 4243 4244 status_str = spdk_nvme_cpl_get_status_string(&status); 4245 assert(status_str != NULL); 4246 bdev_nvme_format_nvme_status(status_json, status_str); 4247 4248 spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]); 4249 } 4250 } 4251 spdk_json_write_object_end(w); 4252 4253 spdk_json_write_object_end(w); 4254 } 4255 4256 static bool 4257 bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type) 4258 { 
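/* Accel sequences are offered only for READ and WRITE I/O, and only when the user enabled allow_accel_sequence and the controller reports SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED. */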
4259 struct nvme_bdev *nbdev = ctx; 4260 struct nvme_ns *nvme_ns; 4261 struct spdk_nvme_ctrlr *ctrlr; 4262 4263 if (!g_opts.allow_accel_sequence) { 4264 return false; 4265 } 4266 4267 switch (type) { 4268 case SPDK_BDEV_IO_TYPE_WRITE: 4269 case SPDK_BDEV_IO_TYPE_READ: 4270 break; 4271 default: 4272 return false; 4273 } 4274 4275 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 4276 assert(nvme_ns != NULL); 4277 4278 ctrlr = nvme_ns->ctrlr->ctrlr; 4279 assert(ctrlr != NULL); 4280 4281 return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED; 4282 } 4283 4284 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 4285 .destruct = bdev_nvme_destruct, 4286 .submit_request = bdev_nvme_submit_request, 4287 .io_type_supported = bdev_nvme_io_type_supported, 4288 .get_io_channel = bdev_nvme_get_io_channel, 4289 .dump_info_json = bdev_nvme_dump_info_json, 4290 .write_config_json = bdev_nvme_write_config_json, 4291 .get_spin_time = bdev_nvme_get_spin_time, 4292 .get_module_ctx = bdev_nvme_get_module_ctx, 4293 .get_memory_domains = bdev_nvme_get_memory_domains, 4294 .accel_sequence_supported = bdev_nvme_accel_sequence_supported, 4295 .reset_device_stat = bdev_nvme_reset_device_stat, 4296 .dump_device_stat_json = bdev_nvme_dump_device_stat_json, 4297 }; 4298 4299 typedef int (*bdev_nvme_parse_ana_log_page_cb)( 4300 const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg); 4301 4302 static int 4303 bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 4304 bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg) 4305 { 4306 struct spdk_nvme_ana_group_descriptor *copied_desc; 4307 uint8_t *orig_desc; 4308 uint32_t i, desc_size, copy_len; 4309 int rc = 0; 4310 4311 if (nvme_ctrlr->ana_log_page == NULL) { 4312 return -EINVAL; 4313 } 4314 4315 copied_desc = nvme_ctrlr->copied_ana_desc; 4316 4317 orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page); 4318 copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page); 4319 4320 for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) { 4321 memcpy(copied_desc, orig_desc, copy_len); 4322 4323 rc = cb_fn(copied_desc, cb_arg); 4324 if (rc != 0) { 4325 break; 4326 } 4327 4328 desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) + 4329 copied_desc->num_of_nsid * sizeof(uint32_t); 4330 orig_desc += desc_size; 4331 copy_len -= desc_size; 4332 } 4333 4334 return rc; 4335 } 4336 4337 static int 4338 nvme_ns_ana_transition_timedout(void *ctx) 4339 { 4340 struct nvme_ns *nvme_ns = ctx; 4341 4342 spdk_poller_unregister(&nvme_ns->anatt_timer); 4343 nvme_ns->ana_transition_timedout = true; 4344 4345 return SPDK_POLLER_BUSY; 4346 } 4347 4348 static void 4349 _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns, 4350 const struct spdk_nvme_ana_group_descriptor *desc) 4351 { 4352 const struct spdk_nvme_ctrlr_data *cdata; 4353 4354 nvme_ns->ana_group_id = desc->ana_group_id; 4355 nvme_ns->ana_state = desc->ana_state; 4356 nvme_ns->ana_state_updating = false; 4357 4358 switch (nvme_ns->ana_state) { 4359 case SPDK_NVME_ANA_OPTIMIZED_STATE: 4360 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 4361 nvme_ns->ana_transition_timedout = false; 4362 spdk_poller_unregister(&nvme_ns->anatt_timer); 4363 break; 4364 4365 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 4366 case SPDK_NVME_ANA_CHANGE_STATE: 4367 if (nvme_ns->anatt_timer != NULL) { 4368 break; 4369 } 4370 4371 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 4372 nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout, 
4373 nvme_ns, 4374 cdata->anatt * SPDK_SEC_TO_USEC); 4375 break; 4376 default: 4377 break; 4378 } 4379 } 4380 4381 static int 4382 nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg) 4383 { 4384 struct nvme_ns *nvme_ns = cb_arg; 4385 uint32_t i; 4386 4387 assert(nvme_ns->ns != NULL); 4388 4389 for (i = 0; i < desc->num_of_nsid; i++) { 4390 if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) { 4391 continue; 4392 } 4393 4394 _nvme_ns_set_ana_state(nvme_ns, desc); 4395 return 1; 4396 } 4397 4398 return 0; 4399 } 4400 4401 static int 4402 nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid) 4403 { 4404 int rc = 0; 4405 struct spdk_uuid new_uuid, namespace_uuid; 4406 char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'}; 4407 /* This namespace UUID was generated using uuid_generate() method. */ 4408 const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"}; 4409 int size; 4410 4411 assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN); 4412 4413 spdk_uuid_set_null(&new_uuid); 4414 spdk_uuid_set_null(&namespace_uuid); 4415 4416 size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid); 4417 if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) { 4418 return -EINVAL; 4419 } 4420 4421 spdk_uuid_parse(&namespace_uuid, namespace_str); 4422 4423 rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size); 4424 if (rc == 0) { 4425 memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid)); 4426 } 4427 4428 return rc; 4429 } 4430 4431 static int 4432 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 4433 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 4434 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx) 4435 { 4436 const struct spdk_uuid *uuid; 4437 const uint8_t *nguid; 4438 const struct spdk_nvme_ctrlr_data *cdata; 4439 const struct spdk_nvme_ns_data *nsdata; 4440 const struct spdk_nvme_ctrlr_opts *opts; 4441 enum spdk_nvme_csi csi; 4442 uint32_t atomic_bs, phys_bs, bs; 4443 char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'}; 4444 int rc; 4445 4446 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 4447 csi = spdk_nvme_ns_get_csi(ns); 4448 opts = spdk_nvme_ctrlr_get_opts(ctrlr); 4449 4450 switch (csi) { 4451 case SPDK_NVME_CSI_NVM: 4452 disk->product_name = "NVMe disk"; 4453 break; 4454 case SPDK_NVME_CSI_ZNS: 4455 disk->product_name = "NVMe ZNS disk"; 4456 disk->zoned = true; 4457 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 4458 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 4459 spdk_nvme_ns_get_extended_sector_size(ns); 4460 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 4461 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 4462 break; 4463 default: 4464 if (bdev_opts->allow_unrecognized_csi) { 4465 disk->product_name = "NVMe Passthrough disk"; 4466 break; 4467 } 4468 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 4469 return -ENOTSUP; 4470 } 4471 4472 nguid = spdk_nvme_ns_get_nguid(ns); 4473 if (!nguid) { 4474 uuid = spdk_nvme_ns_get_uuid(ns); 4475 if (uuid) { 4476 disk->uuid = *uuid; 4477 } else if (g_opts.generate_uuids) { 4478 spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0'); 4479 rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid); 4480 if (rc < 0) { 4481 SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc)); 4482 return rc; 4483 } 4484 } 4485 } else { 4486 memcpy(&disk->uuid, nguid, sizeof(disk->uuid)); 4487 } 4488 4489 disk->name = 
spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 4490 if (!disk->name) { 4491 return -ENOMEM; 4492 } 4493 4494 disk->write_cache = 0; 4495 if (cdata->vwc.present) { 4496 /* Enable if the Volatile Write Cache exists */ 4497 disk->write_cache = 1; 4498 } 4499 if (cdata->oncs.write_zeroes) { 4500 disk->max_write_zeroes = UINT16_MAX + 1; 4501 } 4502 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 4503 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 4504 disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr); 4505 disk->ctratt.raw = cdata->ctratt.raw; 4506 disk->nsid = spdk_nvme_ns_get_id(ns); 4507 /* NVMe driver will split one request into multiple requests 4508 * based on MDTS and stripe boundary, the bdev layer will use 4509 * max_segment_size and max_num_segments to split one big IO 4510 * into multiple requests, then small request can't run out 4511 * of NVMe internal requests data structure. 4512 */ 4513 if (opts && opts->io_queue_requests) { 4514 disk->max_num_segments = opts->io_queue_requests / 2; 4515 } 4516 if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) { 4517 /* The nvme driver will try to split I/O that have too many 4518 * SGEs, but it doesn't work if that last SGE doesn't end on 4519 * an aggregate total that is block aligned. The bdev layer has 4520 * a more robust splitting framework, so use that instead for 4521 * this case. (See issue #3269.) 4522 */ 4523 uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr); 4524 4525 if (disk->max_num_segments == 0) { 4526 disk->max_num_segments = max_sges; 4527 } else { 4528 disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges); 4529 } 4530 } 4531 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 4532 4533 nsdata = spdk_nvme_ns_get_data(ns); 4534 bs = spdk_nvme_ns_get_sector_size(ns); 4535 atomic_bs = bs; 4536 phys_bs = bs; 4537 if (nsdata->nabo == 0) { 4538 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 4539 atomic_bs = bs * (1 + nsdata->nawupf); 4540 } else { 4541 atomic_bs = bs * (1 + cdata->awupf); 4542 } 4543 } 4544 if (nsdata->nsfeat.optperf) { 4545 phys_bs = bs * (1 + nsdata->npwg); 4546 } 4547 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 4548 4549 disk->md_len = spdk_nvme_ns_get_md_size(ns); 4550 if (disk->md_len != 0) { 4551 disk->md_interleave = nsdata->flbas.extended; 4552 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 4553 if (disk->dif_type != SPDK_DIF_DISABLE) { 4554 disk->dif_is_head_of_md = nsdata->dps.md_start; 4555 disk->dif_check_flags = bdev_opts->prchk_flags; 4556 disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns); 4557 } 4558 } 4559 4560 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 4561 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 4562 disk->acwu = 0; 4563 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 4564 disk->acwu = nsdata->nacwu + 1; /* 0-based */ 4565 } else { 4566 disk->acwu = cdata->acwu + 1; /* 0-based */ 4567 } 4568 4569 if (cdata->oncs.copy) { 4570 /* For now bdev interface allows only single segment copy */ 4571 disk->max_copy = nsdata->mssrl; 4572 } 4573 4574 disk->ctxt = ctx; 4575 disk->fn_table = &nvmelib_fn_table; 4576 disk->module = &nvme_if; 4577 4578 disk->numa.id_valid = 1; 4579 disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr); 4580 4581 return 0; 4582 } 4583 4584 static struct nvme_bdev * 4585 nvme_bdev_alloc(void) 4586 { 4587 struct nvme_bdev *bdev; 4588 int rc; 4589 4590 bdev = calloc(1, sizeof(*bdev)); 4591 if (!bdev) { 4592 
SPDK_ERRLOG("bdev calloc() failed\n"); 4593 return NULL; 4594 } 4595 4596 if (g_opts.nvme_error_stat) { 4597 bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat)); 4598 if (!bdev->err_stat) { 4599 SPDK_ERRLOG("err_stat calloc() failed\n"); 4600 free(bdev); 4601 return NULL; 4602 } 4603 } 4604 4605 rc = pthread_mutex_init(&bdev->mutex, NULL); 4606 if (rc != 0) { 4607 free(bdev->err_stat); 4608 free(bdev); 4609 return NULL; 4610 } 4611 4612 bdev->ref = 1; 4613 bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE; 4614 bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN; 4615 bdev->rr_min_io = UINT32_MAX; 4616 TAILQ_INIT(&bdev->nvme_ns_list); 4617 4618 return bdev; 4619 } 4620 4621 static int 4622 nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4623 { 4624 struct nvme_bdev *bdev; 4625 struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr; 4626 int rc; 4627 4628 bdev = nvme_bdev_alloc(); 4629 if (bdev == NULL) { 4630 SPDK_ERRLOG("Failed to allocate NVMe bdev\n"); 4631 return -ENOMEM; 4632 } 4633 4634 bdev->opal = nvme_ctrlr->opal_dev != NULL; 4635 4636 rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr, 4637 nvme_ns->ns, &nvme_ctrlr->opts, bdev); 4638 if (rc != 0) { 4639 SPDK_ERRLOG("Failed to create NVMe disk\n"); 4640 nvme_bdev_free(bdev); 4641 return rc; 4642 } 4643 4644 spdk_io_device_register(bdev, 4645 bdev_nvme_create_bdev_channel_cb, 4646 bdev_nvme_destroy_bdev_channel_cb, 4647 sizeof(struct nvme_bdev_channel), 4648 bdev->disk.name); 4649 4650 nvme_ns->bdev = bdev; 4651 bdev->nsid = nvme_ns->id; 4652 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4653 4654 bdev->nbdev_ctrlr = nbdev_ctrlr; 4655 TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq); 4656 4657 rc = spdk_bdev_register(&bdev->disk); 4658 if (rc != 0) { 4659 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 4660 spdk_io_device_unregister(bdev, NULL); 4661 nvme_ns->bdev = NULL; 4662 TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq); 4663 nvme_bdev_free(bdev); 4664 return rc; 4665 } 4666 4667 return 0; 4668 } 4669 4670 static bool 4671 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 4672 { 4673 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 4674 const struct spdk_uuid *uuid1, *uuid2; 4675 4676 nsdata1 = spdk_nvme_ns_get_data(ns1); 4677 nsdata2 = spdk_nvme_ns_get_data(ns2); 4678 uuid1 = spdk_nvme_ns_get_uuid(ns1); 4679 uuid2 = spdk_nvme_ns_get_uuid(ns2); 4680 4681 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 4682 nsdata1->eui64 == nsdata2->eui64 && 4683 ((uuid1 == NULL && uuid2 == NULL) || 4684 (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) && 4685 spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2); 4686 } 4687 4688 static bool 4689 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 4690 struct spdk_nvme_ctrlr_opts *opts) 4691 { 4692 struct nvme_probe_skip_entry *entry; 4693 4694 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 4695 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 4696 return false; 4697 } 4698 } 4699 4700 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 4701 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 4702 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 4703 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 4704 opts->disable_read_ana_log_page = true; 4705 4706 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 4707 4708 return true; 
4709 } 4710 4711 static void 4712 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 4713 { 4714 struct nvme_ctrlr *nvme_ctrlr = ctx; 4715 4716 if (spdk_nvme_cpl_is_error(cpl)) { 4717 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. sc is %u, sct is %u.\n", 4718 cpl->status.sc, cpl->status.sct); 4719 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4720 } else if (cpl->cdw0 & 0x1) { 4721 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n"); 4722 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4723 } 4724 } 4725 4726 static void 4727 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 4728 struct spdk_nvme_qpair *qpair, uint16_t cid) 4729 { 4730 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 4731 union spdk_nvme_csts_register csts; 4732 int rc; 4733 4734 assert(nvme_ctrlr->ctrlr == ctrlr); 4735 4736 NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", 4737 ctrlr, qpair, cid); 4738 4739 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 4740 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 4741 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 4742 * completion recursively. 4743 */ 4744 if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 4745 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 4746 if (csts.bits.cfs) { 4747 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n"); 4748 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4749 return; 4750 } 4751 } 4752 4753 switch (g_opts.action_on_timeout) { 4754 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 4755 if (qpair) { 4756 /* Don't send abort to ctrlr when ctrlr is not available. */ 4757 pthread_mutex_lock(&nvme_ctrlr->mutex); 4758 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 4759 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4760 NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n"); 4761 return; 4762 } 4763 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4764 4765 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 4766 nvme_abort_cpl, nvme_ctrlr); 4767 if (rc == 0) { 4768 return; 4769 } 4770 4771 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. 
Resetting, rc is %d.\n", rc); 4772 } 4773 4774 /* FALLTHROUGH */ 4775 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 4776 bdev_nvme_reset_ctrlr(nvme_ctrlr); 4777 break; 4778 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 4779 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n"); 4780 break; 4781 default: 4782 NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n"); 4783 break; 4784 } 4785 } 4786 4787 static struct nvme_ns * 4788 nvme_ns_alloc(void) 4789 { 4790 struct nvme_ns *nvme_ns; 4791 4792 nvme_ns = calloc(1, sizeof(struct nvme_ns)); 4793 if (nvme_ns == NULL) { 4794 return NULL; 4795 } 4796 4797 if (g_opts.io_path_stat) { 4798 nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); 4799 if (nvme_ns->stat == NULL) { 4800 free(nvme_ns); 4801 return NULL; 4802 } 4803 spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN); 4804 } 4805 4806 return nvme_ns; 4807 } 4808 4809 static void 4810 nvme_ns_free(struct nvme_ns *nvme_ns) 4811 { 4812 free(nvme_ns->stat); 4813 free(nvme_ns); 4814 } 4815 4816 static void 4817 nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc) 4818 { 4819 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4820 struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx; 4821 4822 if (rc == 0) { 4823 nvme_ns->probe_ctx = NULL; 4824 nvme_ctrlr_get_ref(nvme_ctrlr); 4825 } else { 4826 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4827 nvme_ns_free(nvme_ns); 4828 } 4829 4830 if (ctx) { 4831 ctx->populates_in_progress--; 4832 if (ctx->populates_in_progress == 0) { 4833 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 4834 } 4835 } 4836 } 4837 4838 static void 4839 bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i, 4840 struct nvme_bdev *nbdev, 4841 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4842 { 4843 struct nvme_ns *nvme_ns = ctx; 4844 int rc; 4845 4846 rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns); 4847 if (rc != 0) { 4848 SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n"); 4849 } 4850 4851 nvme_bdev_for_each_channel_continue(i, rc); 4852 } 4853 4854 static void 4855 bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i, 4856 struct nvme_bdev *nbdev, 4857 struct nvme_bdev_channel *nbdev_ch, void *ctx) 4858 { 4859 struct nvme_ns *nvme_ns = ctx; 4860 struct nvme_io_path *io_path; 4861 4862 io_path = _bdev_nvme_get_io_path(nbdev_ch, nvme_ns); 4863 if (io_path != NULL) { 4864 _bdev_nvme_delete_io_path(nbdev_ch, io_path); 4865 } 4866 4867 nvme_bdev_for_each_channel_continue(i, 0); 4868 } 4869 4870 static void 4871 bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status) 4872 { 4873 struct nvme_ns *nvme_ns = ctx; 4874 4875 nvme_ctrlr_populate_namespace_done(nvme_ns, -1); 4876 } 4877 4878 static void 4879 bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4880 { 4881 struct nvme_ns *nvme_ns = ctx; 4882 4883 if (status == 0) { 4884 nvme_ctrlr_populate_namespace_done(nvme_ns, 0); 4885 } else { 4886 /* Delete the added io_paths and fail populating the namespace. 
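Walk every nvme_bdev_channel again to drop the io_path that was just added, then report the failure via bdev_nvme_add_io_path_failed().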
*/ 4887 nvme_bdev_for_each_channel(nbdev, 4888 bdev_nvme_delete_io_path, 4889 nvme_ns, 4890 bdev_nvme_add_io_path_failed); 4891 } 4892 } 4893 4894 static int 4895 nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns) 4896 { 4897 struct nvme_ns *tmp_ns; 4898 const struct spdk_nvme_ns_data *nsdata; 4899 4900 nsdata = spdk_nvme_ns_get_data(nvme_ns->ns); 4901 if (!nsdata->nmic.can_share) { 4902 SPDK_ERRLOG("Namespace cannot be shared.\n"); 4903 return -EINVAL; 4904 } 4905 4906 pthread_mutex_lock(&bdev->mutex); 4907 4908 tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list); 4909 assert(tmp_ns != NULL); 4910 4911 if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) { 4912 pthread_mutex_unlock(&bdev->mutex); 4913 SPDK_ERRLOG("Namespaces are not identical.\n"); 4914 return -EINVAL; 4915 } 4916 4917 bdev->ref++; 4918 TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq); 4919 nvme_ns->bdev = bdev; 4920 4921 pthread_mutex_unlock(&bdev->mutex); 4922 4923 /* Add nvme_io_path to nvme_bdev_channels dynamically. */ 4924 nvme_bdev_for_each_channel(bdev, 4925 bdev_nvme_add_io_path, 4926 nvme_ns, 4927 bdev_nvme_add_io_path_done); 4928 4929 return 0; 4930 } 4931 4932 static void 4933 nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4934 { 4935 struct spdk_nvme_ns *ns; 4936 struct nvme_bdev *bdev; 4937 int rc = 0; 4938 4939 ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id); 4940 if (!ns) { 4941 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id); 4942 rc = -EINVAL; 4943 goto done; 4944 } 4945 4946 nvme_ns->ns = ns; 4947 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 4948 4949 if (nvme_ctrlr->ana_log_page != NULL) { 4950 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns); 4951 } 4952 4953 bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id); 4954 if (bdev == NULL) { 4955 rc = nvme_bdev_create(nvme_ctrlr, nvme_ns); 4956 } else { 4957 rc = nvme_bdev_add_ns(bdev, nvme_ns); 4958 if (rc == 0) { 4959 return; 4960 } 4961 } 4962 done: 4963 nvme_ctrlr_populate_namespace_done(nvme_ns, rc); 4964 } 4965 4966 static void 4967 nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns) 4968 { 4969 struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr; 4970 4971 assert(nvme_ctrlr != NULL); 4972 4973 pthread_mutex_lock(&nvme_ctrlr->mutex); 4974 4975 RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 4976 4977 if (nvme_ns->bdev != NULL) { 4978 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4979 return; 4980 } 4981 4982 nvme_ns_free(nvme_ns); 4983 pthread_mutex_unlock(&nvme_ctrlr->mutex); 4984 4985 nvme_ctrlr_put_ref(nvme_ctrlr); 4986 } 4987 4988 static void 4989 bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status) 4990 { 4991 struct nvme_ns *nvme_ns = ctx; 4992 4993 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 4994 } 4995 4996 static void 4997 nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns) 4998 { 4999 struct nvme_bdev *bdev; 5000 5001 spdk_poller_unregister(&nvme_ns->anatt_timer); 5002 5003 bdev = nvme_ns->bdev; 5004 if (bdev != NULL) { 5005 pthread_mutex_lock(&bdev->mutex); 5006 5007 assert(bdev->ref > 0); 5008 bdev->ref--; 5009 if (bdev->ref == 0) { 5010 pthread_mutex_unlock(&bdev->mutex); 5011 5012 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 5013 } else { 5014 /* spdk_bdev_unregister() is not called until the last nvme_ns is 5015 * depopulated. 
Hence we need to remove nvme_ns from bdev->nvme_ns_list 5016 * and clear nvme_ns->bdev here. 5017 */ 5018 TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq); 5019 nvme_ns->bdev = NULL; 5020 5021 pthread_mutex_unlock(&bdev->mutex); 5022 5023 /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that, 5024 * we call depopulate_namespace_done() to avoid use-after-free. 5025 */ 5026 nvme_bdev_for_each_channel(bdev, 5027 bdev_nvme_delete_io_path, 5028 nvme_ns, 5029 bdev_nvme_delete_io_path_done); 5030 return; 5031 } 5032 } 5033 5034 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 5035 } 5036 5037 static void 5038 nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr, 5039 struct nvme_async_probe_ctx *ctx) 5040 { 5041 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5042 struct nvme_ns *nvme_ns, *next; 5043 struct spdk_nvme_ns *ns; 5044 struct nvme_bdev *bdev; 5045 uint32_t nsid; 5046 int rc; 5047 uint64_t num_sectors; 5048 5049 if (ctx) { 5050 /* Initialize this count to 1 to handle the populate functions 5051 * calling nvme_ctrlr_populate_namespace_done() immediately. 5052 */ 5053 ctx->populates_in_progress = 1; 5054 } 5055 5056 /* First loop over our existing namespaces and see if they have been 5057 * removed. */ 5058 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5059 while (nvme_ns != NULL) { 5060 next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 5061 5062 if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) { 5063 /* NS is still there or added again. Its attributes may have changed. */ 5064 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 5065 if (nvme_ns->ns != ns) { 5066 assert(nvme_ns->ns == NULL); 5067 nvme_ns->ns = ns; 5068 NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id); 5069 } 5070 5071 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 5072 bdev = nvme_ns->bdev; 5073 assert(bdev != NULL); 5074 if (bdev->disk.blockcnt != num_sectors) { 5075 NVME_CTRLR_NOTICELOG(nvme_ctrlr, 5076 "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 5077 nvme_ns->id, 5078 bdev->disk.name, 5079 bdev->disk.blockcnt, 5080 num_sectors); 5081 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 5082 if (rc != 0) { 5083 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5084 "Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 5085 bdev->disk.name, rc); 5086 } 5087 } 5088 } else { 5089 /* Namespace was removed */ 5090 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5091 } 5092 5093 nvme_ns = next; 5094 } 5095 5096 /* Loop through all of the namespaces at the nvme level and see if any of them are new */ 5097 nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5098 while (nsid != 0) { 5099 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5100 5101 if (nvme_ns == NULL) { 5102 /* Found a new one */ 5103 nvme_ns = nvme_ns_alloc(); 5104 if (nvme_ns == NULL) { 5105 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n"); 5106 /* This just fails to attach the namespace. It may work on a future attempt. 
*/ 5107 continue; 5108 } 5109 5110 nvme_ns->id = nsid; 5111 nvme_ns->ctrlr = nvme_ctrlr; 5112 5113 nvme_ns->bdev = NULL; 5114 5115 if (ctx) { 5116 ctx->populates_in_progress++; 5117 } 5118 nvme_ns->probe_ctx = ctx; 5119 5120 RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns); 5121 5122 nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns); 5123 } 5124 5125 nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid); 5126 } 5127 5128 if (ctx) { 5129 /* Decrement this count now that the loop is over to account 5130 * for the one we started with. If the count is then 0, we 5131 * know any populate_namespace functions completed immediately, 5132 * so we'll kick the callback here. 5133 */ 5134 ctx->populates_in_progress--; 5135 if (ctx->populates_in_progress == 0) { 5136 nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx); 5137 } 5138 } 5139 5140 } 5141 5142 static void 5143 nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr) 5144 { 5145 struct nvme_ns *nvme_ns, *tmp; 5146 5147 RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) { 5148 nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns); 5149 } 5150 } 5151 5152 static uint32_t 5153 nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr) 5154 { 5155 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5156 const struct spdk_nvme_ctrlr_data *cdata; 5157 uint32_t nsid, ns_count = 0; 5158 5159 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5160 5161 for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); 5162 nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) { 5163 ns_count++; 5164 } 5165 5166 return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5167 sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count * 5168 sizeof(uint32_t); 5169 } 5170 5171 static int 5172 nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc, 5173 void *cb_arg) 5174 { 5175 struct nvme_ctrlr *nvme_ctrlr = cb_arg; 5176 struct nvme_ns *nvme_ns; 5177 uint32_t i, nsid; 5178 5179 for (i = 0; i < desc->num_of_nsid; i++) { 5180 nsid = desc->nsid[i]; 5181 if (nsid == 0) { 5182 continue; 5183 } 5184 5185 nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid); 5186 5187 if (nvme_ns == NULL) { 5188 /* Target told us that an inactive namespace had an ANA change */ 5189 continue; 5190 } 5191 5192 _nvme_ns_set_ana_state(nvme_ns, desc); 5193 } 5194 5195 return 0; 5196 } 5197 5198 static void 5199 bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5200 { 5201 struct nvme_ns *nvme_ns; 5202 5203 spdk_free(nvme_ctrlr->ana_log_page); 5204 nvme_ctrlr->ana_log_page = NULL; 5205 5206 for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 5207 nvme_ns != NULL; 5208 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) { 5209 nvme_ns->ana_state_updating = false; 5210 nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE; 5211 } 5212 } 5213 5214 static void 5215 nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl) 5216 { 5217 struct nvme_ctrlr *nvme_ctrlr = ctx; 5218 5219 if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) { 5220 bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states, 5221 nvme_ctrlr); 5222 } else { 5223 bdev_nvme_disable_read_ana_log_page(nvme_ctrlr); 5224 } 5225 5226 pthread_mutex_lock(&nvme_ctrlr->mutex); 5227 5228 assert(nvme_ctrlr->ana_log_page_updating == true); 5229 nvme_ctrlr->ana_log_page_updating = false; 5230 5231 if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) { 5232 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5233 5234 
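/* nvme_ctrlr_can_be_unregistered() returned true, so finish unregistering the controller outside the mutex. */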
nvme_ctrlr_unregister(nvme_ctrlr); 5235 } else { 5236 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5237 5238 bdev_nvme_clear_io_path_caches(nvme_ctrlr); 5239 } 5240 } 5241 5242 static int 5243 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 5244 { 5245 uint32_t ana_log_page_size; 5246 int rc; 5247 5248 if (nvme_ctrlr->ana_log_page == NULL) { 5249 return -EINVAL; 5250 } 5251 5252 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5253 5254 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5255 NVME_CTRLR_ERRLOG(nvme_ctrlr, 5256 "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5257 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5258 return -EINVAL; 5259 } 5260 5261 pthread_mutex_lock(&nvme_ctrlr->mutex); 5262 if (!nvme_ctrlr_is_available(nvme_ctrlr) || 5263 nvme_ctrlr->ana_log_page_updating) { 5264 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5265 return -EBUSY; 5266 } 5267 5268 nvme_ctrlr->ana_log_page_updating = true; 5269 pthread_mutex_unlock(&nvme_ctrlr->mutex); 5270 5271 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 5272 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5273 SPDK_NVME_GLOBAL_NS_TAG, 5274 nvme_ctrlr->ana_log_page, 5275 ana_log_page_size, 0, 5276 nvme_ctrlr_read_ana_log_page_done, 5277 nvme_ctrlr); 5278 if (rc != 0) { 5279 nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL); 5280 } 5281 5282 return rc; 5283 } 5284 5285 static void 5286 dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5287 { 5288 } 5289 5290 struct bdev_nvme_set_preferred_path_ctx { 5291 struct spdk_bdev_desc *desc; 5292 struct nvme_ns *nvme_ns; 5293 bdev_nvme_set_preferred_path_cb cb_fn; 5294 void *cb_arg; 5295 }; 5296 5297 static void 5298 bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5299 { 5300 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5301 5302 assert(ctx != NULL); 5303 assert(ctx->desc != NULL); 5304 assert(ctx->cb_fn != NULL); 5305 5306 spdk_bdev_close(ctx->desc); 5307 5308 ctx->cb_fn(ctx->cb_arg, status); 5309 5310 free(ctx); 5311 } 5312 5313 static void 5314 _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i, 5315 struct nvme_bdev *nbdev, 5316 struct nvme_bdev_channel *nbdev_ch, void *_ctx) 5317 { 5318 struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx; 5319 struct nvme_io_path *io_path, *prev; 5320 5321 prev = NULL; 5322 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 5323 if (io_path->nvme_ns == ctx->nvme_ns) { 5324 break; 5325 } 5326 prev = io_path; 5327 } 5328 5329 if (io_path != NULL) { 5330 if (prev != NULL) { 5331 STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq); 5332 STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq); 5333 } 5334 5335 /* We can set io_path to nbdev_ch->current_io_path directly here. 5336 * However, it needs to be conditional. To simplify the code, 5337 * just clear nbdev_ch->current_io_path and let find_io_path() 5338 * fill it. 5339 * 5340 * Automatic failback may be disabled. Hence even if the io_path is 5341 * already at the head, clear nbdev_ch->current_io_path. 
5342 */ 5343 bdev_nvme_clear_current_io_path(nbdev_ch); 5344 } 5345 5346 nvme_bdev_for_each_channel_continue(i, 0); 5347 } 5348 5349 static struct nvme_ns * 5350 bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid) 5351 { 5352 struct nvme_ns *nvme_ns, *prev; 5353 const struct spdk_nvme_ctrlr_data *cdata; 5354 5355 prev = NULL; 5356 TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) { 5357 cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr); 5358 5359 if (cdata->cntlid == cntlid) { 5360 break; 5361 } 5362 prev = nvme_ns; 5363 } 5364 5365 if (nvme_ns != NULL && prev != NULL) { 5366 TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq); 5367 TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq); 5368 } 5369 5370 return nvme_ns; 5371 } 5372 5373 /* This function supports only multipath mode. There is only a single I/O path 5374 * for each NVMe-oF controller. Hence, just move the matched I/O path to the 5375 * head of the I/O path list for each NVMe bdev channel. 5376 * 5377 * NVMe bdev channel may be acquired after completing this function. move the 5378 * matched namespace to the head of the namespace list for the NVMe bdev too. 5379 */ 5380 void 5381 bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid, 5382 bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg) 5383 { 5384 struct bdev_nvme_set_preferred_path_ctx *ctx; 5385 struct spdk_bdev *bdev; 5386 struct nvme_bdev *nbdev; 5387 int rc = 0; 5388 5389 assert(cb_fn != NULL); 5390 5391 ctx = calloc(1, sizeof(*ctx)); 5392 if (ctx == NULL) { 5393 SPDK_ERRLOG("Failed to alloc context.\n"); 5394 rc = -ENOMEM; 5395 goto err_alloc; 5396 } 5397 5398 ctx->cb_fn = cb_fn; 5399 ctx->cb_arg = cb_arg; 5400 5401 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5402 if (rc != 0) { 5403 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5404 goto err_open; 5405 } 5406 5407 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5408 5409 if (bdev->module != &nvme_if) { 5410 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5411 rc = -ENODEV; 5412 goto err_bdev; 5413 } 5414 5415 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5416 5417 pthread_mutex_lock(&nbdev->mutex); 5418 5419 ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid); 5420 if (ctx->nvme_ns == NULL) { 5421 pthread_mutex_unlock(&nbdev->mutex); 5422 5423 SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid); 5424 rc = -ENODEV; 5425 goto err_bdev; 5426 } 5427 5428 pthread_mutex_unlock(&nbdev->mutex); 5429 5430 nvme_bdev_for_each_channel(nbdev, 5431 _bdev_nvme_set_preferred_path, 5432 ctx, 5433 bdev_nvme_set_preferred_path_done); 5434 return; 5435 5436 err_bdev: 5437 spdk_bdev_close(ctx->desc); 5438 err_open: 5439 free(ctx); 5440 err_alloc: 5441 cb_fn(cb_arg, rc); 5442 } 5443 5444 struct bdev_nvme_set_multipath_policy_ctx { 5445 struct spdk_bdev_desc *desc; 5446 spdk_bdev_nvme_set_multipath_policy_cb cb_fn; 5447 void *cb_arg; 5448 }; 5449 5450 static void 5451 bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status) 5452 { 5453 struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx; 5454 5455 assert(ctx != NULL); 5456 assert(ctx->desc != NULL); 5457 assert(ctx->cb_fn != NULL); 5458 5459 spdk_bdev_close(ctx->desc); 5460 5461 ctx->cb_fn(ctx->cb_arg, status); 5462 5463 free(ctx); 5464 } 5465 5466 static void 5467 _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i, 5468 struct nvme_bdev *nbdev, 5469 struct nvme_bdev_channel *nbdev_ch, void *ctx) 5470 { 5471 
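/* Apply the bdev-level policy, selector, and rr_min_io to this channel, then clear the cached I/O path so the next I/O re-selects a path under the new policy. */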
nbdev_ch->mp_policy = nbdev->mp_policy; 5472 nbdev_ch->mp_selector = nbdev->mp_selector; 5473 nbdev_ch->rr_min_io = nbdev->rr_min_io; 5474 bdev_nvme_clear_current_io_path(nbdev_ch); 5475 5476 nvme_bdev_for_each_channel_continue(i, 0); 5477 } 5478 5479 void 5480 spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy, 5481 enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io, 5482 spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg) 5483 { 5484 struct bdev_nvme_set_multipath_policy_ctx *ctx; 5485 struct spdk_bdev *bdev; 5486 struct nvme_bdev *nbdev; 5487 int rc; 5488 5489 assert(cb_fn != NULL); 5490 5491 switch (policy) { 5492 case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE: 5493 break; 5494 case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE: 5495 switch (selector) { 5496 case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN: 5497 if (rr_min_io == UINT32_MAX) { 5498 rr_min_io = 1; 5499 } else if (rr_min_io == 0) { 5500 rc = -EINVAL; 5501 goto exit; 5502 } 5503 break; 5504 case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH: 5505 break; 5506 default: 5507 rc = -EINVAL; 5508 goto exit; 5509 } 5510 break; 5511 default: 5512 rc = -EINVAL; 5513 goto exit; 5514 } 5515 5516 ctx = calloc(1, sizeof(*ctx)); 5517 if (ctx == NULL) { 5518 SPDK_ERRLOG("Failed to alloc context.\n"); 5519 rc = -ENOMEM; 5520 goto exit; 5521 } 5522 5523 ctx->cb_fn = cb_fn; 5524 ctx->cb_arg = cb_arg; 5525 5526 rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc); 5527 if (rc != 0) { 5528 SPDK_ERRLOG("Failed to open bdev %s.\n", name); 5529 rc = -ENODEV; 5530 goto err_open; 5531 } 5532 5533 bdev = spdk_bdev_desc_get_bdev(ctx->desc); 5534 if (bdev->module != &nvme_if) { 5535 SPDK_ERRLOG("bdev %s is not registered in this module.\n", name); 5536 rc = -ENODEV; 5537 goto err_module; 5538 } 5539 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 5540 5541 pthread_mutex_lock(&nbdev->mutex); 5542 nbdev->mp_policy = policy; 5543 nbdev->mp_selector = selector; 5544 nbdev->rr_min_io = rr_min_io; 5545 pthread_mutex_unlock(&nbdev->mutex); 5546 5547 nvme_bdev_for_each_channel(nbdev, 5548 _bdev_nvme_set_multipath_policy, 5549 ctx, 5550 bdev_nvme_set_multipath_policy_done); 5551 return; 5552 5553 err_module: 5554 spdk_bdev_close(ctx->desc); 5555 err_open: 5556 free(ctx); 5557 exit: 5558 cb_fn(cb_arg, rc); 5559 } 5560 5561 static void 5562 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 5563 { 5564 struct nvme_ctrlr *nvme_ctrlr = arg; 5565 union spdk_nvme_async_event_completion event; 5566 5567 if (spdk_nvme_cpl_is_error(cpl)) { 5568 SPDK_WARNLOG("AER request execute failed\n"); 5569 return; 5570 } 5571 5572 event.raw = cpl->cdw0; 5573 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5574 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 5575 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 5576 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 5577 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 5578 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 5579 } 5580 } 5581 5582 static void 5583 free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx) 5584 { 5585 spdk_keyring_put_key(ctx->drv_opts.tls_psk); 5586 spdk_keyring_put_key(ctx->drv_opts.dhchap_key); 5587 spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key); 5588 free(ctx->base_name); 5589 free(ctx); 5590 } 5591 5592 static void 5593 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc) 5594 { 5595 if (ctx->cb_fn) { 5596 
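/* Report the attach status and the number of reported bdevs back to the original caller. */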
ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc); 5597 } 5598 5599 ctx->namespaces_populated = true; 5600 if (ctx->probe_done) { 5601 /* The probe was already completed, so we need to free the context 5602 * here. This can happen for cases like OCSSD, where we need to 5603 * send additional commands to the SSD after attach. 5604 */ 5605 free_nvme_async_probe_ctx(ctx); 5606 } 5607 } 5608 5609 static int 5610 bdev_nvme_remove_poller(void *ctx) 5611 { 5612 struct spdk_nvme_transport_id trid_pcie; 5613 5614 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 5615 spdk_poller_unregister(&g_hotplug_poller); 5616 return SPDK_POLLER_IDLE; 5617 } 5618 5619 memset(&trid_pcie, 0, sizeof(trid_pcie)); 5620 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 5621 5622 if (spdk_nvme_scan_attached(&trid_pcie)) { 5623 SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n"); 5624 } 5625 5626 return SPDK_POLLER_BUSY; 5627 } 5628 5629 static void 5630 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 5631 struct nvme_async_probe_ctx *ctx) 5632 { 5633 struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid; 5634 5635 if (spdk_nvme_trtype_is_fabrics(trid->trtype)) { 5636 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n", 5637 trid->traddr, trid->trsvcid); 5638 } else { 5639 NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n"); 5640 } 5641 5642 spdk_io_device_register(nvme_ctrlr, 5643 bdev_nvme_create_ctrlr_channel_cb, 5644 bdev_nvme_destroy_ctrlr_channel_cb, 5645 sizeof(struct nvme_ctrlr_channel), 5646 nvme_ctrlr->nbdev_ctrlr->name); 5647 5648 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 5649 5650 if (g_hotplug_poller == NULL) { 5651 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 5652 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 5653 } 5654 } 5655 5656 static void 5657 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 5658 { 5659 struct nvme_ctrlr *nvme_ctrlr = _ctx; 5660 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 5661 5662 nvme_ctrlr->probe_ctx = NULL; 5663 5664 if (spdk_nvme_cpl_is_error(cpl)) { 5665 nvme_ctrlr_delete(nvme_ctrlr); 5666 5667 if (ctx != NULL) { 5668 ctx->reported_bdevs = 0; 5669 populate_namespaces_cb(ctx, -1); 5670 } 5671 return; 5672 } 5673 5674 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5675 } 5676 5677 static int 5678 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 5679 struct nvme_async_probe_ctx *ctx) 5680 { 5681 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5682 const struct spdk_nvme_ctrlr_data *cdata; 5683 uint32_t ana_log_page_size; 5684 5685 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5686 5687 /* Set buffer size enough to include maximum number of allowed namespaces. */ 5688 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 5689 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan * 5690 sizeof(uint32_t); 5691 5692 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 5693 SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA); 5694 if (nvme_ctrlr->ana_log_page == NULL) { 5695 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n"); 5696 return -ENXIO; 5697 } 5698 5699 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 5700 * Hence copy each descriptor to a temporary area when parsing it. 5701 * 5702 * Allocate a buffer whose size is as large as ANA log page buffer because 5703 * we do not know the size of a descriptor until actually reading it. 
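The same buffer is reused for each descriptor as bdev_nvme_parse_ana_log_page() walks the log page.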
5704 */ 5705 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 5706 if (nvme_ctrlr->copied_ana_desc == NULL) { 5707 NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n"); 5708 return -ENOMEM; 5709 } 5710 5711 nvme_ctrlr->max_ana_log_page_size = ana_log_page_size; 5712 5713 nvme_ctrlr->probe_ctx = ctx; 5714 5715 /* Then, set the read size only to include the current active namespaces. */ 5716 ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr); 5717 5718 if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) { 5719 NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n", 5720 ana_log_page_size, nvme_ctrlr->max_ana_log_page_size); 5721 return -EINVAL; 5722 } 5723 5724 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 5725 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 5726 SPDK_NVME_GLOBAL_NS_TAG, 5727 nvme_ctrlr->ana_log_page, 5728 ana_log_page_size, 0, 5729 nvme_ctrlr_init_ana_log_page_done, 5730 nvme_ctrlr); 5731 } 5732 5733 /* hostnqn and subnqn were already verified before attaching a controller. 5734 * Hence check only the multipath capability and cntlid here. 5735 */ 5736 static bool 5737 bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr) 5738 { 5739 struct nvme_ctrlr *tmp; 5740 const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata; 5741 5742 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5743 5744 if (!cdata->cmic.multi_ctrlr) { 5745 SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid); 5746 return false; 5747 } 5748 5749 TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) { 5750 tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr); 5751 5752 if (!tmp_cdata->cmic.multi_ctrlr) { 5753 NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid); 5754 return false; 5755 } 5756 if (cdata->cntlid == tmp_cdata->cntlid) { 5757 NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid); 5758 return false; 5759 } 5760 } 5761 5762 return true; 5763 } 5764 5765 5766 static int 5767 nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr) 5768 { 5769 struct nvme_bdev_ctrlr *nbdev_ctrlr; 5770 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 5771 struct nvme_ctrlr *nctrlr; 5772 int rc = 0; 5773 5774 pthread_mutex_lock(&g_bdev_nvme_mutex); 5775 5776 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 5777 if (nbdev_ctrlr != NULL) { 5778 if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) { 5779 rc = -EINVAL; 5780 goto exit; 5781 } 5782 TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 5783 if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) { 5784 /* All controllers with the same name must be configured the same 5785 * way, either for multipath or failover. If the configuration doesn't 5786 * match - report error. 
5787 */ 5788 rc = -EINVAL; 5789 goto exit; 5790 } 5791 } 5792 } else { 5793 nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr)); 5794 if (nbdev_ctrlr == NULL) { 5795 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n"); 5796 rc = -ENOMEM; 5797 goto exit; 5798 } 5799 nbdev_ctrlr->name = strdup(name); 5800 if (nbdev_ctrlr->name == NULL) { 5801 NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n"); 5802 rc = -ENOMEM; free(nbdev_ctrlr); 5803 goto exit; 5804 } 5805 TAILQ_INIT(&nbdev_ctrlr->ctrlrs); 5806 TAILQ_INIT(&nbdev_ctrlr->bdevs); 5807 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq); 5808 } 5809 nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr; 5810 TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq); 5811 exit: 5812 pthread_mutex_unlock(&g_bdev_nvme_mutex); 5813 return rc; 5814 } 5815 5816 static int 5817 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 5818 const char *name, 5819 const struct spdk_nvme_transport_id *trid, 5820 struct nvme_async_probe_ctx *ctx) 5821 { 5822 struct nvme_ctrlr *nvme_ctrlr; 5823 struct nvme_path_id *path_id; 5824 const struct spdk_nvme_ctrlr_data *cdata; 5825 struct spdk_event_handler_opts opts = { 5826 .opts_size = SPDK_SIZEOF(&opts, fd_type), 5827 }; 5828 uint64_t period; 5829 int fd, rc; 5830 5831 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 5832 if (nvme_ctrlr == NULL) { 5833 SPDK_ERRLOG("Failed to allocate device struct\n"); 5834 return -ENOMEM; 5835 } 5836 5837 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 5838 if (rc != 0) { 5839 free(nvme_ctrlr); 5840 return rc; 5841 } 5842 5843 TAILQ_INIT(&nvme_ctrlr->trids); 5844 TAILQ_INIT(&nvme_ctrlr->pending_resets); 5845 RB_INIT(&nvme_ctrlr->namespaces); 5846 5847 /* Get another reference to the key, so the first one can be released from probe_ctx */ 5848 if (ctx != NULL) { 5849 if (ctx->drv_opts.tls_psk != NULL) { 5850 nvme_ctrlr->psk = spdk_keyring_get_key( 5851 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5852 if (nvme_ctrlr->psk == NULL) { 5853 /* Could only happen if the key was removed in the meantime */ 5854 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5855 spdk_key_get_name(ctx->drv_opts.tls_psk)); 5856 rc = -ENOKEY; 5857 goto err; 5858 } 5859 } 5860 5861 if (ctx->drv_opts.dhchap_key != NULL) { 5862 nvme_ctrlr->dhchap_key = spdk_keyring_get_key( 5863 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5864 if (nvme_ctrlr->dhchap_key == NULL) { 5865 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5866 spdk_key_get_name(ctx->drv_opts.dhchap_key)); 5867 rc = -ENOKEY; 5868 goto err; 5869 } 5870 } 5871 5872 if (ctx->drv_opts.dhchap_ctrlr_key != NULL) { 5873 nvme_ctrlr->dhchap_ctrlr_key = 5874 spdk_keyring_get_key( 5875 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5876 if (nvme_ctrlr->dhchap_ctrlr_key == NULL) { 5877 SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n", 5878 spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key)); 5879 rc = -ENOKEY; 5880 goto err; 5881 } 5882 } 5883 } 5884 5885 /* Check if we manage to enable interrupts on the controller.
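If interrupt mode is enabled but interrupts are not enabled on this controller, creation fails with -ENOTSUP.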
*/ 5886 if (spdk_interrupt_mode_is_enabled() && ctx != NULL && !ctx->drv_opts.enable_interrupts) { 5887 SPDK_ERRLOG("Failed to enable interrupts on the controller\n"); 5888 rc = -ENOTSUP; 5889 goto err; 5890 } 5891 5892 path_id = calloc(1, sizeof(*path_id)); 5893 if (path_id == NULL) { 5894 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 5895 rc = -ENOMEM; 5896 goto err; 5897 } 5898 5899 path_id->trid = *trid; 5900 if (ctx != NULL) { 5901 memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr)); 5902 memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid)); 5903 } 5904 nvme_ctrlr->active_path_id = path_id; 5905 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link); 5906 5907 nvme_ctrlr->thread = spdk_get_thread(); 5908 nvme_ctrlr->ctrlr = ctrlr; 5909 nvme_ctrlr->ref = 1; 5910 5911 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 5912 SPDK_ERRLOG("OCSSDs are not supported"); 5913 rc = -ENOTSUP; 5914 goto err; 5915 } 5916 5917 if (ctx != NULL) { 5918 memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts)); 5919 } else { 5920 spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts); 5921 } 5922 5923 period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_adminq_poll_period_us; 5924 5925 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 5926 period); 5927 5928 if (spdk_interrupt_mode_is_enabled()) { 5929 spdk_poller_register_interrupt(nvme_ctrlr->adminq_timer_poller, NULL, NULL); 5930 5931 fd = spdk_nvme_ctrlr_get_admin_qp_fd(nvme_ctrlr->ctrlr, &opts); 5932 if (fd < 0) { 5933 rc = fd; 5934 goto err; 5935 } 5936 5937 nvme_ctrlr->intr = SPDK_INTERRUPT_REGISTER_EXT(fd, bdev_nvme_poll_adminq, 5938 nvme_ctrlr, &opts); 5939 if (!nvme_ctrlr->intr) { 5940 rc = -EINVAL; 5941 goto err; 5942 } 5943 } 5944 5945 if (g_opts.timeout_us > 0) { 5946 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 5947 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 5948 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
5949 g_opts.timeout_us : g_opts.timeout_admin_us; 5950 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 5951 adm_timeout_us, timeout_cb, nvme_ctrlr); 5952 } 5953 5954 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 5955 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 5956 5957 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 5958 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 5959 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 5960 } 5961 5962 rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr); 5963 if (rc != 0) { 5964 goto err; 5965 } 5966 5967 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 5968 5969 if (cdata->cmic.ana_reporting) { 5970 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 5971 if (rc == 0) { 5972 return 0; 5973 } 5974 } else { 5975 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 5976 return 0; 5977 } 5978 5979 err: 5980 nvme_ctrlr_delete(nvme_ctrlr); 5981 return rc; 5982 } 5983 5984 void 5985 spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts) 5986 { 5987 opts->prchk_flags = 0; 5988 opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec; 5989 opts->reconnect_delay_sec = g_opts.reconnect_delay_sec; 5990 opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec; 5991 opts->multipath = true; 5992 } 5993 5994 static void 5995 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 5996 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts) 5997 { 5998 char *name; 5999 6000 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 6001 if (!name) { 6002 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 6003 return; 6004 } 6005 6006 if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) { 6007 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 6008 } else { 6009 SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name); 6010 } 6011 6012 free(name); 6013 } 6014 6015 static void 6016 _nvme_ctrlr_destruct(void *ctx) 6017 { 6018 struct nvme_ctrlr *nvme_ctrlr = ctx; 6019 6020 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 6021 nvme_ctrlr_put_ref(nvme_ctrlr); 6022 } 6023 6024 static int 6025 bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 6026 { 6027 struct nvme_probe_skip_entry *entry; 6028 6029 /* The controller's destruction was already started */ 6030 if (nvme_ctrlr->destruct) { 6031 return -EALREADY; 6032 } 6033 6034 if (!hotplug && 6035 nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { 6036 entry = calloc(1, sizeof(*entry)); 6037 if (!entry) { 6038 return -ENOMEM; 6039 } 6040 entry->trid = nvme_ctrlr->active_path_id->trid; 6041 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 6042 } 6043 6044 nvme_ctrlr->destruct = true; 6045 return 0; 6046 } 6047 6048 static int 6049 bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 6050 { 6051 int rc; 6052 6053 pthread_mutex_lock(&nvme_ctrlr->mutex); 6054 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug); 6055 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6056 6057 if (rc == 0) { 6058 _nvme_ctrlr_destruct(nvme_ctrlr); 6059 } else if (rc == -EALREADY) { 6060 rc = 0; 6061 } 6062 6063 return rc; 6064 } 6065 6066 static void 6067 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 6068 { 6069 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 6070 6071 bdev_nvme_delete_ctrlr(nvme_ctrlr, true); 6072 } 6073 6074 static int 6075 bdev_nvme_hotplug_probe(void *arg) 6076 { 6077 if (g_hotplug_probe_ctx == NULL) { 6078 
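		/* The async probe context is gone, so there is nothing left to poll; stop this poller. */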
		spdk_poller_unregister(&g_hotplug_probe_poller);
		return SPDK_POLLER_IDLE;
	}

	if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
		g_hotplug_probe_ctx = NULL;
		spdk_poller_unregister(&g_hotplug_probe_poller);
	}

	return SPDK_POLLER_BUSY;
}

static int
bdev_nvme_hotplug(void *arg)
{
	struct spdk_nvme_transport_id trid_pcie;

	if (g_hotplug_probe_ctx) {
		return SPDK_POLLER_BUSY;
	}

	memset(&trid_pcie, 0, sizeof(trid_pcie));
	spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);

	g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
			      hotplug_probe_cb, attach_cb, NULL);

	if (g_hotplug_probe_ctx) {
		assert(g_hotplug_probe_poller == NULL);
		g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
	}

	return SPDK_POLLER_BUSY;
}

void
spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field, defval) \
	opts->field = SPDK_GET_FIELD(&g_opts, field, defval, opts_size); \

	SET_FIELD(action_on_timeout, 0);
	SET_FIELD(keep_alive_timeout_ms, 0);
	SET_FIELD(timeout_us, 0);
	SET_FIELD(timeout_admin_us, 0);
	SET_FIELD(transport_retry_count, 0);
	SET_FIELD(arbitration_burst, 0);
	SET_FIELD(low_priority_weight, 0);
	SET_FIELD(medium_priority_weight, 0);
	SET_FIELD(high_priority_weight, 0);
	SET_FIELD(io_queue_requests, 0);
	SET_FIELD(nvme_adminq_poll_period_us, 0);
	SET_FIELD(nvme_ioq_poll_period_us, 0);
	SET_FIELD(delay_cmd_submit, 0);
	SET_FIELD(bdev_retry_count, 0);
	SET_FIELD(ctrlr_loss_timeout_sec, 0);
	SET_FIELD(reconnect_delay_sec, 0);
	SET_FIELD(fast_io_fail_timeout_sec, 0);
	SET_FIELD(transport_ack_timeout, 0);
	SET_FIELD(disable_auto_failback, false);
	SET_FIELD(generate_uuids, false);
	SET_FIELD(transport_tos, 0);
	SET_FIELD(nvme_error_stat, false);
	SET_FIELD(io_path_stat, false);
	SET_FIELD(allow_accel_sequence, false);
	SET_FIELD(rdma_srq_size, 0);
	SET_FIELD(rdma_max_cq_size, 0);
	SET_FIELD(rdma_cm_event_timeout_ms, 0);
	SET_FIELD(dhchap_digests, 0);
	SET_FIELD(dhchap_dhgroups, 0);

#undef SET_FIELD

	/* Do not remove the assertion below. Whenever a new field is added to this structure,
	 * update the expected size and add a matching SET_FIELD entry for the new field.
*/ 6165 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_nvme_opts) == 120, "Incorrect size"); 6166 } 6167 6168 static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec, 6169 uint32_t reconnect_delay_sec, 6170 uint32_t fast_io_fail_timeout_sec); 6171 6172 static int 6173 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 6174 { 6175 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 6176 /* Can't set timeout_admin_us without also setting timeout_us */ 6177 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 6178 return -EINVAL; 6179 } 6180 6181 if (opts->bdev_retry_count < -1) { 6182 SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n"); 6183 return -EINVAL; 6184 } 6185 6186 if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec, 6187 opts->reconnect_delay_sec, 6188 opts->fast_io_fail_timeout_sec)) { 6189 return -EINVAL; 6190 } 6191 6192 return 0; 6193 } 6194 6195 int 6196 spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 6197 { 6198 if (!opts) { 6199 SPDK_ERRLOG("opts cannot be NULL\n"); 6200 return -1; 6201 } 6202 6203 if (!opts->opts_size) { 6204 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 6205 return -1; 6206 } 6207 6208 int ret; 6209 6210 ret = bdev_nvme_validate_opts(opts); 6211 if (ret) { 6212 SPDK_WARNLOG("Failed to set nvme opts.\n"); 6213 return ret; 6214 } 6215 6216 if (g_bdev_nvme_init_thread != NULL) { 6217 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 6218 return -EPERM; 6219 } 6220 } 6221 6222 if (opts->rdma_srq_size != 0 || 6223 opts->rdma_max_cq_size != 0 || 6224 opts->rdma_cm_event_timeout_ms != 0) { 6225 struct spdk_nvme_transport_opts drv_opts; 6226 6227 spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts)); 6228 if (opts->rdma_srq_size != 0) { 6229 drv_opts.rdma_srq_size = opts->rdma_srq_size; 6230 } 6231 if (opts->rdma_max_cq_size != 0) { 6232 drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size; 6233 } 6234 if (opts->rdma_cm_event_timeout_ms != 0) { 6235 drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms; 6236 } 6237 6238 ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts)); 6239 if (ret) { 6240 SPDK_ERRLOG("Failed to set NVMe transport opts.\n"); 6241 return ret; 6242 } 6243 } 6244 6245 #define SET_FIELD(field, defval) \ 6246 g_opts.field = SPDK_GET_FIELD(opts, field, defval, opts->opts_size); \ 6247 6248 SET_FIELD(action_on_timeout, 0); 6249 SET_FIELD(keep_alive_timeout_ms, 0); 6250 SET_FIELD(timeout_us, 0); 6251 SET_FIELD(timeout_admin_us, 0); 6252 SET_FIELD(transport_retry_count, 0); 6253 SET_FIELD(arbitration_burst, 0); 6254 SET_FIELD(low_priority_weight, 0); 6255 SET_FIELD(medium_priority_weight, 0); 6256 SET_FIELD(high_priority_weight, 0); 6257 SET_FIELD(io_queue_requests, 0); 6258 SET_FIELD(nvme_adminq_poll_period_us, 0); 6259 SET_FIELD(nvme_ioq_poll_period_us, 0); 6260 SET_FIELD(delay_cmd_submit, 0); 6261 SET_FIELD(bdev_retry_count, 0); 6262 SET_FIELD(ctrlr_loss_timeout_sec, 0); 6263 SET_FIELD(reconnect_delay_sec, 0); 6264 SET_FIELD(fast_io_fail_timeout_sec, 0); 6265 SET_FIELD(transport_ack_timeout, 0); 6266 SET_FIELD(disable_auto_failback, false); 6267 SET_FIELD(generate_uuids, false); 6268 SET_FIELD(transport_tos, 0); 6269 SET_FIELD(nvme_error_stat, false); 6270 SET_FIELD(io_path_stat, false); 6271 SET_FIELD(allow_accel_sequence, false); 6272 SET_FIELD(rdma_srq_size, 0); 6273 SET_FIELD(rdma_max_cq_size, 0); 6274 SET_FIELD(rdma_cm_event_timeout_ms, 0); 6275 
SET_FIELD(dhchap_digests, 0); 6276 SET_FIELD(dhchap_dhgroups, 0); 6277 6278 g_opts.opts_size = opts->opts_size; 6279 6280 #undef SET_FIELD 6281 6282 return 0; 6283 } 6284 6285 struct set_nvme_hotplug_ctx { 6286 uint64_t period_us; 6287 bool enabled; 6288 spdk_msg_fn fn; 6289 void *fn_ctx; 6290 }; 6291 6292 static void 6293 set_nvme_hotplug_period_cb(void *_ctx) 6294 { 6295 struct set_nvme_hotplug_ctx *ctx = _ctx; 6296 6297 spdk_poller_unregister(&g_hotplug_poller); 6298 if (ctx->enabled) { 6299 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 6300 } else { 6301 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL, 6302 NVME_HOTPLUG_POLL_PERIOD_DEFAULT); 6303 } 6304 6305 g_nvme_hotplug_poll_period_us = ctx->period_us; 6306 g_nvme_hotplug_enabled = ctx->enabled; 6307 if (ctx->fn) { 6308 ctx->fn(ctx->fn_ctx); 6309 } 6310 6311 free(ctx); 6312 } 6313 6314 int 6315 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 6316 { 6317 struct set_nvme_hotplug_ctx *ctx; 6318 6319 if (enabled == true && !spdk_process_is_primary()) { 6320 return -EPERM; 6321 } 6322 6323 ctx = calloc(1, sizeof(*ctx)); 6324 if (ctx == NULL) { 6325 return -ENOMEM; 6326 } 6327 6328 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 6329 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 6330 ctx->enabled = enabled; 6331 ctx->fn = cb; 6332 ctx->fn_ctx = cb_ctx; 6333 6334 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 6335 return 0; 6336 } 6337 6338 static void 6339 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 6340 struct nvme_async_probe_ctx *ctx) 6341 { 6342 struct nvme_ns *nvme_ns; 6343 struct nvme_bdev *nvme_bdev; 6344 size_t j; 6345 6346 assert(nvme_ctrlr != NULL); 6347 6348 if (ctx->names == NULL) { 6349 ctx->reported_bdevs = 0; 6350 populate_namespaces_cb(ctx, 0); 6351 return; 6352 } 6353 6354 /* 6355 * Report the new bdevs that were created in this call. 6356 * There can be more than one bdev per NVMe controller. 6357 */ 6358 j = 0; 6359 nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr); 6360 while (nvme_ns != NULL) { 6361 nvme_bdev = nvme_ns->bdev; 6362 if (j < ctx->max_bdevs) { 6363 ctx->names[j] = nvme_bdev->disk.name; 6364 j++; 6365 } else { 6366 NVME_CTRLR_ERRLOG(nvme_ctrlr, 6367 "Maximum number of namespaces supported per NVMe controller is %du. " 6368 "Unable to return all names of created bdevs\n", 6369 ctx->max_bdevs); 6370 ctx->reported_bdevs = 0; 6371 populate_namespaces_cb(ctx, -ERANGE); 6372 return; 6373 } 6374 6375 nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns); 6376 } 6377 6378 ctx->reported_bdevs = j; 6379 populate_namespaces_cb(ctx, 0); 6380 } 6381 6382 static int 6383 bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6384 struct spdk_nvme_ctrlr *new_ctrlr, 6385 struct spdk_nvme_transport_id *trid) 6386 { 6387 struct nvme_path_id *tmp_trid; 6388 6389 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6390 NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n"); 6391 return -ENOTSUP; 6392 } 6393 6394 /* Currently we only support failover to the same transport type. 
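 * For example, a TCP path cannot be added as a failover target for a controller whose active path is RDMA.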
 */
	if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
		NVME_CTRLR_WARNLOG(nvme_ctrlr,
				   "Failover from trtype: %s to a different trtype: %s is not supported currently\n",
				   spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
				   spdk_nvme_transport_id_trtype_str(trid->trtype));
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		NVME_CTRLR_WARNLOG(nvme_ctrlr,
				   "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
				   nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n",
					   trid->traddr, trid->subnqn);
			return -EALREADY;
		}
	}

	return 0;
}

static int
bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
				    struct spdk_nvme_ctrlr *new_ctrlr)
{
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
	while (nvme_ns != NULL) {
		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}

		nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_path_id *active_id, *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;

	active_id = nvme_ctrlr->active_path_id;
	assert(active_id != NULL);
	assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));

	/* Skip the active trid so that it is not replaced until it actually fails. */
	tmp_trid = TAILQ_NEXT(active_id, link);
	if (tmp_trid == NULL) {
		goto add_tail;
	}

	/* A trid is considered failed if its last_failed_tsc is non-zero.
	 * Insert the new alternate trid before any failed trid.
	 */
	TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->last_failed_tsc != 0) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

add_tail:
	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This is the case where a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking that it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
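 * Only the transport ID is stored as an alternate nvme_path_id; the newly probed
 * controller handle itself is always detached before returning.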
6486 */ 6487 static int 6488 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 6489 struct spdk_nvme_ctrlr *new_ctrlr, 6490 struct spdk_nvme_transport_id *trid) 6491 { 6492 int rc; 6493 6494 assert(nvme_ctrlr != NULL); 6495 6496 pthread_mutex_lock(&nvme_ctrlr->mutex); 6497 6498 rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid); 6499 if (rc != 0) { 6500 goto exit; 6501 } 6502 6503 rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr); 6504 if (rc != 0) { 6505 goto exit; 6506 } 6507 6508 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 6509 6510 exit: 6511 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6512 6513 spdk_nvme_detach(new_ctrlr); 6514 6515 return rc; 6516 } 6517 6518 static void 6519 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6520 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 6521 { 6522 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6523 struct nvme_async_probe_ctx *ctx; 6524 int rc; 6525 6526 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6527 ctx->ctrlr_attached = true; 6528 6529 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx); 6530 if (rc != 0) { 6531 ctx->reported_bdevs = 0; 6532 populate_namespaces_cb(ctx, rc); 6533 } 6534 } 6535 6536 6537 static void 6538 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 6539 struct spdk_nvme_ctrlr *ctrlr, 6540 const struct spdk_nvme_ctrlr_opts *opts) 6541 { 6542 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 6543 struct nvme_ctrlr *nvme_ctrlr; 6544 struct nvme_async_probe_ctx *ctx; 6545 int rc; 6546 6547 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts); 6548 ctx->ctrlr_attached = true; 6549 6550 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6551 if (nvme_ctrlr) { 6552 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 6553 } else { 6554 rc = -ENODEV; 6555 } 6556 6557 ctx->reported_bdevs = 0; 6558 populate_namespaces_cb(ctx, rc); 6559 } 6560 6561 static int 6562 bdev_nvme_async_poll(void *arg) 6563 { 6564 struct nvme_async_probe_ctx *ctx = arg; 6565 int rc; 6566 6567 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 6568 if (spdk_unlikely(rc != -EAGAIN)) { 6569 ctx->probe_done = true; 6570 spdk_poller_unregister(&ctx->poller); 6571 if (!ctx->ctrlr_attached) { 6572 /* The probe is done, but no controller was attached. 6573 * That means we had a failure, so report -EIO back to 6574 * the caller (usually the RPC). populate_namespaces_cb() 6575 * will take care of freeing the nvme_async_probe_ctx. 6576 */ 6577 ctx->reported_bdevs = 0; 6578 populate_namespaces_cb(ctx, -EIO); 6579 } else if (ctx->namespaces_populated) { 6580 /* The namespaces for the attached controller were all 6581 * populated and the response was already sent to the 6582 * caller (usually the RPC). So free the context here. 
			 */
			free_nvme_async_probe_ctx(ctx);
		}
	}

	return SPDK_POLLER_BUSY;
}

static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	} else if (ctrlr_loss_timeout_sec == -1) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0 &&
			   fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
	} else if (ctrlr_loss_timeout_sec != 0) {
		if (reconnect_delay_sec == 0) {
			SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
			return false;
		} else if (reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		} else if (fast_io_fail_timeout_sec != 0) {
			if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
				SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
				return false;
			} else if (fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
				SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
				return false;
			}
		}
	} else if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
		SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
		return false;
	}

	return true;
}

int
spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
		      const char *base_name,
		      const char **names,
		      uint32_t count,
		      spdk_bdev_nvme_create_cb cb_fn,
		      void *cb_ctx,
		      struct spdk_nvme_ctrlr_opts *drv_opts,
		      struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
{
	struct nvme_probe_skip_entry *entry, *tmp;
	struct nvme_async_probe_ctx *ctx;
	spdk_nvme_attach_cb attach_cb;
	struct nvme_ctrlr *nvme_ctrlr;
	int len;

	/* TODO expand this check to include both the host and target TRIDs.
	 * Only if both are the same should we fail.
6650 */ 6651 if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) { 6652 SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) " 6653 "already exists.\n", trid->traddr, drv_opts->hostnqn); 6654 return -EEXIST; 6655 } 6656 6657 len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX); 6658 6659 if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) { 6660 SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1); 6661 return -EINVAL; 6662 } 6663 6664 if (bdev_opts != NULL && 6665 !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec, 6666 bdev_opts->reconnect_delay_sec, 6667 bdev_opts->fast_io_fail_timeout_sec)) { 6668 return -EINVAL; 6669 } 6670 6671 ctx = calloc(1, sizeof(*ctx)); 6672 if (!ctx) { 6673 return -ENOMEM; 6674 } 6675 ctx->base_name = strdup(base_name); 6676 if (!ctx->base_name) { 6677 free(ctx); 6678 return -ENOMEM; 6679 } 6680 ctx->names = names; 6681 ctx->max_bdevs = count; 6682 ctx->cb_fn = cb_fn; 6683 ctx->cb_ctx = cb_ctx; 6684 ctx->trid = *trid; 6685 6686 if (bdev_opts) { 6687 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 6688 } else { 6689 spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts); 6690 } 6691 6692 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6693 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 6694 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 6695 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 6696 free(entry); 6697 break; 6698 } 6699 } 6700 } 6701 6702 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 6703 ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count; 6704 ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout; 6705 ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 6706 ctx->drv_opts.disable_read_ana_log_page = true; 6707 ctx->drv_opts.transport_tos = g_opts.transport_tos; 6708 6709 if (spdk_interrupt_mode_is_enabled()) { 6710 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 6711 ctx->drv_opts.enable_interrupts = true; 6712 } else { 6713 SPDK_ERRLOG("Interrupt mode is only supported with PCIe transport\n"); 6714 free_nvme_async_probe_ctx(ctx); 6715 return -ENOTSUP; 6716 } 6717 } 6718 6719 if (ctx->bdev_opts.psk != NULL) { 6720 ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk); 6721 if (ctx->drv_opts.tls_psk == NULL) { 6722 SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk); 6723 free_nvme_async_probe_ctx(ctx); 6724 return -ENOKEY; 6725 } 6726 } 6727 6728 if (ctx->bdev_opts.dhchap_key != NULL) { 6729 ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key); 6730 if (ctx->drv_opts.dhchap_key == NULL) { 6731 SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n", 6732 ctx->bdev_opts.dhchap_key); 6733 free_nvme_async_probe_ctx(ctx); 6734 return -ENOKEY; 6735 } 6736 6737 ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests; 6738 ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups; 6739 } 6740 if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) { 6741 ctx->drv_opts.dhchap_ctrlr_key = 6742 spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key); 6743 if (ctx->drv_opts.dhchap_ctrlr_key == NULL) { 6744 SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n", 6745 ctx->bdev_opts.dhchap_ctrlr_key); 6746 free_nvme_async_probe_ctx(ctx); 6747 return -ENOKEY; 6748 } 6749 } 6750 6751 if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) { 6752 attach_cb = connect_attach_cb; 6753 } else { 6754 attach_cb = connect_set_failover_cb; 
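		/* An nvme_bdev_ctrlr with this name already exists and multipath is disabled,
		 * so the new path can only be attached as a failover target.
		 */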
6755 } 6756 6757 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 6758 if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) { 6759 /* All controllers with the same name must be configured the same 6760 * way, either for multipath or failover. If the configuration doesn't 6761 * match - report error. 6762 */ 6763 free_nvme_async_probe_ctx(ctx); 6764 return -EINVAL; 6765 } 6766 6767 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb); 6768 if (ctx->probe_ctx == NULL) { 6769 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 6770 free_nvme_async_probe_ctx(ctx); 6771 return -ENODEV; 6772 } 6773 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 6774 6775 return 0; 6776 } 6777 6778 struct bdev_nvme_delete_ctx { 6779 char *name; 6780 struct nvme_path_id path_id; 6781 bdev_nvme_delete_done_fn delete_done; 6782 void *delete_done_ctx; 6783 uint64_t timeout_ticks; 6784 struct spdk_poller *poller; 6785 }; 6786 6787 static void 6788 free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx) 6789 { 6790 if (ctx != NULL) { 6791 free(ctx->name); 6792 free(ctx); 6793 } 6794 } 6795 6796 static bool 6797 nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id) 6798 { 6799 if (path_id->trid.trtype != 0) { 6800 if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) { 6801 if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) { 6802 return false; 6803 } 6804 } else { 6805 if (path_id->trid.trtype != p->trid.trtype) { 6806 return false; 6807 } 6808 } 6809 } 6810 6811 if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) { 6812 if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) { 6813 return false; 6814 } 6815 } 6816 6817 if (path_id->trid.adrfam != 0) { 6818 if (path_id->trid.adrfam != p->trid.adrfam) { 6819 return false; 6820 } 6821 } 6822 6823 if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) { 6824 if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) { 6825 return false; 6826 } 6827 } 6828 6829 if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) { 6830 if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) { 6831 return false; 6832 } 6833 } 6834 6835 if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) { 6836 if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) { 6837 return false; 6838 } 6839 } 6840 6841 if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) { 6842 if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) { 6843 return false; 6844 } 6845 } 6846 6847 return true; 6848 } 6849 6850 static bool 6851 nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id) 6852 { 6853 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6854 struct nvme_ctrlr *ctrlr; 6855 struct nvme_path_id *p; 6856 6857 pthread_mutex_lock(&g_bdev_nvme_mutex); 6858 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6859 if (!nbdev_ctrlr) { 6860 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6861 return false; 6862 } 6863 6864 TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 6865 pthread_mutex_lock(&ctrlr->mutex); 6866 TAILQ_FOREACH(p, &ctrlr->trids, link) { 6867 if (nvme_path_id_compare(p, path_id)) { 6868 pthread_mutex_unlock(&ctrlr->mutex); 6869 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6870 return true; 6871 } 6872 } 6873 pthread_mutex_unlock(&ctrlr->mutex); 6874 } 6875 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6876 6877 return 
false; 6878 } 6879 6880 static int 6881 bdev_nvme_delete_complete_poll(void *arg) 6882 { 6883 struct bdev_nvme_delete_ctx *ctx = arg; 6884 int rc = 0; 6885 6886 if (nvme_path_id_exists(ctx->name, &ctx->path_id)) { 6887 if (ctx->timeout_ticks > spdk_get_ticks()) { 6888 return SPDK_POLLER_BUSY; 6889 } 6890 6891 SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name); 6892 rc = -ETIMEDOUT; 6893 } 6894 6895 spdk_poller_unregister(&ctx->poller); 6896 6897 ctx->delete_done(ctx->delete_done_ctx, rc); 6898 free_bdev_nvme_delete_ctx(ctx); 6899 6900 return SPDK_POLLER_BUSY; 6901 } 6902 6903 static int 6904 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id) 6905 { 6906 struct nvme_path_id *p, *t; 6907 spdk_msg_fn msg_fn; 6908 int rc = -ENXIO; 6909 6910 pthread_mutex_lock(&nvme_ctrlr->mutex); 6911 6912 TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) { 6913 if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) { 6914 break; 6915 } 6916 6917 if (!nvme_path_id_compare(p, path_id)) { 6918 continue; 6919 } 6920 6921 /* We are not using the specified path. */ 6922 TAILQ_REMOVE(&nvme_ctrlr->trids, p, link); 6923 free(p); 6924 rc = 0; 6925 } 6926 6927 if (p == NULL || !nvme_path_id_compare(p, path_id)) { 6928 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6929 return rc; 6930 } 6931 6932 /* If we made it here, then this path is a match! Now we need to remove it. */ 6933 6934 /* This is the active path in use right now. The active path is always the first in the list. */ 6935 assert(p == nvme_ctrlr->active_path_id); 6936 6937 if (!TAILQ_NEXT(p, link)) { 6938 /* The current path is the only path. */ 6939 msg_fn = _nvme_ctrlr_destruct; 6940 rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false); 6941 } else { 6942 /* There is an alternative path. */ 6943 msg_fn = _bdev_nvme_reset_ctrlr; 6944 rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true); 6945 } 6946 6947 pthread_mutex_unlock(&nvme_ctrlr->mutex); 6948 6949 if (rc == 0) { 6950 spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr); 6951 } else if (rc == -EALREADY) { 6952 rc = 0; 6953 } 6954 6955 return rc; 6956 } 6957 6958 int 6959 bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id, 6960 bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx) 6961 { 6962 struct nvme_bdev_ctrlr *nbdev_ctrlr; 6963 struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr; 6964 struct bdev_nvme_delete_ctx *ctx = NULL; 6965 int rc = -ENXIO, _rc; 6966 6967 if (name == NULL || path_id == NULL) { 6968 rc = -EINVAL; 6969 goto exit; 6970 } 6971 6972 pthread_mutex_lock(&g_bdev_nvme_mutex); 6973 6974 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 6975 if (nbdev_ctrlr == NULL) { 6976 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6977 6978 SPDK_ERRLOG("Failed to find NVMe bdev controller\n"); 6979 rc = -ENODEV; 6980 goto exit; 6981 } 6982 6983 TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) { 6984 _rc = _bdev_nvme_delete(nvme_ctrlr, path_id); 6985 if (_rc < 0 && _rc != -ENXIO) { 6986 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6987 rc = _rc; 6988 goto exit; 6989 } else if (_rc == 0) { 6990 /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr 6991 * was deleted successfully. To remember the successful deletion, 6992 * overwrite rc only if _rc is zero. 
6993 */ 6994 rc = 0; 6995 } 6996 } 6997 6998 pthread_mutex_unlock(&g_bdev_nvme_mutex); 6999 7000 if (rc != 0 || delete_done == NULL) { 7001 goto exit; 7002 } 7003 7004 ctx = calloc(1, sizeof(*ctx)); 7005 if (ctx == NULL) { 7006 SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n"); 7007 rc = -ENOMEM; 7008 goto exit; 7009 } 7010 7011 ctx->name = strdup(name); 7012 if (ctx->name == NULL) { 7013 SPDK_ERRLOG("Failed to copy controller name for deletion\n"); 7014 rc = -ENOMEM; 7015 goto exit; 7016 } 7017 7018 ctx->delete_done = delete_done; 7019 ctx->delete_done_ctx = delete_done_ctx; 7020 ctx->path_id = *path_id; 7021 ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz(); 7022 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000); 7023 if (ctx->poller == NULL) { 7024 SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n"); 7025 rc = -ENOMEM; 7026 goto exit; 7027 } 7028 7029 exit: 7030 if (rc != 0) { 7031 free_bdev_nvme_delete_ctx(ctx); 7032 } 7033 7034 return rc; 7035 } 7036 7037 #define DISCOVERY_INFOLOG(ctx, format, ...) \ 7038 SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 7039 7040 #define DISCOVERY_ERRLOG(ctx, format, ...) \ 7041 SPDK_ERRLOG("Discovery[%s:%s] " format, ctx->trid.traddr, ctx->trid.trsvcid, ##__VA_ARGS__); 7042 7043 struct discovery_entry_ctx { 7044 char name[128]; 7045 struct spdk_nvme_transport_id trid; 7046 struct spdk_nvme_ctrlr_opts drv_opts; 7047 struct spdk_nvmf_discovery_log_page_entry entry; 7048 TAILQ_ENTRY(discovery_entry_ctx) tailq; 7049 struct discovery_ctx *ctx; 7050 }; 7051 7052 struct discovery_ctx { 7053 char *name; 7054 spdk_bdev_nvme_start_discovery_fn start_cb_fn; 7055 spdk_bdev_nvme_stop_discovery_fn stop_cb_fn; 7056 void *cb_ctx; 7057 struct spdk_nvme_probe_ctx *probe_ctx; 7058 struct spdk_nvme_detach_ctx *detach_ctx; 7059 struct spdk_nvme_ctrlr *ctrlr; 7060 struct spdk_nvme_transport_id trid; 7061 struct discovery_entry_ctx *entry_ctx_in_use; 7062 struct spdk_poller *poller; 7063 struct spdk_nvme_ctrlr_opts drv_opts; 7064 struct spdk_bdev_nvme_ctrlr_opts bdev_opts; 7065 struct spdk_nvmf_discovery_log_page *log_page; 7066 TAILQ_ENTRY(discovery_ctx) tailq; 7067 TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs; 7068 TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs; 7069 int rc; 7070 bool wait_for_attach; 7071 uint64_t timeout_ticks; 7072 /* Denotes that the discovery service is being started. We're waiting 7073 * for the initial connection to the discovery controller to be 7074 * established and attach discovered NVM ctrlrs. 7075 */ 7076 bool initializing; 7077 /* Denotes if a discovery is currently in progress for this context. 7078 * That includes connecting to newly discovered subsystems. Used to 7079 * ensure we do not start a new discovery until an existing one is 7080 * complete. 7081 */ 7082 bool in_progress; 7083 7084 /* Denotes if another discovery is needed after the one in progress 7085 * completes. Set when we receive an AER completion while a discovery 7086 * is already in progress. 7087 */ 7088 bool pending; 7089 7090 /* Signal to the discovery context poller that it should stop the 7091 * discovery service, including detaching from the current discovery 7092 * controller. 7093 */ 7094 bool stop; 7095 7096 struct spdk_thread *calling_thread; 7097 uint32_t index; 7098 uint32_t attach_in_progress; 7099 char *hostnqn; 7100 7101 /* Denotes if the discovery service was started by the mdns discovery. 
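	 * Set from the from_mdns argument of bdev_nvme_start_discovery().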
7102 */ 7103 bool from_mdns_discovery_service; 7104 }; 7105 7106 TAILQ_HEAD(discovery_ctxs, discovery_ctx); 7107 static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs); 7108 7109 static void get_discovery_log_page(struct discovery_ctx *ctx); 7110 7111 static void 7112 free_discovery_ctx(struct discovery_ctx *ctx) 7113 { 7114 free(ctx->log_page); 7115 free(ctx->hostnqn); 7116 free(ctx->name); 7117 free(ctx); 7118 } 7119 7120 static void 7121 discovery_complete(struct discovery_ctx *ctx) 7122 { 7123 ctx->initializing = false; 7124 ctx->in_progress = false; 7125 if (ctx->pending) { 7126 ctx->pending = false; 7127 get_discovery_log_page(ctx); 7128 } 7129 } 7130 7131 static void 7132 build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid, 7133 struct spdk_nvmf_discovery_log_page_entry *entry) 7134 { 7135 char *space; 7136 7137 trid->trtype = entry->trtype; 7138 trid->adrfam = entry->adrfam; 7139 memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr)); 7140 memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid)); 7141 /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and 7142 * before call to this function trid->subnqn is zeroed out, we need 7143 * to copy sizeof(trid->subnqn) minus one byte to make sure the last character 7144 * remains 0. Then we can shorten the string (replace ' ' with 0) if required 7145 */ 7146 memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1); 7147 7148 /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated. 7149 * But the log page entries typically pad them with spaces, not zeroes. 7150 * So add a NULL terminator to each of these fields at the appropriate 7151 * location. 7152 */ 7153 space = strchr(trid->traddr, ' '); 7154 if (space) { 7155 *space = 0; 7156 } 7157 space = strchr(trid->trsvcid, ' '); 7158 if (space) { 7159 *space = 0; 7160 } 7161 space = strchr(trid->subnqn, ' '); 7162 if (space) { 7163 *space = 0; 7164 } 7165 } 7166 7167 static void 7168 _stop_discovery(void *_ctx) 7169 { 7170 struct discovery_ctx *ctx = _ctx; 7171 7172 if (ctx->attach_in_progress > 0) { 7173 spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx); 7174 return; 7175 } 7176 7177 ctx->stop = true; 7178 7179 while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) { 7180 struct discovery_entry_ctx *entry_ctx; 7181 struct nvme_path_id path = {}; 7182 7183 entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs); 7184 path.trid = entry_ctx->trid; 7185 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7186 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7187 free(entry_ctx); 7188 } 7189 7190 while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) { 7191 struct discovery_entry_ctx *entry_ctx; 7192 7193 entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7194 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7195 free(entry_ctx); 7196 } 7197 7198 free(ctx->entry_ctx_in_use); 7199 ctx->entry_ctx_in_use = NULL; 7200 } 7201 7202 static void 7203 stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7204 { 7205 ctx->stop_cb_fn = cb_fn; 7206 ctx->cb_ctx = cb_ctx; 7207 7208 if (ctx->attach_in_progress > 0) { 7209 DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n", 7210 ctx->attach_in_progress); 7211 } 7212 7213 _stop_discovery(ctx); 7214 } 7215 7216 static void 7217 remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr) 7218 { 7219 struct discovery_ctx *d_ctx; 7220 struct nvme_path_id *path_id; 7221 struct spdk_nvme_transport_id 
trid = {}; 7222 struct discovery_entry_ctx *entry_ctx, *tmp; 7223 7224 path_id = TAILQ_FIRST(&nvme_ctrlr->trids); 7225 7226 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7227 TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) { 7228 build_trid_from_log_page_entry(&trid, &entry_ctx->entry); 7229 if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) { 7230 continue; 7231 } 7232 7233 TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq); 7234 free(entry_ctx); 7235 DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n", 7236 trid.subnqn, trid.traddr, trid.trsvcid); 7237 7238 /* Fail discovery ctrlr to force reattach attempt */ 7239 spdk_nvme_ctrlr_fail(d_ctx->ctrlr); 7240 } 7241 } 7242 } 7243 7244 static void 7245 discovery_remove_controllers(struct discovery_ctx *ctx) 7246 { 7247 struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page; 7248 struct discovery_entry_ctx *entry_ctx, *tmp; 7249 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7250 struct spdk_nvme_transport_id old_trid = {}; 7251 uint64_t numrec, i; 7252 bool found; 7253 7254 numrec = from_le64(&log_page->numrec); 7255 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) { 7256 found = false; 7257 old_entry = &entry_ctx->entry; 7258 build_trid_from_log_page_entry(&old_trid, old_entry); 7259 for (i = 0; i < numrec; i++) { 7260 new_entry = &log_page->entries[i]; 7261 if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) { 7262 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n", 7263 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7264 found = true; 7265 break; 7266 } 7267 } 7268 if (!found) { 7269 struct nvme_path_id path = {}; 7270 7271 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n", 7272 old_trid.subnqn, old_trid.traddr, old_trid.trsvcid); 7273 7274 path.trid = entry_ctx->trid; 7275 bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL); 7276 TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq); 7277 free(entry_ctx); 7278 } 7279 } 7280 free(log_page); 7281 ctx->log_page = NULL; 7282 discovery_complete(ctx); 7283 } 7284 7285 static void 7286 complete_discovery_start(struct discovery_ctx *ctx, int status) 7287 { 7288 ctx->timeout_ticks = 0; 7289 ctx->rc = status; 7290 if (ctx->start_cb_fn) { 7291 ctx->start_cb_fn(ctx->cb_ctx, status); 7292 ctx->start_cb_fn = NULL; 7293 ctx->cb_ctx = NULL; 7294 } 7295 } 7296 7297 static void 7298 discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc) 7299 { 7300 struct discovery_entry_ctx *entry_ctx = cb_ctx; 7301 struct discovery_ctx *ctx = entry_ctx->ctx; 7302 7303 DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name); 7304 ctx->attach_in_progress--; 7305 if (ctx->attach_in_progress == 0) { 7306 complete_discovery_start(ctx, ctx->rc); 7307 if (ctx->initializing && ctx->rc != 0) { 7308 DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc); 7309 stop_discovery(ctx, NULL, ctx->cb_ctx); 7310 } else { 7311 discovery_remove_controllers(ctx); 7312 } 7313 } 7314 } 7315 7316 static struct discovery_entry_ctx * 7317 create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid) 7318 { 7319 struct discovery_entry_ctx *new_ctx; 7320 7321 new_ctx = calloc(1, sizeof(*new_ctx)); 7322 if (new_ctx == NULL) { 7323 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7324 return NULL; 7325 } 7326 7327 new_ctx->ctx = ctx; 7328 memcpy(&new_ctx->trid, trid, sizeof(*trid)); 7329 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 
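	/* Connections made for this entry reuse the hostnqn that was given to the discovery service. */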
7330 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7331 return new_ctx; 7332 } 7333 7334 static void 7335 discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl, 7336 struct spdk_nvmf_discovery_log_page *log_page) 7337 { 7338 struct discovery_ctx *ctx = cb_arg; 7339 struct discovery_entry_ctx *entry_ctx, *tmp; 7340 struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry; 7341 uint64_t numrec, i; 7342 bool found; 7343 7344 if (rc || spdk_nvme_cpl_is_error(cpl)) { 7345 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7346 return; 7347 } 7348 7349 ctx->log_page = log_page; 7350 assert(ctx->attach_in_progress == 0); 7351 numrec = from_le64(&log_page->numrec); 7352 TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) { 7353 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq); 7354 free(entry_ctx); 7355 } 7356 for (i = 0; i < numrec; i++) { 7357 found = false; 7358 new_entry = &log_page->entries[i]; 7359 if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT || 7360 new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { 7361 struct discovery_entry_ctx *new_ctx; 7362 struct spdk_nvme_transport_id trid = {}; 7363 7364 build_trid_from_log_page_entry(&trid, new_entry); 7365 new_ctx = create_discovery_entry_ctx(ctx, &trid); 7366 if (new_ctx == NULL) { 7367 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7368 break; 7369 } 7370 7371 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq); 7372 continue; 7373 } 7374 TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) { 7375 old_entry = &entry_ctx->entry; 7376 if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) { 7377 found = true; 7378 break; 7379 } 7380 } 7381 if (!found) { 7382 struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx; 7383 struct discovery_ctx *d_ctx; 7384 7385 TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) { 7386 TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) { 7387 if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn, 7388 sizeof(new_entry->subnqn))) { 7389 break; 7390 } 7391 } 7392 if (subnqn_ctx) { 7393 break; 7394 } 7395 } 7396 7397 new_ctx = calloc(1, sizeof(*new_ctx)); 7398 if (new_ctx == NULL) { 7399 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7400 break; 7401 } 7402 7403 new_ctx->ctx = ctx; 7404 memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry)); 7405 build_trid_from_log_page_entry(&new_ctx->trid, new_entry); 7406 if (subnqn_ctx) { 7407 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name); 7408 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n", 7409 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7410 new_ctx->name); 7411 } else { 7412 snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++); 7413 DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n", 7414 new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid, 7415 new_ctx->name); 7416 } 7417 spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts)); 7418 snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn); 7419 rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0, 7420 discovery_attach_controller_done, new_ctx, 7421 &new_ctx->drv_opts, &ctx->bdev_opts); 7422 if (rc == 0) { 7423 TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq); 7424 ctx->attach_in_progress++; 7425 } else { 7426 DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", 
spdk_strerror(-rc)); 7427 } 7428 } 7429 } 7430 7431 if (ctx->attach_in_progress == 0) { 7432 discovery_remove_controllers(ctx); 7433 } 7434 } 7435 7436 static void 7437 get_discovery_log_page(struct discovery_ctx *ctx) 7438 { 7439 int rc; 7440 7441 assert(ctx->in_progress == false); 7442 ctx->in_progress = true; 7443 rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx); 7444 if (rc != 0) { 7445 DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n"); 7446 } 7447 DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n"); 7448 } 7449 7450 static void 7451 discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 7452 { 7453 struct discovery_ctx *ctx = arg; 7454 uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; 7455 7456 if (spdk_nvme_cpl_is_error(cpl)) { 7457 DISCOVERY_ERRLOG(ctx, "aer failed\n"); 7458 return; 7459 } 7460 7461 if (log_page_id != SPDK_NVME_LOG_DISCOVERY) { 7462 DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id); 7463 return; 7464 } 7465 7466 DISCOVERY_INFOLOG(ctx, "got aer\n"); 7467 if (ctx->in_progress) { 7468 ctx->pending = true; 7469 return; 7470 } 7471 7472 get_discovery_log_page(ctx); 7473 } 7474 7475 static void 7476 discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 7477 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 7478 { 7479 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 7480 struct discovery_ctx *ctx; 7481 7482 ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts); 7483 7484 DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n"); 7485 ctx->probe_ctx = NULL; 7486 ctx->ctrlr = ctrlr; 7487 7488 if (ctx->rc != 0) { 7489 DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n", 7490 ctx->rc); 7491 return; 7492 } 7493 7494 spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx); 7495 } 7496 7497 static int 7498 discovery_poller(void *arg) 7499 { 7500 struct discovery_ctx *ctx = arg; 7501 struct spdk_nvme_transport_id *trid; 7502 int rc; 7503 7504 if (ctx->detach_ctx) { 7505 rc = spdk_nvme_detach_poll_async(ctx->detach_ctx); 7506 if (rc != -EAGAIN) { 7507 ctx->detach_ctx = NULL; 7508 ctx->ctrlr = NULL; 7509 } 7510 } else if (ctx->stop) { 7511 if (ctx->ctrlr != NULL) { 7512 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7513 if (rc == 0) { 7514 return SPDK_POLLER_BUSY; 7515 } 7516 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7517 } 7518 spdk_poller_unregister(&ctx->poller); 7519 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7520 assert(ctx->start_cb_fn == NULL); 7521 if (ctx->stop_cb_fn != NULL) { 7522 ctx->stop_cb_fn(ctx->cb_ctx); 7523 } 7524 free_discovery_ctx(ctx); 7525 } else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) { 7526 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7527 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7528 assert(ctx->initializing); 7529 spdk_poller_unregister(&ctx->poller); 7530 TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq); 7531 complete_discovery_start(ctx, -ETIMEDOUT); 7532 stop_discovery(ctx, NULL, NULL); 7533 free_discovery_ctx(ctx); 7534 return SPDK_POLLER_BUSY; 7535 } 7536 7537 assert(ctx->entry_ctx_in_use == NULL); 7538 ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs); 7539 TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7540 trid = &ctx->entry_ctx_in_use->trid; 7541 7542 /* All controllers must be configured explicitely either for multipath or failover. 
7543 * While discovery use multipath mode, we need to set this in bdev options as well. 7544 */ 7545 ctx->bdev_opts.multipath = true; 7546 7547 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb); 7548 if (ctx->probe_ctx) { 7549 spdk_poller_unregister(&ctx->poller); 7550 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000); 7551 } else { 7552 DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n"); 7553 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7554 ctx->entry_ctx_in_use = NULL; 7555 } 7556 } else if (ctx->probe_ctx) { 7557 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7558 DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n"); 7559 complete_discovery_start(ctx, -ETIMEDOUT); 7560 return SPDK_POLLER_BUSY; 7561 } 7562 7563 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 7564 if (rc != -EAGAIN) { 7565 if (ctx->rc != 0) { 7566 assert(ctx->initializing); 7567 stop_discovery(ctx, NULL, ctx->cb_ctx); 7568 } else { 7569 assert(rc == 0); 7570 DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n"); 7571 ctx->rc = rc; 7572 get_discovery_log_page(ctx); 7573 } 7574 } 7575 } else { 7576 if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) { 7577 DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n"); 7578 complete_discovery_start(ctx, -ETIMEDOUT); 7579 /* We need to wait until all NVM ctrlrs are attached before we stop the 7580 * discovery service to make sure we don't detach a ctrlr that is still 7581 * being attached. 7582 */ 7583 if (ctx->attach_in_progress == 0) { 7584 stop_discovery(ctx, NULL, ctx->cb_ctx); 7585 return SPDK_POLLER_BUSY; 7586 } 7587 } 7588 7589 rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr); 7590 if (rc < 0) { 7591 spdk_poller_unregister(&ctx->poller); 7592 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7593 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq); 7594 ctx->entry_ctx_in_use = NULL; 7595 7596 rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx); 7597 if (rc != 0) { 7598 DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n"); 7599 ctx->ctrlr = NULL; 7600 } 7601 } 7602 } 7603 7604 return SPDK_POLLER_BUSY; 7605 } 7606 7607 static void 7608 start_discovery_poller(void *arg) 7609 { 7610 struct discovery_ctx *ctx = arg; 7611 7612 TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq); 7613 ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000); 7614 } 7615 7616 int 7617 bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid, 7618 const char *base_name, 7619 struct spdk_nvme_ctrlr_opts *drv_opts, 7620 struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, 7621 uint64_t attach_timeout, 7622 bool from_mdns, 7623 spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx) 7624 { 7625 struct discovery_ctx *ctx; 7626 struct discovery_entry_ctx *discovery_entry_ctx; 7627 7628 snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN); 7629 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7630 if (strcmp(ctx->name, base_name) == 0) { 7631 return -EEXIST; 7632 } 7633 7634 if (ctx->entry_ctx_in_use != NULL) { 7635 if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) { 7636 return -EEXIST; 7637 } 7638 } 7639 7640 TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 7641 if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) { 7642 return -EEXIST; 7643 } 7644 } 7645 } 7646 7647 ctx = calloc(1, 
sizeof(*ctx)); 7648 if (ctx == NULL) { 7649 return -ENOMEM; 7650 } 7651 7652 ctx->name = strdup(base_name); 7653 if (ctx->name == NULL) { 7654 free_discovery_ctx(ctx); 7655 return -ENOMEM; 7656 } 7657 memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts)); 7658 memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts)); 7659 ctx->from_mdns_discovery_service = from_mdns; 7660 ctx->bdev_opts.from_discovery_service = true; 7661 ctx->calling_thread = spdk_get_thread(); 7662 ctx->start_cb_fn = cb_fn; 7663 ctx->cb_ctx = cb_ctx; 7664 ctx->initializing = true; 7665 if (ctx->start_cb_fn) { 7666 /* We can use this when dumping json to denote if this RPC parameter 7667 * was specified or not. 7668 */ 7669 ctx->wait_for_attach = true; 7670 } 7671 if (attach_timeout != 0) { 7672 ctx->timeout_ticks = spdk_get_ticks() + attach_timeout * 7673 spdk_get_ticks_hz() / 1000ull; 7674 } 7675 TAILQ_INIT(&ctx->nvm_entry_ctxs); 7676 TAILQ_INIT(&ctx->discovery_entry_ctxs); 7677 memcpy(&ctx->trid, trid, sizeof(*trid)); 7678 /* Even if user did not specify hostnqn, we can still strdup("\0"); */ 7679 ctx->hostnqn = strdup(ctx->drv_opts.hostnqn); 7680 if (ctx->hostnqn == NULL) { 7681 free_discovery_ctx(ctx); 7682 return -ENOMEM; 7683 } 7684 discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid); 7685 if (discovery_entry_ctx == NULL) { 7686 DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n"); 7687 free_discovery_ctx(ctx); 7688 return -ENOMEM; 7689 } 7690 7691 TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq); 7692 spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx); 7693 return 0; 7694 } 7695 7696 int 7697 bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx) 7698 { 7699 struct discovery_ctx *ctx; 7700 7701 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7702 if (strcmp(name, ctx->name) == 0) { 7703 if (ctx->stop) { 7704 return -EALREADY; 7705 } 7706 /* If we're still starting the discovery service and ->rc is non-zero, we're 7707 * going to stop it as soon as we can 7708 */ 7709 if (ctx->initializing && ctx->rc != 0) { 7710 return -EALREADY; 7711 } 7712 stop_discovery(ctx, cb_fn, cb_ctx); 7713 return 0; 7714 } 7715 } 7716 7717 return -ENOENT; 7718 } 7719 7720 static int 7721 bdev_nvme_library_init(void) 7722 { 7723 g_bdev_nvme_init_thread = spdk_get_thread(); 7724 7725 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb, 7726 bdev_nvme_destroy_poll_group_cb, 7727 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 7728 7729 return 0; 7730 } 7731 7732 static void 7733 bdev_nvme_fini_destruct_ctrlrs(void) 7734 { 7735 struct nvme_bdev_ctrlr *nbdev_ctrlr; 7736 struct nvme_ctrlr *nvme_ctrlr; 7737 7738 pthread_mutex_lock(&g_bdev_nvme_mutex); 7739 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 7740 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 7741 pthread_mutex_lock(&nvme_ctrlr->mutex); 7742 if (nvme_ctrlr->destruct) { 7743 /* This controller's destruction was already started 7744 * before the application started shutting down 7745 */ 7746 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7747 continue; 7748 } 7749 nvme_ctrlr->destruct = true; 7750 pthread_mutex_unlock(&nvme_ctrlr->mutex); 7751 7752 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 7753 nvme_ctrlr); 7754 } 7755 } 7756 7757 g_bdev_nvme_module_finish = true; 7758 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 7759 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7760 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 7761 
spdk_bdev_module_fini_done(); 7762 return; 7763 } 7764 7765 pthread_mutex_unlock(&g_bdev_nvme_mutex); 7766 } 7767 7768 static void 7769 check_discovery_fini(void *arg) 7770 { 7771 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7772 bdev_nvme_fini_destruct_ctrlrs(); 7773 } 7774 } 7775 7776 static void 7777 bdev_nvme_library_fini(void) 7778 { 7779 struct nvme_probe_skip_entry *entry, *entry_tmp; 7780 struct discovery_ctx *ctx; 7781 7782 spdk_poller_unregister(&g_hotplug_poller); 7783 free(g_hotplug_probe_ctx); 7784 g_hotplug_probe_ctx = NULL; 7785 7786 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 7787 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 7788 free(entry); 7789 } 7790 7791 assert(spdk_get_thread() == g_bdev_nvme_init_thread); 7792 if (TAILQ_EMPTY(&g_discovery_ctxs)) { 7793 bdev_nvme_fini_destruct_ctrlrs(); 7794 } else { 7795 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 7796 stop_discovery(ctx, check_discovery_fini, NULL); 7797 } 7798 } 7799 } 7800 7801 static void 7802 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 7803 { 7804 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7805 struct spdk_bdev *bdev = bdev_io->bdev; 7806 struct spdk_dif_ctx dif_ctx; 7807 struct spdk_dif_error err_blk = {}; 7808 int rc; 7809 struct spdk_dif_ctx_init_ext_opts dif_opts; 7810 7811 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 7812 dif_opts.dif_pi_format = bdev->dif_pi_format; 7813 rc = spdk_dif_ctx_init(&dif_ctx, 7814 bdev->blocklen, bdev->md_len, bdev->md_interleave, 7815 bdev->dif_is_head_of_md, bdev->dif_type, 7816 bdev_io->u.bdev.dif_check_flags, 7817 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts); 7818 if (rc != 0) { 7819 SPDK_ERRLOG("Initialization of DIF context failed\n"); 7820 return; 7821 } 7822 7823 if (bdev->md_interleave) { 7824 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7825 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7826 } else { 7827 struct iovec md_iov = { 7828 .iov_base = bdev_io->u.bdev.md_buf, 7829 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 7830 }; 7831 7832 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 7833 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 7834 } 7835 7836 if (rc != 0) { 7837 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 7838 err_blk.err_type, err_blk.err_offset); 7839 } else { 7840 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 7841 } 7842 } 7843 7844 static void 7845 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7846 { 7847 struct nvme_bdev_io *bio = ref; 7848 7849 if (spdk_nvme_cpl_is_success(cpl)) { 7850 /* Run PI verification for read data buffer. */ 7851 bdev_nvme_verify_pi_error(bio); 7852 } 7853 7854 /* Return original completion status */ 7855 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7856 } 7857 7858 static void 7859 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7860 { 7861 struct nvme_bdev_io *bio = ref; 7862 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7863 int ret; 7864 7865 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7866 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 7867 cpl->status.sct, cpl->status.sc); 7868 7869 /* Save completion status to use after verifying PI error. */ 7870 bio->cpl = *cpl; 7871 7872 if (spdk_likely(nvme_io_path_is_available(bio->io_path))) { 7873 /* Read without PI checking to verify PI error. 
*/ 7874 ret = bdev_nvme_no_pi_readv(bio, 7875 bdev_io->u.bdev.iovs, 7876 bdev_io->u.bdev.iovcnt, 7877 bdev_io->u.bdev.md_buf, 7878 bdev_io->u.bdev.num_blocks, 7879 bdev_io->u.bdev.offset_blocks); 7880 if (ret == 0) { 7881 return; 7882 } 7883 } 7884 } 7885 7886 bdev_nvme_io_complete_nvme_status(bio, cpl); 7887 } 7888 7889 static void 7890 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7891 { 7892 struct nvme_bdev_io *bio = ref; 7893 7894 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 7895 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 7896 cpl->status.sct, cpl->status.sc); 7897 /* Run PI verification for write data buffer if PI error is detected. */ 7898 bdev_nvme_verify_pi_error(bio); 7899 } 7900 7901 bdev_nvme_io_complete_nvme_status(bio, cpl); 7902 } 7903 7904 static void 7905 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 7906 { 7907 struct nvme_bdev_io *bio = ref; 7908 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 7909 7910 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 7911 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 7912 */ 7913 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 7914 7915 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7916 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 7917 cpl->status.sct, cpl->status.sc); 7918 /* Run PI verification for zone append data buffer if PI error is detected. */ 7919 bdev_nvme_verify_pi_error(bio); 7920 } 7921 7922 bdev_nvme_io_complete_nvme_status(bio, cpl); 7923 } 7924 7925 static void 7926 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7927 { 7928 struct nvme_bdev_io *bio = ref; 7929 7930 if (spdk_nvme_cpl_is_pi_error(cpl)) { 7931 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 7932 cpl->status.sct, cpl->status.sc); 7933 /* Run PI verification for compare data buffer if PI error is detected. */ 7934 bdev_nvme_verify_pi_error(bio); 7935 } 7936 7937 bdev_nvme_io_complete_nvme_status(bio, cpl); 7938 } 7939 7940 static void 7941 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 7942 { 7943 struct nvme_bdev_io *bio = ref; 7944 7945 /* Compare operation completion */ 7946 if (!bio->first_fused_completed) { 7947 /* Save compare result for write callback */ 7948 bio->cpl = *cpl; 7949 bio->first_fused_completed = true; 7950 return; 7951 } 7952 7953 /* Write operation completion */ 7954 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 7955 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 7956 * complete the IO with the compare operation's status. 
7957 */ 7958 if (!spdk_nvme_cpl_is_error(cpl)) { 7959 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 7960 } 7961 7962 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 7963 } else { 7964 bdev_nvme_io_complete_nvme_status(bio, cpl); 7965 } 7966 } 7967 7968 static void 7969 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 7970 { 7971 struct nvme_bdev_io *bio = ref; 7972 7973 bdev_nvme_io_complete_nvme_status(bio, cpl); 7974 } 7975 7976 static int 7977 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 7978 { 7979 switch (desc->zt) { 7980 case SPDK_NVME_ZONE_TYPE_SEQWR: 7981 info->type = SPDK_BDEV_ZONE_TYPE_SEQWR; 7982 break; 7983 default: 7984 SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt); 7985 return -EIO; 7986 } 7987 7988 switch (desc->zs) { 7989 case SPDK_NVME_ZONE_STATE_EMPTY: 7990 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 7991 break; 7992 case SPDK_NVME_ZONE_STATE_IOPEN: 7993 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 7994 break; 7995 case SPDK_NVME_ZONE_STATE_EOPEN: 7996 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 7997 break; 7998 case SPDK_NVME_ZONE_STATE_CLOSED: 7999 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 8000 break; 8001 case SPDK_NVME_ZONE_STATE_RONLY: 8002 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 8003 break; 8004 case SPDK_NVME_ZONE_STATE_FULL: 8005 info->state = SPDK_BDEV_ZONE_STATE_FULL; 8006 break; 8007 case SPDK_NVME_ZONE_STATE_OFFLINE: 8008 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 8009 break; 8010 default: 8011 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 8012 return -EIO; 8013 } 8014 8015 info->zone_id = desc->zslba; 8016 info->write_pointer = desc->wp; 8017 info->capacity = desc->zcap; 8018 8019 return 0; 8020 } 8021 8022 static void 8023 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 8024 { 8025 struct nvme_bdev_io *bio = ref; 8026 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8027 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 8028 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 8029 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 8030 uint64_t max_zones_per_buf, i; 8031 uint32_t zone_report_bufsize; 8032 struct spdk_nvme_ns *ns; 8033 struct spdk_nvme_qpair *qpair; 8034 int ret; 8035 8036 if (spdk_nvme_cpl_is_error(cpl)) { 8037 goto out_complete_io_nvme_cpl; 8038 } 8039 8040 if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) { 8041 ret = -ENXIO; 8042 goto out_complete_io_ret; 8043 } 8044 8045 ns = bio->io_path->nvme_ns->ns; 8046 qpair = bio->io_path->qpair->qpair; 8047 8048 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8049 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 8050 sizeof(bio->zone_report_buf->descs[0]); 8051 8052 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 8053 ret = -EINVAL; 8054 goto out_complete_io_ret; 8055 } 8056 8057 if (!bio->zone_report_buf->nr_zones) { 8058 ret = -EINVAL; 8059 goto out_complete_io_ret; 8060 } 8061 8062 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 8063 ret = fill_zone_from_report(&info[bio->handled_zones], 8064 &bio->zone_report_buf->descs[i]); 8065 if (ret) { 8066 goto out_complete_io_ret; 8067 } 8068 bio->handled_zones++; 8069 } 8070 8071 if (bio->handled_zones < zones_to_copy) { 8072 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8073 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 8074 
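/* Not all requested zones fit in one report: clear the buffer and issue another Report Zones starting at the next unhandled zone; this callback re-enters itself for each batch. */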
8075 memset(bio->zone_report_buf, 0, zone_report_bufsize); 8076 ret = spdk_nvme_zns_report_zones(ns, qpair, 8077 bio->zone_report_buf, zone_report_bufsize, 8078 slba, SPDK_NVME_ZRA_LIST_ALL, true, 8079 bdev_nvme_get_zone_info_done, bio); 8080 if (!ret) { 8081 return; 8082 } else { 8083 goto out_complete_io_ret; 8084 } 8085 } 8086 8087 out_complete_io_nvme_cpl: 8088 free(bio->zone_report_buf); 8089 bio->zone_report_buf = NULL; 8090 bdev_nvme_io_complete_nvme_status(bio, cpl); 8091 return; 8092 8093 out_complete_io_ret: 8094 free(bio->zone_report_buf); 8095 bio->zone_report_buf = NULL; 8096 bdev_nvme_io_complete(bio, ret); 8097 } 8098 8099 static void 8100 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 8101 { 8102 struct nvme_bdev_io *bio = ref; 8103 8104 bdev_nvme_io_complete_nvme_status(bio, cpl); 8105 } 8106 8107 static void 8108 bdev_nvme_admin_passthru_complete_nvme_status(void *ctx) 8109 { 8110 struct nvme_bdev_io *bio = ctx; 8111 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8112 const struct spdk_nvme_cpl *cpl = &bio->cpl; 8113 8114 assert(bdev_nvme_io_type_is_admin(bdev_io->type)); 8115 8116 __bdev_nvme_io_complete(bdev_io, 0, cpl); 8117 } 8118 8119 static void 8120 bdev_nvme_abort_complete(void *ctx) 8121 { 8122 struct nvme_bdev_io *bio = ctx; 8123 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8124 8125 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 8126 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL); 8127 } else { 8128 __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL); 8129 } 8130 } 8131 8132 static void 8133 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 8134 { 8135 struct nvme_bdev_io *bio = ref; 8136 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8137 8138 bio->cpl = *cpl; 8139 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio); 8140 } 8141 8142 static void 8143 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 8144 { 8145 struct nvme_bdev_io *bio = ref; 8146 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8147 8148 bio->cpl = *cpl; 8149 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8150 bdev_nvme_admin_passthru_complete_nvme_status, bio); 8151 } 8152 8153 static void 8154 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 8155 { 8156 struct nvme_bdev_io *bio = ref; 8157 struct iovec *iov; 8158 8159 bio->iov_offset = sgl_offset; 8160 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 8161 iov = &bio->iovs[bio->iovpos]; 8162 if (bio->iov_offset < iov->iov_len) { 8163 break; 8164 } 8165 8166 bio->iov_offset -= iov->iov_len; 8167 } 8168 } 8169 8170 static int 8171 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 8172 { 8173 struct nvme_bdev_io *bio = ref; 8174 struct iovec *iov; 8175 8176 assert(bio->iovpos < bio->iovcnt); 8177 8178 iov = &bio->iovs[bio->iovpos]; 8179 8180 *address = iov->iov_base; 8181 *length = iov->iov_len; 8182 8183 if (bio->iov_offset) { 8184 assert(bio->iov_offset <= iov->iov_len); 8185 *address += bio->iov_offset; 8186 *length -= bio->iov_offset; 8187 } 8188 8189 bio->iov_offset += *length; 8190 if (bio->iov_offset == iov->iov_len) { 8191 bio->iovpos++; 8192 bio->iov_offset = 0; 8193 } 8194 8195 return 0; 8196 } 8197 8198 static void 8199 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 8200 { 8201 struct nvme_bdev_io *bio = ref; 8202 struct iovec *iov; 8203 8204 bio->fused_iov_offset = sgl_offset; 
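/* Skip whole iovecs until the requested SGL offset falls inside one; this mirrors bdev_nvme_queued_reset_sgl() but walks the fused (write) iovec array of a compare-and-write. */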
8205 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 8206 iov = &bio->fused_iovs[bio->fused_iovpos]; 8207 if (bio->fused_iov_offset < iov->iov_len) { 8208 break; 8209 } 8210 8211 bio->fused_iov_offset -= iov->iov_len; 8212 } 8213 } 8214 8215 static int 8216 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 8217 { 8218 struct nvme_bdev_io *bio = ref; 8219 struct iovec *iov; 8220 8221 assert(bio->fused_iovpos < bio->fused_iovcnt); 8222 8223 iov = &bio->fused_iovs[bio->fused_iovpos]; 8224 8225 *address = iov->iov_base; 8226 *length = iov->iov_len; 8227 8228 if (bio->fused_iov_offset) { 8229 assert(bio->fused_iov_offset <= iov->iov_len); 8230 *address += bio->fused_iov_offset; 8231 *length -= bio->fused_iov_offset; 8232 } 8233 8234 bio->fused_iov_offset += *length; 8235 if (bio->fused_iov_offset == iov->iov_len) { 8236 bio->fused_iovpos++; 8237 bio->fused_iov_offset = 0; 8238 } 8239 8240 return 0; 8241 } 8242 8243 static int 8244 bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8245 void *md, uint64_t lba_count, uint64_t lba) 8246 { 8247 int rc; 8248 8249 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 8250 lba_count, lba); 8251 8252 bio->iovs = iov; 8253 bio->iovcnt = iovcnt; 8254 bio->iovpos = 0; 8255 bio->iov_offset = 0; 8256 8257 rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns, 8258 bio->io_path->qpair->qpair, 8259 lba, lba_count, 8260 bdev_nvme_no_pi_readv_done, bio, 0, 8261 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8262 md, 0, 0); 8263 8264 if (rc != 0 && rc != -ENOMEM) { 8265 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 8266 } 8267 return rc; 8268 } 8269 8270 static int 8271 bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8272 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8273 struct spdk_memory_domain *domain, void *domain_ctx, 8274 struct spdk_accel_sequence *seq) 8275 { 8276 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8277 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8278 int rc; 8279 8280 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8281 lba_count, lba); 8282 8283 bio->iovs = iov; 8284 bio->iovcnt = iovcnt; 8285 bio->iovpos = 0; 8286 bio->iov_offset = 0; 8287 8288 if (domain != NULL || seq != NULL) { 8289 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8290 bio->ext_opts.memory_domain = domain; 8291 bio->ext_opts.memory_domain_ctx = domain_ctx; 8292 bio->ext_opts.io_flags = flags; 8293 bio->ext_opts.metadata = md; 8294 bio->ext_opts.accel_sequence = seq; 8295 8296 if (iovcnt == 1) { 8297 rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done, 8298 bio, &bio->ext_opts); 8299 } else { 8300 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 8301 bdev_nvme_readv_done, bio, 8302 bdev_nvme_queued_reset_sgl, 8303 bdev_nvme_queued_next_sge, 8304 &bio->ext_opts); 8305 } 8306 } else if (iovcnt == 1) { 8307 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, 8308 md, lba, lba_count, bdev_nvme_readv_done, 8309 bio, flags, 0, 0); 8310 } else { 8311 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 8312 bdev_nvme_readv_done, bio, flags, 8313 bdev_nvme_queued_reset_sgl, 8314 bdev_nvme_queued_next_sge, md, 0, 0); 8315 } 8316 8317 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8318 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 8319 } 8320 
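/* -ENOMEM is deliberately not logged above; it is returned to the caller as a transient condition so the I/O can be retried by the bdev layer. */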
return rc; 8321 } 8322 8323 static int 8324 bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8325 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 8326 struct spdk_memory_domain *domain, void *domain_ctx, 8327 struct spdk_accel_sequence *seq, 8328 union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13) 8329 { 8330 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8331 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8332 int rc; 8333 8334 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8335 lba_count, lba); 8336 8337 bio->iovs = iov; 8338 bio->iovcnt = iovcnt; 8339 bio->iovpos = 0; 8340 bio->iov_offset = 0; 8341 8342 if (domain != NULL || seq != NULL) { 8343 bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence); 8344 bio->ext_opts.memory_domain = domain; 8345 bio->ext_opts.memory_domain_ctx = domain_ctx; 8346 bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype); 8347 bio->ext_opts.cdw13 = cdw13.raw; 8348 bio->ext_opts.metadata = md; 8349 bio->ext_opts.accel_sequence = seq; 8350 8351 if (iovcnt == 1) { 8352 rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done, 8353 bio, &bio->ext_opts); 8354 } else { 8355 rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count, 8356 bdev_nvme_writev_done, bio, 8357 bdev_nvme_queued_reset_sgl, 8358 bdev_nvme_queued_next_sge, 8359 &bio->ext_opts); 8360 } 8361 } else if (iovcnt == 1) { 8362 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, 8363 md, lba, lba_count, bdev_nvme_writev_done, 8364 bio, flags, 0, 0); 8365 } else { 8366 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8367 bdev_nvme_writev_done, bio, flags, 8368 bdev_nvme_queued_reset_sgl, 8369 bdev_nvme_queued_next_sge, md, 0, 0); 8370 } 8371 8372 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 8373 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 8374 } 8375 return rc; 8376 } 8377 8378 static int 8379 bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8380 void *md, uint64_t lba_count, uint64_t zslba, 8381 uint32_t flags) 8382 { 8383 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8384 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8385 int rc; 8386 8387 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 8388 lba_count, zslba); 8389 8390 bio->iovs = iov; 8391 bio->iovcnt = iovcnt; 8392 bio->iovpos = 0; 8393 bio->iov_offset = 0; 8394 8395 if (iovcnt == 1) { 8396 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 8397 lba_count, 8398 bdev_nvme_zone_appendv_done, bio, 8399 flags, 8400 0, 0); 8401 } else { 8402 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 8403 bdev_nvme_zone_appendv_done, bio, flags, 8404 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8405 md, 0, 0); 8406 } 8407 8408 if (rc != 0 && rc != -ENOMEM) { 8409 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 8410 } 8411 return rc; 8412 } 8413 8414 static int 8415 bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 8416 void *md, uint64_t lba_count, uint64_t lba, 8417 uint32_t flags) 8418 { 8419 int rc; 8420 8421 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8422 lba_count, lba); 8423 8424 bio->iovs = iov; 8425 bio->iovcnt = iovcnt; 8426 bio->iovpos = 0; 8427 bio->iov_offset = 0; 8428 8429 rc = 
spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns, 8430 bio->io_path->qpair->qpair, 8431 lba, lba_count, 8432 bdev_nvme_comparev_done, bio, flags, 8433 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 8434 md, 0, 0); 8435 8436 if (rc != 0 && rc != -ENOMEM) { 8437 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 8438 } 8439 return rc; 8440 } 8441 8442 static int 8443 bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 8444 struct iovec *write_iov, int write_iovcnt, 8445 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 8446 { 8447 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8448 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8449 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 8450 int rc; 8451 8452 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 8453 lba_count, lba); 8454 8455 bio->iovs = cmp_iov; 8456 bio->iovcnt = cmp_iovcnt; 8457 bio->iovpos = 0; 8458 bio->iov_offset = 0; 8459 bio->fused_iovs = write_iov; 8460 bio->fused_iovcnt = write_iovcnt; 8461 bio->fused_iovpos = 0; 8462 bio->fused_iov_offset = 0; 8463 8464 if (bdev_io->num_retries == 0) { 8465 bio->first_fused_submitted = false; 8466 bio->first_fused_completed = false; 8467 } 8468 8469 if (!bio->first_fused_submitted) { 8470 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8471 memset(&bio->cpl, 0, sizeof(bio->cpl)); 8472 8473 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 8474 bdev_nvme_comparev_and_writev_done, bio, flags, 8475 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 8476 if (rc == 0) { 8477 bio->first_fused_submitted = true; 8478 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 8479 } else { 8480 if (rc != -ENOMEM) { 8481 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 8482 } 8483 return rc; 8484 } 8485 } 8486 8487 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 8488 8489 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 8490 bdev_nvme_comparev_and_writev_done, bio, flags, 8491 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 8492 if (rc != 0 && rc != -ENOMEM) { 8493 SPDK_ERRLOG("write failed: rc = %d\n", rc); 8494 rc = 0; 8495 } 8496 8497 return rc; 8498 } 8499 8500 static int 8501 bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8502 { 8503 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 8504 struct spdk_nvme_dsm_range *range; 8505 uint64_t offset, remaining; 8506 uint64_t num_ranges_u64; 8507 uint16_t num_ranges; 8508 int rc; 8509 8510 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 8511 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8512 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 8513 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 8514 return -EINVAL; 8515 } 8516 num_ranges = (uint16_t)num_ranges_u64; 8517 8518 offset = offset_blocks; 8519 remaining = num_blocks; 8520 range = &dsm_ranges[0]; 8521 8522 /* Fill max-size ranges until the remaining blocks fit into one range */ 8523 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 8524 range->attributes.raw = 0; 8525 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8526 range->starting_lba = offset; 8527 8528 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8529 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 8530 range++; 8531 } 8532 8533 /* Final range describes the remaining 
blocks */ 8534 range->attributes.raw = 0; 8535 range->length = remaining; 8536 range->starting_lba = offset; 8537 8538 rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns, 8539 bio->io_path->qpair->qpair, 8540 SPDK_NVME_DSM_ATTR_DEALLOCATE, 8541 dsm_ranges, num_ranges, 8542 bdev_nvme_queued_done, bio); 8543 8544 return rc; 8545 } 8546 8547 static int 8548 bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks) 8549 { 8550 if (num_blocks > UINT16_MAX + 1) { 8551 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 8552 return -EINVAL; 8553 } 8554 8555 return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns, 8556 bio->io_path->qpair->qpair, 8557 offset_blocks, num_blocks, 8558 bdev_nvme_queued_done, bio, 8559 0); 8560 } 8561 8562 static int 8563 bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 8564 struct spdk_bdev_zone_info *info) 8565 { 8566 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8567 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8568 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 8569 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 8570 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 8571 8572 if (zone_id % zone_size != 0) { 8573 return -EINVAL; 8574 } 8575 8576 if (num_zones > total_zones || !num_zones) { 8577 return -EINVAL; 8578 } 8579 8580 assert(!bio->zone_report_buf); 8581 bio->zone_report_buf = calloc(1, zone_report_bufsize); 8582 if (!bio->zone_report_buf) { 8583 return -ENOMEM; 8584 } 8585 8586 bio->handled_zones = 0; 8587 8588 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 8589 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 8590 bdev_nvme_get_zone_info_done, bio); 8591 } 8592 8593 static int 8594 bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id, 8595 enum spdk_bdev_zone_action action) 8596 { 8597 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8598 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8599 8600 switch (action) { 8601 case SPDK_BDEV_ZONE_CLOSE: 8602 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 8603 bdev_nvme_zone_management_done, bio); 8604 case SPDK_BDEV_ZONE_FINISH: 8605 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 8606 bdev_nvme_zone_management_done, bio); 8607 case SPDK_BDEV_ZONE_OPEN: 8608 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 8609 bdev_nvme_zone_management_done, bio); 8610 case SPDK_BDEV_ZONE_RESET: 8611 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 8612 bdev_nvme_zone_management_done, bio); 8613 case SPDK_BDEV_ZONE_OFFLINE: 8614 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 8615 bdev_nvme_zone_management_done, bio); 8616 default: 8617 return -EINVAL; 8618 } 8619 } 8620 8621 static void 8622 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8623 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 8624 { 8625 struct nvme_io_path *io_path; 8626 struct nvme_ctrlr *nvme_ctrlr; 8627 uint32_t max_xfer_size; 8628 int rc = -ENXIO; 8629 8630 /* Choose the first ctrlr which is not failed. */ 8631 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8632 nvme_ctrlr = io_path->qpair->ctrlr; 8633 8634 /* We should skip any unavailable nvme_ctrlr rather than checking 8635 * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO. 
8636 */ 8637 if (!nvme_ctrlr_is_available(nvme_ctrlr)) { 8638 continue; 8639 } 8640 8641 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 8642 8643 if (nbytes > max_xfer_size) { 8644 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8645 rc = -EINVAL; 8646 goto err; 8647 } 8648 8649 rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes, 8650 bdev_nvme_admin_passthru_done, bio); 8651 if (rc == 0) { 8652 return; 8653 } 8654 } 8655 8656 err: 8657 bdev_nvme_admin_complete(bio, rc); 8658 } 8659 8660 static int 8661 bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8662 void *buf, size_t nbytes) 8663 { 8664 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8665 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8666 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8667 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8668 8669 if (nbytes > max_xfer_size) { 8670 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8671 return -EINVAL; 8672 } 8673 8674 /* 8675 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8676 * so fill it out automatically. 8677 */ 8678 cmd->nsid = spdk_nvme_ns_get_id(ns); 8679 8680 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 8681 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 8682 } 8683 8684 static int 8685 bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd, 8686 void *buf, size_t nbytes, void *md_buf, size_t md_len) 8687 { 8688 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8689 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8690 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8691 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8692 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8693 8694 if (nbytes > max_xfer_size) { 8695 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8696 return -EINVAL; 8697 } 8698 8699 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8700 SPDK_ERRLOG("invalid meta data buffer size\n"); 8701 return -EINVAL; 8702 } 8703 8704 /* 8705 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 8706 * so fill it out automatically. 
8707 */ 8708 cmd->nsid = spdk_nvme_ns_get_id(ns); 8709 8710 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 8711 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 8712 } 8713 8714 static int 8715 bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, 8716 struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt, 8717 size_t nbytes, void *md_buf, size_t md_len) 8718 { 8719 struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns; 8720 struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair; 8721 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 8722 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 8723 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 8724 8725 bio->iovs = iov; 8726 bio->iovcnt = iovcnt; 8727 bio->iovpos = 0; 8728 bio->iov_offset = 0; 8729 8730 if (nbytes > max_xfer_size) { 8731 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 8732 return -EINVAL; 8733 } 8734 8735 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 8736 SPDK_ERRLOG("invalid meta data buffer size\n"); 8737 return -EINVAL; 8738 } 8739 8740 /* 8741 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands 8742 * require a nsid, so fill it out automatically. 8743 */ 8744 cmd->nsid = spdk_nvme_ns_get_id(ns); 8745 8746 return spdk_nvme_ctrlr_cmd_iov_raw_with_md( 8747 ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio, 8748 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); 8749 } 8750 8751 static void 8752 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 8753 struct nvme_bdev_io *bio_to_abort) 8754 { 8755 struct nvme_io_path *io_path; 8756 int rc = 0; 8757 8758 rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort); 8759 if (rc == 0) { 8760 bdev_nvme_admin_complete(bio, 0); 8761 return; 8762 } 8763 8764 io_path = bio_to_abort->io_path; 8765 if (io_path != NULL) { 8766 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8767 io_path->qpair->qpair, 8768 bio_to_abort, 8769 bdev_nvme_abort_done, bio); 8770 } else { 8771 STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) { 8772 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr, 8773 NULL, 8774 bio_to_abort, 8775 bdev_nvme_abort_done, bio); 8776 8777 if (rc != -ENOENT) { 8778 break; 8779 } 8780 } 8781 } 8782 8783 if (rc != 0) { 8784 /* If no command was found or there was any error, complete the abort 8785 * request with failure. 
8786 */ 8787 bdev_nvme_admin_complete(bio, rc); 8788 } 8789 } 8790 8791 static int 8792 bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks, 8793 uint64_t num_blocks) 8794 { 8795 struct spdk_nvme_scc_source_range range = { 8796 .slba = src_offset_blocks, 8797 .nlb = num_blocks - 1 8798 }; 8799 8800 return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns, 8801 bio->io_path->qpair->qpair, 8802 &range, 1, dst_offset_blocks, 8803 bdev_nvme_queued_done, bio); 8804 } 8805 8806 static void 8807 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 8808 { 8809 const char *action; 8810 uint32_t i; 8811 8812 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 8813 action = "reset"; 8814 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 8815 action = "abort"; 8816 } else { 8817 action = "none"; 8818 } 8819 8820 spdk_json_write_object_begin(w); 8821 8822 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 8823 8824 spdk_json_write_named_object_begin(w, "params"); 8825 spdk_json_write_named_string(w, "action_on_timeout", action); 8826 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 8827 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 8828 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 8829 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 8830 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 8831 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 8832 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 8833 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 8834 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 8835 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 8836 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 8837 spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count); 8838 spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count); 8839 spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout); 8840 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec); 8841 spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec); 8842 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec); 8843 spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback); 8844 spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids); 8845 spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos); 8846 spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat); 8847 spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size); 8848 spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat); 8849 spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence); 8850 spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size); 8851 spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms); 8852 spdk_json_write_named_array_begin(w, "dhchap_digests"); 8853 for (i = 0; i < 32; ++i) { 8854 if (g_opts.dhchap_digests & SPDK_BIT(i)) { 8855 
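/* Emit, by name, only the digests whose bits are set in dhchap_digests so the dump can be replayed through bdev_nvme_set_options. */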
spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i)); 8856 } 8857 } 8858 spdk_json_write_array_end(w); 8859 spdk_json_write_named_array_begin(w, "dhchap_dhgroups"); 8860 for (i = 0; i < 32; ++i) { 8861 if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) { 8862 spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i)); 8863 } 8864 } 8865 8866 spdk_json_write_array_end(w); 8867 spdk_json_write_object_end(w); 8868 8869 spdk_json_write_object_end(w); 8870 } 8871 8872 static void 8873 bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx) 8874 { 8875 struct spdk_nvme_transport_id trid; 8876 8877 spdk_json_write_object_begin(w); 8878 8879 spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery"); 8880 8881 spdk_json_write_named_object_begin(w, "params"); 8882 spdk_json_write_named_string(w, "name", ctx->name); 8883 spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn); 8884 8885 trid = ctx->trid; 8886 memset(trid.subnqn, 0, sizeof(trid.subnqn)); 8887 nvme_bdev_dump_trid_json(&trid, w); 8888 8889 spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach); 8890 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec); 8891 spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec); 8892 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8893 ctx->bdev_opts.fast_io_fail_timeout_sec); 8894 spdk_json_write_object_end(w); 8895 8896 spdk_json_write_object_end(w); 8897 } 8898 8899 #ifdef SPDK_CONFIG_NVME_CUSE 8900 static void 8901 nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w, 8902 struct nvme_ctrlr *nvme_ctrlr) 8903 { 8904 size_t cuse_name_size = 128; 8905 char cuse_name[cuse_name_size]; 8906 8907 if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, 8908 cuse_name, &cuse_name_size) != 0) { 8909 return; 8910 } 8911 8912 spdk_json_write_object_begin(w); 8913 8914 spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register"); 8915 8916 spdk_json_write_named_object_begin(w, "params"); 8917 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8918 spdk_json_write_object_end(w); 8919 8920 spdk_json_write_object_end(w); 8921 } 8922 #endif 8923 8924 static void 8925 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 8926 struct nvme_ctrlr *nvme_ctrlr, 8927 struct nvme_path_id *path_id) 8928 { 8929 struct spdk_nvme_transport_id *trid; 8930 const struct spdk_nvme_ctrlr_opts *opts; 8931 8932 if (nvme_ctrlr->opts.from_discovery_service) { 8933 /* Do not emit an RPC for this - it will be implicitly 8934 * covered by a separate bdev_nvme_start_discovery or 8935 * bdev_nvme_start_mdns_discovery RPC. 
8936 */ 8937 return; 8938 } 8939 8940 trid = &path_id->trid; 8941 8942 spdk_json_write_object_begin(w); 8943 8944 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 8945 8946 spdk_json_write_named_object_begin(w, "params"); 8947 spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name); 8948 nvme_bdev_dump_trid_json(trid, w); 8949 spdk_json_write_named_bool(w, "prchk_reftag", 8950 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 8951 spdk_json_write_named_bool(w, "prchk_guard", 8952 (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 8953 spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec); 8954 spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec); 8955 spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", 8956 nvme_ctrlr->opts.fast_io_fail_timeout_sec); 8957 if (nvme_ctrlr->psk != NULL) { 8958 spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk)); 8959 } 8960 if (nvme_ctrlr->dhchap_key != NULL) { 8961 spdk_json_write_named_string(w, "dhchap_key", 8962 spdk_key_get_name(nvme_ctrlr->dhchap_key)); 8963 } 8964 if (nvme_ctrlr->dhchap_ctrlr_key != NULL) { 8965 spdk_json_write_named_string(w, "dhchap_ctrlr_key", 8966 spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key)); 8967 } 8968 opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr); 8969 spdk_json_write_named_string(w, "hostnqn", opts->hostnqn); 8970 spdk_json_write_named_bool(w, "hdgst", opts->header_digest); 8971 spdk_json_write_named_bool(w, "ddgst", opts->data_digest); 8972 if (opts->src_addr[0] != '\0') { 8973 spdk_json_write_named_string(w, "hostaddr", opts->src_addr); 8974 } 8975 if (opts->src_svcid[0] != '\0') { 8976 spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid); 8977 } 8978 8979 if (nvme_ctrlr->opts.multipath) { 8980 spdk_json_write_named_string(w, "multipath", "multipath"); 8981 } 8982 spdk_json_write_object_end(w); 8983 8984 spdk_json_write_object_end(w); 8985 } 8986 8987 static void 8988 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 8989 { 8990 spdk_json_write_object_begin(w); 8991 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 8992 8993 spdk_json_write_named_object_begin(w, "params"); 8994 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 8995 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 8996 spdk_json_write_object_end(w); 8997 8998 spdk_json_write_object_end(w); 8999 } 9000 9001 static int 9002 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 9003 { 9004 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9005 struct nvme_ctrlr *nvme_ctrlr; 9006 struct discovery_ctx *ctx; 9007 struct nvme_path_id *path_id; 9008 9009 bdev_nvme_opts_config_json(w); 9010 9011 pthread_mutex_lock(&g_bdev_nvme_mutex); 9012 9013 TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 9014 TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) { 9015 path_id = nvme_ctrlr->active_path_id; 9016 assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids)); 9017 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 9018 9019 path_id = TAILQ_NEXT(path_id, link); 9020 while (path_id != NULL) { 9021 nvme_ctrlr_config_json(w, nvme_ctrlr, path_id); 9022 path_id = TAILQ_NEXT(path_id, link); 9023 } 9024 9025 #ifdef SPDK_CONFIG_NVME_CUSE 9026 nvme_ctrlr_cuse_config_json(w, nvme_ctrlr); 9027 #endif 9028 } 9029 } 9030 9031 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9032 if (!ctx->from_mdns_discovery_service) { 
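/* Discovery services managed via mDNS are dumped separately by bdev_nvme_mdns_discovery_config_json() below. */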
9033 bdev_nvme_discovery_config_json(w, ctx); 9034 } 9035 } 9036 9037 bdev_nvme_mdns_discovery_config_json(w); 9038 9039 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 9040 * before enabling hotplug poller. 9041 */ 9042 bdev_nvme_hotplug_config_json(w); 9043 9044 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9045 return 0; 9046 } 9047 9048 struct spdk_nvme_ctrlr * 9049 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 9050 { 9051 struct nvme_bdev *nbdev; 9052 struct nvme_ns *nvme_ns; 9053 9054 if (!bdev || bdev->module != &nvme_if) { 9055 return NULL; 9056 } 9057 9058 nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk); 9059 nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list); 9060 assert(nvme_ns != NULL); 9061 9062 return nvme_ns->ctrlr->ctrlr; 9063 } 9064 9065 static bool 9066 nvme_io_path_is_current(struct nvme_io_path *io_path) 9067 { 9068 const struct nvme_bdev_channel *nbdev_ch; 9069 bool current; 9070 9071 if (!nvme_io_path_is_available(io_path)) { 9072 return false; 9073 } 9074 9075 nbdev_ch = io_path->nbdev_ch; 9076 if (nbdev_ch == NULL) { 9077 current = false; 9078 } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) { 9079 struct nvme_io_path *optimized_io_path = NULL; 9080 9081 STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) { 9082 if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) { 9083 break; 9084 } 9085 } 9086 9087 /* A non-optimized path is only current if there are no optimized paths. */ 9088 current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) || 9089 (optimized_io_path == NULL); 9090 } else { 9091 if (nbdev_ch->current_io_path) { 9092 current = (io_path == nbdev_ch->current_io_path); 9093 } else { 9094 struct nvme_io_path *first_path; 9095 9096 /* We arrived here as there are no optimized paths for active-passive 9097 * mode. Check if this io_path is the first one available on the list. 9098 */ 9099 current = false; 9100 STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) { 9101 if (nvme_io_path_is_available(first_path)) { 9102 current = (io_path == first_path); 9103 break; 9104 } 9105 } 9106 } 9107 } 9108 9109 return current; 9110 } 9111 9112 static struct nvme_ctrlr * 9113 bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev) 9114 { 9115 struct nvme_ctrlr *next; 9116 9117 /* Must be called under g_bdev_nvme_mutex */ 9118 next = prev != NULL ? 
TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs); 9119 while (next != NULL) { 9120 /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */ 9121 pthread_mutex_lock(&next->mutex); 9122 if (next->ref > 0) { 9123 next->ref++; 9124 pthread_mutex_unlock(&next->mutex); 9125 return next; 9126 } 9127 9128 pthread_mutex_unlock(&next->mutex); 9129 next = TAILQ_NEXT(next, tailq); 9130 } 9131 9132 return NULL; 9133 } 9134 9135 struct bdev_nvme_set_keys_ctx { 9136 struct nvme_ctrlr *nctrlr; 9137 struct spdk_key *dhchap_key; 9138 struct spdk_key *dhchap_ctrlr_key; 9139 struct spdk_thread *thread; 9140 bdev_nvme_set_keys_cb cb_fn; 9141 void *cb_ctx; 9142 int status; 9143 }; 9144 9145 static void 9146 bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx) 9147 { 9148 if (ctx == NULL) { 9149 return; 9150 } 9151 9152 spdk_keyring_put_key(ctx->dhchap_key); 9153 spdk_keyring_put_key(ctx->dhchap_ctrlr_key); 9154 free(ctx); 9155 } 9156 9157 static void 9158 _bdev_nvme_set_keys_done(void *_ctx) 9159 { 9160 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9161 9162 ctx->cb_fn(ctx->cb_ctx, ctx->status); 9163 9164 if (ctx->nctrlr != NULL) { 9165 nvme_ctrlr_put_ref(ctx->nctrlr); 9166 } 9167 bdev_nvme_free_set_keys_ctx(ctx); 9168 } 9169 9170 static void 9171 bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status) 9172 { 9173 ctx->status = status; 9174 spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx); 9175 } 9176 9177 static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx); 9178 9179 static void 9180 bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx) 9181 { 9182 struct nvme_ctrlr *next; 9183 9184 pthread_mutex_lock(&g_bdev_nvme_mutex); 9185 next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr); 9186 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9187 9188 nvme_ctrlr_put_ref(ctx->nctrlr); 9189 ctx->nctrlr = next; 9190 9191 if (next == NULL) { 9192 bdev_nvme_set_keys_done(ctx, 0); 9193 } else { 9194 bdev_nvme_authenticate_ctrlr(ctx); 9195 } 9196 } 9197 9198 static void 9199 bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status) 9200 { 9201 struct bdev_nvme_set_keys_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 9202 9203 if (status != 0) { 9204 bdev_nvme_set_keys_done(ctx, status); 9205 return; 9206 } 9207 bdev_nvme_authenticate_ctrlr_continue(ctx); 9208 } 9209 9210 static void 9211 bdev_nvme_authenticate_qpair_done(void *ctx, int status) 9212 { 9213 spdk_for_each_channel_continue(ctx, status); 9214 } 9215 9216 static void 9217 bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i) 9218 { 9219 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9220 struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch); 9221 struct nvme_qpair *qpair = ctrlr_ch->qpair; 9222 int rc; 9223 9224 if (!nvme_qpair_is_connected(qpair)) { 9225 spdk_for_each_channel_continue(i, 0); 9226 return; 9227 } 9228 9229 rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i); 9230 if (rc != 0) { 9231 spdk_for_each_channel_continue(i, rc); 9232 } 9233 } 9234 9235 static void 9236 bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status) 9237 { 9238 struct bdev_nvme_set_keys_ctx *ctx = _ctx; 9239 9240 if (status != 0) { 9241 bdev_nvme_set_keys_done(ctx, status); 9242 return; 9243 } 9244 9245 spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx, 9246 bdev_nvme_authenticate_qpairs_done); 9247 } 9248 9249 static void 9250 bdev_nvme_authenticate_ctrlr(struct 
bdev_nvme_set_keys_ctx *ctx) 9251 { 9252 struct spdk_nvme_ctrlr_key_opts opts = {}; 9253 struct nvme_ctrlr *nctrlr = ctx->nctrlr; 9254 int rc; 9255 9256 opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key); 9257 opts.dhchap_key = ctx->dhchap_key; 9258 opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key; 9259 rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts); 9260 if (rc != 0) { 9261 bdev_nvme_set_keys_done(ctx, rc); 9262 return; 9263 } 9264 9265 if (ctx->dhchap_key != NULL) { 9266 rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr, 9267 bdev_nvme_authenticate_ctrlr_done, ctx); 9268 if (rc != 0) { 9269 bdev_nvme_set_keys_done(ctx, rc); 9270 } 9271 } else { 9272 bdev_nvme_authenticate_ctrlr_continue(ctx); 9273 } 9274 } 9275 9276 int 9277 bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key, 9278 bdev_nvme_set_keys_cb cb_fn, void *cb_ctx) 9279 { 9280 struct bdev_nvme_set_keys_ctx *ctx; 9281 struct nvme_bdev_ctrlr *nbdev_ctrlr; 9282 struct nvme_ctrlr *nctrlr; 9283 9284 ctx = calloc(1, sizeof(*ctx)); 9285 if (ctx == NULL) { 9286 return -ENOMEM; 9287 } 9288 9289 if (dhchap_key != NULL) { 9290 ctx->dhchap_key = spdk_keyring_get_key(dhchap_key); 9291 if (ctx->dhchap_key == NULL) { 9292 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name); 9293 bdev_nvme_free_set_keys_ctx(ctx); 9294 return -ENOKEY; 9295 } 9296 } 9297 if (dhchap_ctrlr_key != NULL) { 9298 ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key); 9299 if (ctx->dhchap_ctrlr_key == NULL) { 9300 SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name); 9301 bdev_nvme_free_set_keys_ctx(ctx); 9302 return -ENOKEY; 9303 } 9304 } 9305 9306 pthread_mutex_lock(&g_bdev_nvme_mutex); 9307 nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 9308 if (nbdev_ctrlr == NULL) { 9309 SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name); 9310 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9311 bdev_nvme_free_set_keys_ctx(ctx); 9312 return -ENODEV; 9313 } 9314 nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL); 9315 if (nctrlr == NULL) { 9316 SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name); 9317 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9318 bdev_nvme_free_set_keys_ctx(ctx); 9319 return -ENODEV; 9320 } 9321 pthread_mutex_unlock(&g_bdev_nvme_mutex); 9322 9323 ctx->nctrlr = nctrlr; 9324 ctx->cb_fn = cb_fn; 9325 ctx->cb_ctx = cb_ctx; 9326 ctx->thread = spdk_get_thread(); 9327 9328 bdev_nvme_authenticate_ctrlr(ctx); 9329 9330 return 0; 9331 } 9332 9333 void 9334 nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path) 9335 { 9336 struct nvme_ns *nvme_ns = io_path->nvme_ns; 9337 struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr; 9338 const struct spdk_nvme_ctrlr_data *cdata; 9339 const struct spdk_nvme_transport_id *trid; 9340 const char *adrfam_str; 9341 9342 spdk_json_write_object_begin(w); 9343 9344 spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name); 9345 9346 cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr); 9347 trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr); 9348 9349 spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid); 9350 spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path)); 9351 spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair)); 9352 spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns)); 9353 9354 spdk_json_write_named_object_begin(w, "transport"); 9355 spdk_json_write_named_string(w, "trtype", trid->trstring); 9356 
spdk_json_write_named_string(w, "traddr", trid->traddr); 9357 if (trid->trsvcid[0] != '\0') { 9358 spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); 9359 } 9360 adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); 9361 if (adrfam_str) { 9362 spdk_json_write_named_string(w, "adrfam", adrfam_str); 9363 } 9364 spdk_json_write_object_end(w); 9365 9366 spdk_json_write_object_end(w); 9367 } 9368 9369 void 9370 bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w) 9371 { 9372 struct discovery_ctx *ctx; 9373 struct discovery_entry_ctx *entry_ctx; 9374 9375 spdk_json_write_array_begin(w); 9376 TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) { 9377 spdk_json_write_object_begin(w); 9378 spdk_json_write_named_string(w, "name", ctx->name); 9379 9380 spdk_json_write_named_object_begin(w, "trid"); 9381 nvme_bdev_dump_trid_json(&ctx->trid, w); 9382 spdk_json_write_object_end(w); 9383 9384 spdk_json_write_named_array_begin(w, "referrals"); 9385 TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) { 9386 spdk_json_write_object_begin(w); 9387 spdk_json_write_named_object_begin(w, "trid"); 9388 nvme_bdev_dump_trid_json(&entry_ctx->trid, w); 9389 spdk_json_write_object_end(w); 9390 spdk_json_write_object_end(w); 9391 } 9392 spdk_json_write_array_end(w); 9393 9394 spdk_json_write_object_end(w); 9395 } 9396 spdk_json_write_array_end(w); 9397 } 9398 9399 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 9400 9401 static void 9402 bdev_nvme_trace(void) 9403 { 9404 struct spdk_trace_tpoint_opts opts[] = { 9405 { 9406 "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START, 9407 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1, 9408 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9409 }, 9410 { 9411 "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE, 9412 OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0, 9413 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9414 } 9415 }; 9416 9417 9418 spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N'); 9419 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9420 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9421 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0); 9422 spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9423 spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0); 9424 } 9425 SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME) 9426