/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"
#include "bdev_ocssd.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** array of iovecs to transfer for the fused (second) command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in fused_iovs. */
	int fused_iovpos;

	/** Offset in current iovec of fused_iovs.
*/ 82 uint32_t fused_iov_offset; 83 84 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 85 struct spdk_nvme_cpl cpl; 86 87 /** Originating thread */ 88 struct spdk_thread *orig_thread; 89 90 /** Keeps track if first of fused commands was submitted */ 91 bool first_fused_submitted; 92 93 /** Temporary pointer to zone report buffer */ 94 struct spdk_nvme_zns_zone_report *zone_report_buf; 95 96 /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */ 97 uint64_t handled_zones; 98 }; 99 100 struct nvme_probe_ctx { 101 size_t count; 102 struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; 103 struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; 104 const char *names[NVME_MAX_CONTROLLERS]; 105 uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; 106 const char *hostnqn; 107 }; 108 109 struct nvme_probe_skip_entry { 110 struct spdk_nvme_transport_id trid; 111 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 112 }; 113 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 114 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 115 g_skipped_nvme_ctrlrs); 116 117 static struct spdk_bdev_nvme_opts g_opts = { 118 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 119 .timeout_us = 0, 120 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 121 .retry_count = 4, 122 .arbitration_burst = 0, 123 .low_priority_weight = 0, 124 .medium_priority_weight = 0, 125 .high_priority_weight = 0, 126 .nvme_adminq_poll_period_us = 10000ULL, 127 .nvme_ioq_poll_period_us = 0, 128 .io_queue_requests = 0, 129 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 130 }; 131 132 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 133 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 134 135 static int g_hot_insert_nvme_controller_index = 0; 136 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 137 static bool g_nvme_hotplug_enabled = false; 138 static struct spdk_thread *g_bdev_nvme_init_thread; 139 static struct spdk_poller *g_hotplug_poller; 140 static struct spdk_poller *g_hotplug_probe_poller; 141 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 142 143 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 144 struct nvme_async_probe_ctx *ctx); 145 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 146 struct nvme_async_probe_ctx *ctx); 147 static int bdev_nvme_library_init(void); 148 static void bdev_nvme_library_fini(void); 149 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 150 struct nvme_bdev_io *bio, 151 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 152 uint32_t flags); 153 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 154 struct nvme_bdev_io *bio, 155 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); 156 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 157 struct nvme_bdev_io *bio, 158 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 159 uint32_t flags); 160 static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 161 struct nvme_bdev_io *bio, 162 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, 163 uint64_t zslba, uint32_t flags); 164 static int bdev_nvme_comparev(struct 
spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 165 struct nvme_bdev_io *bio, 166 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 167 uint32_t flags); 168 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, 169 struct spdk_nvme_qpair *qpair, 170 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 171 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 172 uint32_t flags); 173 static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 174 struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 175 struct spdk_bdev_zone_info *info); 176 static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 177 struct nvme_bdev_io *bio, uint64_t zone_id, 178 enum spdk_bdev_zone_action action); 179 static int bdev_nvme_admin_passthru(struct nvme_io_path *io_path, 180 struct nvme_bdev_io *bio, 181 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 182 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 183 struct nvme_bdev_io *bio, 184 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 185 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 186 struct nvme_bdev_io *bio, 187 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); 188 static int bdev_nvme_abort(struct nvme_io_path *io_path, 189 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 190 static int bdev_nvme_reset(struct nvme_io_path *io_path, struct spdk_bdev_io *bdev_io); 191 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove); 192 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 193 194 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 195 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 196 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 197 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 198 199 static populate_namespace_fn g_populate_namespace_fn[] = { 200 NULL, 201 nvme_ctrlr_populate_standard_namespace, 202 bdev_ocssd_populate_namespace, 203 }; 204 205 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns); 206 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns); 207 208 static depopulate_namespace_fn g_depopulate_namespace_fn[] = { 209 NULL, 210 nvme_ctrlr_depopulate_standard_namespace, 211 bdev_ocssd_depopulate_namespace, 212 }; 213 214 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, 215 struct nvme_bdev_ns *nvme_ns); 216 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 217 struct nvme_bdev_ns *nvme_ns); 218 219 static config_json_namespace_fn g_config_json_namespace_fn[] = { 220 NULL, 221 nvme_ctrlr_config_json_standard_namespace, 222 bdev_ocssd_namespace_config_json, 223 }; 224 225 struct spdk_nvme_qpair * 226 bdev_nvme_get_io_qpair(struct spdk_io_channel *io_path_ch) 227 { 228 struct nvme_io_path *io_path; 229 230 assert(io_path_ch != NULL); 231 232 io_path = spdk_io_channel_get_ctx(io_path_ch); 233 234 return io_path->qpair; 235 } 236 237 static int 238 bdev_nvme_get_ctx_size(void) 239 { 240 return sizeof(struct nvme_bdev_io); 241 } 242 243 static struct spdk_bdev_module nvme_if = { 244 .name = "nvme", 245 .async_fini = true, 246 .module_init = bdev_nvme_library_init, 247 .module_fini = 
bdev_nvme_library_fini, 248 .config_json = bdev_nvme_config_json, 249 .get_ctx_size = bdev_nvme_get_ctx_size, 250 251 }; 252 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 253 254 static inline bool 255 bdev_nvme_find_io_path(struct nvme_bdev *nbdev, struct nvme_io_path *io_path, 256 struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair) 257 { 258 if (spdk_unlikely(io_path->qpair == NULL)) { 259 /* The device is currently resetting. */ 260 return false; 261 } 262 263 *_ns = nbdev->nvme_ns->ns; 264 *_qpair = io_path->qpair; 265 return true; 266 } 267 268 static inline bool 269 bdev_nvme_find_admin_path(struct nvme_io_path *io_path, 270 struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr) 271 { 272 *_nvme_bdev_ctrlr = io_path->ctrlr; 273 return true; 274 } 275 276 static inline void 277 bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio, 278 const struct spdk_nvme_cpl *cpl) 279 { 280 spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0, 281 cpl->status.sct, cpl->status.sc); 282 } 283 284 static inline void 285 bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc) 286 { 287 enum spdk_bdev_io_status io_status; 288 289 if (rc == 0) { 290 io_status = SPDK_BDEV_IO_STATUS_SUCCESS; 291 } else if (rc == -ENOMEM) { 292 io_status = SPDK_BDEV_IO_STATUS_NOMEM; 293 } else { 294 io_status = SPDK_BDEV_IO_STATUS_FAILED; 295 } 296 297 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status); 298 } 299 300 static void 301 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 302 { 303 int rc; 304 305 SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair); 306 /* 307 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will 308 * reconnect a qpair and we will stop getting a callback for this one. 309 */ 310 rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair); 311 if (rc != 0) { 312 SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc); 313 } 314 } 315 316 static int 317 bdev_nvme_poll(void *arg) 318 { 319 struct nvme_bdev_poll_group *group = arg; 320 int64_t num_completions; 321 322 if (group->collect_spin_stat && group->start_ticks == 0) { 323 group->start_ticks = spdk_get_ticks(); 324 } 325 326 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 327 bdev_nvme_disconnected_qpair_cb); 328 if (group->collect_spin_stat) { 329 if (num_completions > 0) { 330 if (group->end_ticks != 0) { 331 group->spin_ticks += (group->end_ticks - group->start_ticks); 332 group->end_ticks = 0; 333 } 334 group->start_ticks = 0; 335 } else { 336 group->end_ticks = spdk_get_ticks(); 337 } 338 } 339 340 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 341 } 342 343 static int 344 bdev_nvme_poll_adminq(void *arg) 345 { 346 int32_t rc; 347 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 348 349 assert(nvme_bdev_ctrlr != NULL); 350 351 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); 352 if (rc < 0) { 353 bdev_nvme_failover(nvme_bdev_ctrlr, false); 354 } 355 356 return rc == 0 ? 
SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 357 } 358 359 static int 360 bdev_nvme_destruct(void *ctx) 361 { 362 struct nvme_bdev *nvme_disk = ctx; 363 struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns; 364 365 pthread_mutex_lock(&nvme_ns->ctrlr->mutex); 366 367 nvme_ns->bdev = NULL; 368 369 if (!nvme_ns->populated) { 370 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 371 372 nvme_bdev_ctrlr_destruct(nvme_ns->ctrlr); 373 } else { 374 pthread_mutex_unlock(&nvme_ns->ctrlr->mutex); 375 } 376 377 free(nvme_disk->disk.name); 378 free(nvme_disk); 379 380 return 0; 381 } 382 383 static int 384 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 385 struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 386 { 387 bdev_nvme_io_complete(bio, 0); 388 389 return 0; 390 } 391 392 static int 393 bdev_nvme_create_qpair(struct nvme_io_path *io_path) 394 { 395 struct spdk_nvme_ctrlr *ctrlr = io_path->ctrlr->ctrlr; 396 struct spdk_nvme_io_qpair_opts opts; 397 struct spdk_nvme_qpair *qpair; 398 int rc; 399 400 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 401 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 402 opts.create_only = true; 403 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 404 g_opts.io_queue_requests = opts.io_queue_requests; 405 406 qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); 407 if (qpair == NULL) { 408 return -1; 409 } 410 411 assert(io_path->group != NULL); 412 413 rc = spdk_nvme_poll_group_add(io_path->group->group, qpair); 414 if (rc != 0) { 415 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 416 goto err; 417 } 418 419 rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair); 420 if (rc != 0) { 421 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 422 goto err; 423 } 424 425 io_path->qpair = qpair; 426 427 return 0; 428 429 err: 430 spdk_nvme_ctrlr_free_io_qpair(qpair); 431 432 return rc; 433 } 434 435 static int 436 bdev_nvme_destroy_qpair(struct nvme_io_path *io_path) 437 { 438 int rc; 439 440 if (io_path->qpair == NULL) { 441 return 0; 442 } 443 444 rc = spdk_nvme_ctrlr_free_io_qpair(io_path->qpair); 445 if (!rc) { 446 io_path->qpair = NULL; 447 } 448 return rc; 449 } 450 451 static void 452 _bdev_nvme_check_pending_destruct(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 453 { 454 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 455 if (nvme_bdev_ctrlr->destruct_after_reset) { 456 assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct); 457 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 458 459 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_unregister, 460 nvme_bdev_ctrlr); 461 } else { 462 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 463 } 464 } 465 466 static void 467 bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status) 468 { 469 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i); 470 471 _bdev_nvme_check_pending_destruct(nvme_bdev_ctrlr); 472 } 473 474 static void 475 _bdev_nvme_complete_pending_resets(struct nvme_io_path *io_path, 476 enum spdk_bdev_io_status status) 477 { 478 struct spdk_bdev_io *bdev_io; 479 480 while (!TAILQ_EMPTY(&io_path->pending_resets)) { 481 bdev_io = TAILQ_FIRST(&io_path->pending_resets); 482 TAILQ_REMOVE(&io_path->pending_resets, bdev_io, module_link); 483 spdk_bdev_io_complete(bdev_io, status); 484 } 485 } 486 487 static void 488 bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 489 { 490 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 491 struct 
nvme_io_path *io_path = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(io_path, SPDK_BDEV_IO_STATUS_SUCCESS);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(io_path, SPDK_BDEV_IO_STATUS_FAILED);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_reset_io_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
			    struct spdk_bdev_io *bdev_io, int rc)
{
	enum spdk_bdev_io_status io_status = SPDK_BDEV_IO_STATUS_SUCCESS;

	if (rc) {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(bdev_io, io_status);

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      rc == 0 ? bdev_nvme_complete_pending_resets :
			      bdev_nvme_abort_pending_resets,
			      nvme_bdev_ctrlr,
			      bdev_nvme_check_pending_destruct);
}

static void
_bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
{
	struct nvme_bdev_ctrlr_trid *curr_trid;
	struct spdk_bdev_io *bdev_io = nvme_bdev_ctrlr->reset_bdev_io;

	nvme_bdev_ctrlr->reset_bdev_io = NULL;

	if (rc) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	nvme_bdev_ctrlr->resetting = false;
	nvme_bdev_ctrlr->failover_in_progress = false;

	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
	assert(curr_trid != NULL);
	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);

	curr_trid->is_failed = rc != 0 ? true : false;

	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
		/* Destruct ctrlr after clearing pending resets. */
		nvme_bdev_ctrlr->destruct_after_reset = true;
	}

	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

	if (bdev_io) {
		bdev_nvme_reset_io_complete(nvme_bdev_ctrlr, bdev_io, rc);
	} else {
		/* Make sure we clear any pending resets before returning. */
		spdk_for_each_channel(nvme_bdev_ctrlr,
				      rc == 0 ? bdev_nvme_complete_pending_resets :
				      bdev_nvme_abort_pending_resets,
				      nvme_bdev_ctrlr,
				      bdev_nvme_check_pending_destruct);
	}
}

static void
_bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);

	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
}

static void
_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(io_path);

	spdk_for_each_channel_continue(i, rc);
}

static void
_bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
	int rc;

	if (status) {
		rc = status;
		goto err;
	}

	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
	if (rc != 0) {
		goto err;
	}

	/* Recreate all of the I/O queue pairs */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_create_qpair,
			      nvme_bdev_ctrlr,
			      _bdev_nvme_reset_create_qpairs_done);
	return;

err:
	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
}

static void
_bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch);
	int rc;

	rc = bdev_nvme_destroy_qpair(io_path);

	spdk_for_each_channel_continue(i, rc);
}

static int
_bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
{
	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		return -EBUSY;
	}

	if (nvme_bdev_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EAGAIN;
	}

	nvme_bdev_ctrlr->resetting = true;
	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_destroy_qpair,
			      nvme_bdev_ctrlr,
			      _bdev_nvme_reset_ctrlr);

	return 0;
}

static int
bdev_nvme_reset(struct nvme_io_path *io_path, struct spdk_bdev_io *bdev_io)
{
	int rc;

	rc = _bdev_nvme_reset(io_path->ctrlr);
	if (rc == 0) {
		assert(io_path->ctrlr->reset_bdev_io == NULL);
		io_path->ctrlr->reset_bdev_io = bdev_io;
	} else if (rc == -EAGAIN) {
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		TAILQ_INSERT_TAIL(&io_path->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

static int
_bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
{
	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_bdev_ctrlr->resetting) {
		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_bdev_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_bdev_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/* Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
	return 0;
}

static int
bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
{
	int rc;

	rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove);
	if (rc == 0) {
		/* First, delete all NVMe I/O queue pairs.
*/ 747 spdk_for_each_channel(nvme_bdev_ctrlr, 748 _bdev_nvme_reset_destroy_qpair, 749 nvme_bdev_ctrlr, 750 _bdev_nvme_reset_ctrlr); 751 } else if (rc != -EBUSY) { 752 return rc; 753 } 754 755 return 0; 756 } 757 758 static int 759 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 760 struct nvme_bdev_io *bio, 761 uint64_t offset_blocks, 762 uint64_t num_blocks); 763 764 static void 765 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 766 bool success) 767 { 768 struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx; 769 struct spdk_bdev *bdev = bdev_io->bdev; 770 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 771 struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch); 772 struct spdk_nvme_ns *ns; 773 struct spdk_nvme_qpair *qpair; 774 int ret; 775 776 if (!success) { 777 ret = -EINVAL; 778 goto exit; 779 } 780 781 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair))) { 782 ret = -ENXIO; 783 goto exit; 784 } 785 786 ret = bdev_nvme_readv(ns, 787 qpair, 788 bio, 789 bdev_io->u.bdev.iovs, 790 bdev_io->u.bdev.iovcnt, 791 bdev_io->u.bdev.md_buf, 792 bdev_io->u.bdev.num_blocks, 793 bdev_io->u.bdev.offset_blocks, 794 bdev->dif_check_flags); 795 796 exit: 797 if (spdk_unlikely(ret != 0)) { 798 bdev_nvme_io_complete(bio, ret); 799 } 800 } 801 802 static void 803 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 804 { 805 struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch); 806 struct spdk_bdev *bdev = bdev_io->bdev; 807 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 808 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 809 struct nvme_bdev_io *nbdev_io_to_abort; 810 struct spdk_nvme_ns *ns; 811 struct spdk_nvme_qpair *qpair; 812 int rc = 0; 813 814 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair))) { 815 rc = -ENXIO; 816 goto exit; 817 } 818 819 switch (bdev_io->type) { 820 case SPDK_BDEV_IO_TYPE_READ: 821 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 822 rc = bdev_nvme_readv(ns, 823 qpair, 824 nbdev_io, 825 bdev_io->u.bdev.iovs, 826 bdev_io->u.bdev.iovcnt, 827 bdev_io->u.bdev.md_buf, 828 bdev_io->u.bdev.num_blocks, 829 bdev_io->u.bdev.offset_blocks, 830 bdev->dif_check_flags); 831 } else { 832 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 833 bdev_io->u.bdev.num_blocks * bdev->blocklen); 834 rc = 0; 835 } 836 break; 837 case SPDK_BDEV_IO_TYPE_WRITE: 838 rc = bdev_nvme_writev(ns, 839 qpair, 840 nbdev_io, 841 bdev_io->u.bdev.iovs, 842 bdev_io->u.bdev.iovcnt, 843 bdev_io->u.bdev.md_buf, 844 bdev_io->u.bdev.num_blocks, 845 bdev_io->u.bdev.offset_blocks, 846 bdev->dif_check_flags); 847 break; 848 case SPDK_BDEV_IO_TYPE_COMPARE: 849 rc = bdev_nvme_comparev(ns, 850 qpair, 851 nbdev_io, 852 bdev_io->u.bdev.iovs, 853 bdev_io->u.bdev.iovcnt, 854 bdev_io->u.bdev.md_buf, 855 bdev_io->u.bdev.num_blocks, 856 bdev_io->u.bdev.offset_blocks, 857 bdev->dif_check_flags); 858 break; 859 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 860 rc = bdev_nvme_comparev_and_writev(ns, 861 qpair, 862 nbdev_io, 863 bdev_io->u.bdev.iovs, 864 bdev_io->u.bdev.iovcnt, 865 bdev_io->u.bdev.fused_iovs, 866 bdev_io->u.bdev.fused_iovcnt, 867 bdev_io->u.bdev.md_buf, 868 bdev_io->u.bdev.num_blocks, 869 bdev_io->u.bdev.offset_blocks, 870 bdev->dif_check_flags); 871 break; 872 case SPDK_BDEV_IO_TYPE_UNMAP: 873 rc = bdev_nvme_unmap(ns, 874 qpair, 875 nbdev_io, 876 bdev_io->u.bdev.offset_blocks, 877 bdev_io->u.bdev.num_blocks); 
878 break; 879 case SPDK_BDEV_IO_TYPE_RESET: 880 rc = bdev_nvme_reset(io_path, bdev_io); 881 break; 882 case SPDK_BDEV_IO_TYPE_FLUSH: 883 rc = bdev_nvme_flush(ns, 884 qpair, 885 nbdev_io, 886 bdev_io->u.bdev.offset_blocks, 887 bdev_io->u.bdev.num_blocks); 888 break; 889 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 890 rc = bdev_nvme_zone_appendv(ns, 891 qpair, 892 nbdev_io, 893 bdev_io->u.bdev.iovs, 894 bdev_io->u.bdev.iovcnt, 895 bdev_io->u.bdev.md_buf, 896 bdev_io->u.bdev.num_blocks, 897 bdev_io->u.bdev.offset_blocks, 898 bdev->dif_check_flags); 899 break; 900 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 901 rc = bdev_nvme_get_zone_info(ns, 902 qpair, 903 nbdev_io, 904 bdev_io->u.zone_mgmt.zone_id, 905 bdev_io->u.zone_mgmt.num_zones, 906 bdev_io->u.zone_mgmt.buf); 907 break; 908 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 909 rc = bdev_nvme_zone_management(ns, 910 qpair, 911 nbdev_io, 912 bdev_io->u.zone_mgmt.zone_id, 913 bdev_io->u.zone_mgmt.zone_action); 914 break; 915 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 916 rc = bdev_nvme_admin_passthru(io_path, 917 nbdev_io, 918 &bdev_io->u.nvme_passthru.cmd, 919 bdev_io->u.nvme_passthru.buf, 920 bdev_io->u.nvme_passthru.nbytes); 921 break; 922 case SPDK_BDEV_IO_TYPE_NVME_IO: 923 rc = bdev_nvme_io_passthru(ns, 924 qpair, 925 nbdev_io, 926 &bdev_io->u.nvme_passthru.cmd, 927 bdev_io->u.nvme_passthru.buf, 928 bdev_io->u.nvme_passthru.nbytes); 929 break; 930 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 931 rc = bdev_nvme_io_passthru_md(ns, 932 qpair, 933 nbdev_io, 934 &bdev_io->u.nvme_passthru.cmd, 935 bdev_io->u.nvme_passthru.buf, 936 bdev_io->u.nvme_passthru.nbytes, 937 bdev_io->u.nvme_passthru.md_buf, 938 bdev_io->u.nvme_passthru.md_len); 939 break; 940 case SPDK_BDEV_IO_TYPE_ABORT: 941 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 942 rc = bdev_nvme_abort(io_path, 943 nbdev_io, 944 nbdev_io_to_abort); 945 break; 946 default: 947 rc = -EINVAL; 948 break; 949 } 950 951 exit: 952 if (spdk_unlikely(rc != 0)) { 953 bdev_nvme_io_complete(nbdev_io, rc); 954 } 955 } 956 957 static bool 958 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 959 { 960 struct nvme_bdev *nbdev = ctx; 961 struct nvme_bdev_ns *nvme_ns; 962 struct spdk_nvme_ns *ns; 963 struct spdk_nvme_ctrlr *ctrlr; 964 const struct spdk_nvme_ctrlr_data *cdata; 965 966 nvme_ns = nbdev->nvme_ns; 967 assert(nvme_ns != NULL); 968 ns = nvme_ns->ns; 969 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 970 971 switch (io_type) { 972 case SPDK_BDEV_IO_TYPE_READ: 973 case SPDK_BDEV_IO_TYPE_WRITE: 974 case SPDK_BDEV_IO_TYPE_RESET: 975 case SPDK_BDEV_IO_TYPE_FLUSH: 976 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 977 case SPDK_BDEV_IO_TYPE_NVME_IO: 978 case SPDK_BDEV_IO_TYPE_ABORT: 979 return true; 980 981 case SPDK_BDEV_IO_TYPE_COMPARE: 982 return spdk_nvme_ns_supports_compare(ns); 983 984 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 985 return spdk_nvme_ns_get_md_size(ns) ? true : false; 986 987 case SPDK_BDEV_IO_TYPE_UNMAP: 988 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 989 return cdata->oncs.dsm; 990 991 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 992 /* 993 * The NVMe controller write_zeroes function is currently not used by our driver. 994 * NVMe write zeroes is limited to 16-bit block count, and the bdev layer currently 995 * has no mechanism for reporting a max write zeroes block count, nor ability to 996 * split a write zeroes request. 
997 */ 998 return false; 999 1000 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 1001 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 1002 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 1003 return true; 1004 } 1005 return false; 1006 1007 case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO: 1008 case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT: 1009 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS; 1010 1011 case SPDK_BDEV_IO_TYPE_ZONE_APPEND: 1012 return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS && 1013 spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED; 1014 1015 default: 1016 return false; 1017 } 1018 } 1019 1020 static int 1021 bdev_nvme_create_path_cb(void *io_device, void *ctx_buf) 1022 { 1023 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; 1024 struct nvme_io_path *io_path = ctx_buf; 1025 struct spdk_io_channel *pg_ch; 1026 int rc; 1027 1028 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 1029 if (!pg_ch) { 1030 return -1; 1031 } 1032 1033 io_path->group = spdk_io_channel_get_ctx(pg_ch); 1034 1035 #ifdef SPDK_CONFIG_VTUNE 1036 io_path->group->collect_spin_stat = true; 1037 #else 1038 io_path->group->collect_spin_stat = false; 1039 #endif 1040 1041 TAILQ_INIT(&io_path->pending_resets); 1042 1043 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1044 rc = bdev_ocssd_create_io_channel(io_path); 1045 if (rc != 0) { 1046 goto err_ocssd_ch; 1047 } 1048 } 1049 1050 io_path->ctrlr = nvme_bdev_ctrlr; 1051 1052 rc = bdev_nvme_create_qpair(io_path); 1053 if (rc != 0) { 1054 goto err_qpair; 1055 } 1056 1057 return 0; 1058 1059 err_qpair: 1060 if (io_path->ocssd_ch) { 1061 bdev_ocssd_destroy_io_channel(io_path); 1062 } 1063 err_ocssd_ch: 1064 spdk_put_io_channel(pg_ch); 1065 1066 return rc; 1067 } 1068 1069 static void 1070 bdev_nvme_destroy_path_cb(void *io_device, void *ctx_buf) 1071 { 1072 struct nvme_io_path *io_path = ctx_buf; 1073 1074 assert(io_path->group != NULL); 1075 1076 if (io_path->ocssd_ch != NULL) { 1077 bdev_ocssd_destroy_io_channel(io_path); 1078 } 1079 1080 bdev_nvme_destroy_qpair(io_path); 1081 1082 spdk_put_io_channel(spdk_io_channel_from_ctx(io_path->group)); 1083 } 1084 1085 static void 1086 bdev_nvme_poll_group_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov, 1087 uint32_t iov_cnt, uint32_t seed, 1088 spdk_nvme_accel_completion_cb cb_fn, void *cb_arg) 1089 { 1090 struct nvme_bdev_poll_group *group = ctx; 1091 int rc; 1092 1093 assert(group->accel_channel != NULL); 1094 assert(cb_fn != NULL); 1095 1096 rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg); 1097 if (rc) { 1098 /* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */ 1099 if (rc == -ENOMEM || rc == -EINVAL) { 1100 cb_fn(cb_arg, rc); 1101 } 1102 SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov); 1103 } 1104 } 1105 1106 static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = { 1107 .table_size = sizeof(struct spdk_nvme_accel_fn_table), 1108 .submit_accel_crc32c = bdev_nvme_poll_group_submit_accel_crc32c, 1109 }; 1110 1111 static int 1112 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) 1113 { 1114 struct nvme_bdev_poll_group *group = ctx_buf; 1115 1116 group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table); 1117 if (group->group == NULL) { 1118 return -1; 1119 } 1120 1121 group->accel_channel = spdk_accel_engine_get_io_channel(); 1122 if (!group->accel_channel) { 1123 spdk_nvme_poll_group_destroy(group->group); 1124 SPDK_ERRLOG("Cannot 
get the accel_channel for bdev nvme polling group=%p\n", 1125 group); 1126 return -1; 1127 } 1128 1129 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 1130 1131 if (group->poller == NULL) { 1132 spdk_put_io_channel(group->accel_channel); 1133 spdk_nvme_poll_group_destroy(group->group); 1134 return -1; 1135 } 1136 1137 return 0; 1138 } 1139 1140 static void 1141 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) 1142 { 1143 struct nvme_bdev_poll_group *group = ctx_buf; 1144 1145 if (group->accel_channel) { 1146 spdk_put_io_channel(group->accel_channel); 1147 } 1148 1149 spdk_poller_unregister(&group->poller); 1150 if (spdk_nvme_poll_group_destroy(group->group)) { 1151 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n"); 1152 assert(false); 1153 } 1154 } 1155 1156 static struct spdk_io_channel * 1157 bdev_nvme_get_io_channel(void *ctx) 1158 { 1159 struct nvme_bdev *nvme_bdev = ctx; 1160 1161 return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 1162 } 1163 1164 static void * 1165 bdev_nvme_get_module_ctx(void *ctx) 1166 { 1167 struct nvme_bdev *nvme_bdev = ctx; 1168 1169 return bdev_nvme_get_ctrlr(&nvme_bdev->disk); 1170 } 1171 1172 static const char * 1173 _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state) 1174 { 1175 switch (ana_state) { 1176 case SPDK_NVME_ANA_OPTIMIZED_STATE: 1177 return "optimized"; 1178 case SPDK_NVME_ANA_NON_OPTIMIZED_STATE: 1179 return "non_optimized"; 1180 case SPDK_NVME_ANA_INACCESSIBLE_STATE: 1181 return "inaccessible"; 1182 case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE: 1183 return "persistent_loss"; 1184 case SPDK_NVME_ANA_CHANGE_STATE: 1185 return "change"; 1186 default: 1187 return NULL; 1188 } 1189 } 1190 1191 static int 1192 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 1193 { 1194 struct nvme_bdev *nvme_bdev = ctx; 1195 struct nvme_bdev_ns *nvme_ns; 1196 struct spdk_nvme_ns *ns; 1197 struct spdk_nvme_ctrlr *ctrlr; 1198 const struct spdk_nvme_ctrlr_data *cdata; 1199 const struct spdk_nvme_transport_id *trid; 1200 union spdk_nvme_vs_register vs; 1201 union spdk_nvme_csts_register csts; 1202 char buf[128]; 1203 1204 nvme_ns = nvme_bdev->nvme_ns; 1205 assert(nvme_ns != NULL); 1206 ns = nvme_ns->ns; 1207 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 1208 1209 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1210 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1211 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 1212 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1213 1214 spdk_json_write_named_object_begin(w, "nvme"); 1215 1216 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1217 spdk_json_write_named_string(w, "pci_address", trid->traddr); 1218 } 1219 1220 spdk_json_write_named_object_begin(w, "trid"); 1221 1222 nvme_bdev_dump_trid_json(trid, w); 1223 1224 spdk_json_write_object_end(w); 1225 1226 #ifdef SPDK_CONFIG_NVME_CUSE 1227 size_t cuse_name_size = 128; 1228 char cuse_name[cuse_name_size]; 1229 1230 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1231 cuse_name, &cuse_name_size); 1232 if (rc == 0) { 1233 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1234 } 1235 #endif 1236 1237 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1238 1239 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1240 1241 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1242 spdk_str_trim(buf); 1243 spdk_json_write_named_string(w, "model_number", buf); 1244 1245 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1246 spdk_str_trim(buf); 1247 
spdk_json_write_named_string(w, "serial_number", buf); 1248 1249 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1250 spdk_str_trim(buf); 1251 spdk_json_write_named_string(w, "firmware_revision", buf); 1252 1253 if (cdata->subnqn[0] != '\0') { 1254 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1255 } 1256 1257 spdk_json_write_named_object_begin(w, "oacs"); 1258 1259 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1260 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1261 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1262 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 1263 1264 spdk_json_write_object_end(w); 1265 1266 spdk_json_write_object_end(w); 1267 1268 spdk_json_write_named_object_begin(w, "vs"); 1269 1270 spdk_json_write_name(w, "nvme_version"); 1271 if (vs.bits.ter) { 1272 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1273 } else { 1274 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1275 } 1276 1277 spdk_json_write_object_end(w); 1278 1279 spdk_json_write_named_object_begin(w, "csts"); 1280 1281 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1282 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1283 1284 spdk_json_write_object_end(w); 1285 1286 spdk_json_write_named_object_begin(w, "ns_data"); 1287 1288 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1289 1290 if (cdata->cmic.ana_reporting) { 1291 spdk_json_write_named_string(w, "ana_state", 1292 _nvme_ana_state_str(spdk_nvme_ns_get_ana_state(ns))); 1293 } 1294 1295 spdk_json_write_object_end(w); 1296 1297 if (cdata->oacs.security) { 1298 spdk_json_write_named_object_begin(w, "security"); 1299 1300 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1301 1302 spdk_json_write_object_end(w); 1303 } 1304 1305 spdk_json_write_object_end(w); 1306 1307 return 0; 1308 } 1309 1310 static void 1311 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1312 { 1313 /* No config per bdev needed */ 1314 } 1315 1316 static uint64_t 1317 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1318 { 1319 struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch); 1320 struct nvme_bdev_poll_group *group = io_path->group; 1321 uint64_t spin_time; 1322 1323 if (!group || !group->collect_spin_stat) { 1324 return 0; 1325 } 1326 1327 if (group->end_ticks != 0) { 1328 group->spin_ticks += (group->end_ticks - group->start_ticks); 1329 group->end_ticks = 0; 1330 } 1331 1332 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1333 group->start_ticks = 0; 1334 group->spin_ticks = 0; 1335 1336 return spin_time; 1337 } 1338 1339 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1340 .destruct = bdev_nvme_destruct, 1341 .submit_request = bdev_nvme_submit_request, 1342 .io_type_supported = bdev_nvme_io_type_supported, 1343 .get_io_channel = bdev_nvme_get_io_channel, 1344 .dump_info_json = bdev_nvme_dump_info_json, 1345 .write_config_json = bdev_nvme_write_config_json, 1346 .get_spin_time = bdev_nvme_get_spin_time, 1347 .get_module_ctx = bdev_nvme_get_module_ctx, 1348 }; 1349 1350 static int 1351 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1352 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1353 uint32_t prchk_flags, void *ctx) 1354 { 1355 const struct spdk_uuid *uuid; 1356 const struct spdk_nvme_ctrlr_data *cdata; 1357 const struct spdk_nvme_ns_data *nsdata; 1358 int rc; 1359 enum spdk_nvme_csi csi; 
1360 uint32_t atomic_bs, phys_bs, bs; 1361 1362 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1363 csi = spdk_nvme_ns_get_csi(ns); 1364 1365 switch (csi) { 1366 case SPDK_NVME_CSI_NVM: 1367 disk->product_name = "NVMe disk"; 1368 break; 1369 case SPDK_NVME_CSI_ZNS: 1370 disk->product_name = "NVMe ZNS disk"; 1371 disk->zoned = true; 1372 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 1373 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 1374 spdk_nvme_ns_get_extended_sector_size(ns); 1375 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 1376 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 1377 break; 1378 default: 1379 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 1380 return -ENOTSUP; 1381 } 1382 1383 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1384 if (!disk->name) { 1385 return -ENOMEM; 1386 } 1387 1388 disk->write_cache = 0; 1389 if (cdata->vwc.present) { 1390 /* Enable if the Volatile Write Cache exists */ 1391 disk->write_cache = 1; 1392 } 1393 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1394 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1395 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1396 1397 uuid = spdk_nvme_ns_get_uuid(ns); 1398 if (uuid != NULL) { 1399 disk->uuid = *uuid; 1400 } 1401 1402 nsdata = spdk_nvme_ns_get_data(ns); 1403 bs = spdk_nvme_ns_get_sector_size(ns); 1404 atomic_bs = bs; 1405 phys_bs = bs; 1406 if (nsdata->nabo == 0) { 1407 if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) { 1408 atomic_bs = bs * (1 + nsdata->nawupf); 1409 } else { 1410 atomic_bs = bs * (1 + cdata->awupf); 1411 } 1412 } 1413 if (nsdata->nsfeat.optperf) { 1414 phys_bs = bs * (1 + nsdata->npwg); 1415 } 1416 disk->phys_blocklen = spdk_min(phys_bs, atomic_bs); 1417 1418 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1419 if (disk->md_len != 0) { 1420 disk->md_interleave = nsdata->flbas.extended; 1421 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1422 if (disk->dif_type != SPDK_DIF_DISABLE) { 1423 disk->dif_is_head_of_md = nsdata->dps.md_start; 1424 disk->dif_check_flags = prchk_flags; 1425 } 1426 } 1427 1428 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1429 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1430 disk->acwu = 0; 1431 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1432 disk->acwu = nsdata->nacwu; 1433 } else { 1434 disk->acwu = cdata->acwu; 1435 } 1436 1437 disk->ctxt = ctx; 1438 disk->fn_table = &nvmelib_fn_table; 1439 disk->module = &nvme_if; 1440 rc = spdk_bdev_register(disk); 1441 if (rc) { 1442 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1443 free(disk->name); 1444 return rc; 1445 } 1446 1447 return 0; 1448 } 1449 1450 static int 1451 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1452 { 1453 struct nvme_bdev *bdev; 1454 int rc; 1455 1456 bdev = calloc(1, sizeof(*bdev)); 1457 if (!bdev) { 1458 SPDK_ERRLOG("bdev calloc() failed\n"); 1459 return -ENOMEM; 1460 } 1461 1462 bdev->nvme_ns = nvme_ns; 1463 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1464 1465 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1466 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1467 if (rc != 0) { 1468 SPDK_ERRLOG("Failed to create NVMe disk\n"); 1469 free(bdev); 1470 return rc; 1471 } 1472 1473 nvme_ns->bdev = bdev; 1474 1475 return 0; 1476 } 1477 1478 static bool 1479 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 1480 { 
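	/* Namespaces are considered identical only when their NGUID and EUI64 values
	 * match and both expose a UUID that compares equal; if either UUID is missing,
	 * the namespaces are treated as different.
	 */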
1481 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 1482 const struct spdk_uuid *uuid1, *uuid2; 1483 1484 nsdata1 = spdk_nvme_ns_get_data(ns1); 1485 nsdata2 = spdk_nvme_ns_get_data(ns2); 1486 uuid1 = spdk_nvme_ns_get_uuid(ns1); 1487 uuid2 = spdk_nvme_ns_get_uuid(ns2); 1488 1489 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 && 1490 nsdata1->eui64 == nsdata2->eui64 && 1491 uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0; 1492 } 1493 1494 static void 1495 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1496 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1497 { 1498 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1499 struct spdk_nvme_ns *ns; 1500 int rc = 0; 1501 1502 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1503 if (!ns) { 1504 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1505 rc = -EINVAL; 1506 goto done; 1507 } 1508 1509 nvme_ns->ns = ns; 1510 nvme_ns->populated = true; 1511 1512 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1513 done: 1514 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1515 } 1516 1517 static bool 1518 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1519 struct spdk_nvme_ctrlr_opts *opts) 1520 { 1521 struct nvme_probe_skip_entry *entry; 1522 1523 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1524 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1525 return false; 1526 } 1527 } 1528 1529 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1530 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1531 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1532 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1533 1534 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1535 1536 return true; 1537 } 1538 1539 static void 1540 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1541 { 1542 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1543 1544 if (spdk_nvme_cpl_is_error(cpl)) { 1545 SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc, 1546 cpl->status.sct); 1547 _bdev_nvme_reset(nvme_bdev_ctrlr); 1548 } 1549 } 1550 1551 static void 1552 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1553 struct spdk_nvme_qpair *qpair, uint16_t cid) 1554 { 1555 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1556 union spdk_nvme_csts_register csts; 1557 int rc; 1558 1559 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1560 1561 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1562 1563 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1564 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1565 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1566 * completion recursively. 1567 */ 1568 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1569 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1570 if (csts.bits.cfs) { 1571 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1572 _bdev_nvme_reset(nvme_bdev_ctrlr); 1573 return; 1574 } 1575 } 1576 1577 switch (g_opts.action_on_timeout) { 1578 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1579 if (qpair) { 1580 /* Don't send abort to ctrlr when reset is running. 
 */
			pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
			if (nvme_bdev_ctrlr->resetting) {
				pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
				return;
			}
			pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
						       nvme_abort_cpl, nvme_bdev_ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
		}

	/* FALLTHROUGH */
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
		_bdev_nvme_reset(nvme_bdev_ctrlr);
		break;
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
		break;
	default:
		SPDK_ERRLOG("An invalid timeout action value is found.\n");
		break;
	}
}

static void
nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns)
{
	struct nvme_bdev *bdev;

	bdev = nvme_ns->bdev;
	if (bdev != NULL) {
		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
	}

	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
}

static void
nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns,
			      struct nvme_async_probe_ctx *ctx)
{
	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
}

static void
nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns)
{
	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
}

void
nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
				   struct nvme_bdev_ns *nvme_ns, int rc)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr;

	assert(nvme_bdev_ctrlr != NULL);

	if (rc == 0) {
		pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
		nvme_bdev_ctrlr->ref++;
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
	} else {
		memset(nvme_ns, 0, sizeof(*nvme_ns));
	}

	if (ctx) {
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
		}
	}
}

static void
nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
			       struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr;
	struct nvme_bdev_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct nvme_bdev *bdev;
	uint32_t i;
	int rc;
	uint64_t num_sectors;
	bool ns_is_active;

	if (ctx) {
		/* Initialize this count to 1 to handle the populate functions
		 * calling nvme_ctrlr_populate_namespace_done() immediately.
		 */
		ctx->populates_in_progress = 1;
	}

	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_bdev_ctrlr->namespaces[i];
		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);

		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) {
			/* NS is still there but attributes may have changed */
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
			bdev = nvme_ns->bdev;
			assert(bdev != NULL);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
					       nsid,
					       bdev->disk.name,
					       bdev->disk.blockcnt,
					       num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
						    bdev->disk.name, rc);
				}
			}
		}

		if (!nvme_ns->populated && ns_is_active) {
			nvme_ns->id = nsid;
			nvme_ns->ctrlr = nvme_bdev_ctrlr;
			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
				nvme_ns->type = NVME_BDEV_NS_OCSSD;
			} else {
				nvme_ns->type = NVME_BDEV_NS_STANDARD;
			}

			nvme_ns->bdev = NULL;

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx);
		}

		if (nvme_ns->populated && !ns_is_active) {
			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
		}
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
		 */
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx);
		}
	}

}

static void
nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
{
	uint32_t i;
	struct nvme_bdev_ns *nvme_ns;

	for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1];
		if (nvme_ns->populated) {
			assert(nvme_ns->id == nsid);
			nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns);
		}
	}
}

static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;
	union spdk_nvme_async_event_completion event;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("AER request execute failed");
		return;
	}

	event.raw = cpl->cdw0;
	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
		nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL);
	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
		bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr);
	}
}

static void
populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc)
{
	if (ctx->cb_fn) {
		ctx->cb_fn(ctx->cb_ctx, count, rc);
	}

	ctx->namespaces_populated = true;
	if (ctx->probe_done) {
		/* The probe was already completed, so we need to free the context
		 * here. This can happen for cases like OCSSD, where we need to
		 * send additional commands to the SSD after attach.
1794 */ 1795 free(ctx); 1796 } 1797 } 1798 1799 static int 1800 _nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1801 const char *name, 1802 const struct spdk_nvme_transport_id *trid, 1803 uint32_t prchk_flags, 1804 struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr) 1805 { 1806 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1807 struct nvme_bdev_ctrlr_trid *trid_entry; 1808 uint32_t i; 1809 int rc; 1810 1811 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1812 if (nvme_bdev_ctrlr == NULL) { 1813 SPDK_ERRLOG("Failed to allocate device struct\n"); 1814 return -ENOMEM; 1815 } 1816 1817 rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL); 1818 if (rc != 0) { 1819 goto err_init_mutex; 1820 } 1821 1822 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1823 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1824 if (nvme_bdev_ctrlr->num_ns != 0) { 1825 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1826 if (!nvme_bdev_ctrlr->namespaces) { 1827 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1828 rc = -ENOMEM; 1829 goto err_alloc_namespaces; 1830 } 1831 } 1832 1833 trid_entry = calloc(1, sizeof(*trid_entry)); 1834 if (trid_entry == NULL) { 1835 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1836 rc = -ENOMEM; 1837 goto err_alloc_trid; 1838 } 1839 1840 trid_entry->trid = *trid; 1841 1842 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1843 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1844 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1845 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1846 rc = -ENOMEM; 1847 goto err_alloc_namespace; 1848 } 1849 } 1850 1851 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1852 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1853 nvme_bdev_ctrlr->ctrlr = ctrlr; 1854 nvme_bdev_ctrlr->ref = 1; 1855 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1856 nvme_bdev_ctrlr->name = strdup(name); 1857 if (nvme_bdev_ctrlr->name == NULL) { 1858 rc = -ENOMEM; 1859 goto err_alloc_name; 1860 } 1861 1862 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1863 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1864 if (spdk_unlikely(rc != 0)) { 1865 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1866 goto err_init_ocssd; 1867 } 1868 } 1869 1870 nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1871 1872 spdk_io_device_register(nvme_bdev_ctrlr, 1873 bdev_nvme_create_path_cb, 1874 bdev_nvme_destroy_path_cb, 1875 sizeof(struct nvme_io_path), 1876 name); 1877 1878 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1879 g_opts.nvme_adminq_poll_period_us); 1880 1881 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1882 1883 if (g_opts.timeout_us > 0) { 1884 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1885 timeout_cb, nvme_bdev_ctrlr); 1886 } 1887 1888 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1889 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1890 1891 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1892 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1893 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1894 } 1895 1896 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1897 1898 if (_nvme_bdev_ctrlr != NULL) { 1899 *_nvme_bdev_ctrlr = nvme_bdev_ctrlr; 1900 } 1901 return 0; 1902 1903 err_init_ocssd: 1904 free(nvme_bdev_ctrlr->name); 1905 err_alloc_name: 1906 err_alloc_namespace: 1907 for (; 
i > 0; i--) { 1908 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1909 } 1910 free(trid_entry); 1911 err_alloc_trid: 1912 free(nvme_bdev_ctrlr->namespaces); 1913 err_alloc_namespaces: 1914 pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex); 1915 err_init_mutex: 1916 free(nvme_bdev_ctrlr); 1917 return rc; 1918 } 1919 1920 static void 1921 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1922 const char *name, 1923 const struct spdk_nvme_transport_id *trid, 1924 uint32_t prchk_flags, 1925 struct nvme_async_probe_ctx *ctx) 1926 { 1927 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL; 1928 int rc; 1929 1930 rc = _nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr); 1931 if (rc != 0) { 1932 SPDK_ERRLOG("Failed to create new NVMe controller\n"); 1933 goto err; 1934 } 1935 1936 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 1937 return; 1938 1939 err: 1940 if (ctx != NULL) { 1941 populate_namespaces_cb(ctx, 0, rc); 1942 } 1943 } 1944 1945 static void 1946 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1947 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1948 { 1949 struct nvme_probe_ctx *ctx = cb_ctx; 1950 char *name = NULL; 1951 uint32_t prchk_flags = 0; 1952 size_t i; 1953 1954 if (ctx) { 1955 for (i = 0; i < ctx->count; i++) { 1956 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1957 prchk_flags = ctx->prchk_flags[i]; 1958 name = strdup(ctx->names[i]); 1959 break; 1960 } 1961 } 1962 } else { 1963 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1964 } 1965 if (!name) { 1966 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1967 return; 1968 } 1969 1970 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1971 1972 nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL); 1973 1974 free(name); 1975 } 1976 1977 static void 1978 _nvme_bdev_ctrlr_destruct(void *ctx) 1979 { 1980 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1981 1982 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1983 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1984 } 1985 1986 static int 1987 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug) 1988 { 1989 struct nvme_probe_skip_entry *entry; 1990 1991 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1992 1993 /* The controller's destruction was already started */ 1994 if (nvme_bdev_ctrlr->destruct) { 1995 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1996 return 0; 1997 } 1998 1999 if (!hotplug && 2000 nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2001 entry = calloc(1, sizeof(*entry)); 2002 if (!entry) { 2003 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2004 return -ENOMEM; 2005 } 2006 entry->trid = *nvme_bdev_ctrlr->connected_trid; 2007 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2008 } 2009 2010 nvme_bdev_ctrlr->destruct = true; 2011 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2012 2013 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 2014 2015 return 0; 2016 } 2017 2018 static void 2019 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 2020 { 2021 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 2022 2023 _bdev_nvme_delete(nvme_bdev_ctrlr, true); 2024 } 2025 2026 static int 2027 bdev_nvme_hotplug_probe(void *arg) 2028 { 2029 if (g_hotplug_probe_ctx == NULL) { 2030 spdk_poller_unregister(&g_hotplug_probe_poller); 2031 return SPDK_POLLER_IDLE; 2032 } 2033 2034 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 2035 g_hotplug_probe_ctx = NULL; 2036 
spdk_poller_unregister(&g_hotplug_probe_poller); 2037 } 2038 2039 return SPDK_POLLER_BUSY; 2040 } 2041 2042 static int 2043 bdev_nvme_hotplug(void *arg) 2044 { 2045 struct spdk_nvme_transport_id trid_pcie; 2046 2047 if (g_hotplug_probe_ctx) { 2048 return SPDK_POLLER_BUSY; 2049 } 2050 2051 memset(&trid_pcie, 0, sizeof(trid_pcie)); 2052 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 2053 2054 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 2055 hotplug_probe_cb, attach_cb, NULL); 2056 2057 if (g_hotplug_probe_ctx) { 2058 assert(g_hotplug_probe_poller == NULL); 2059 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 2060 } 2061 2062 return SPDK_POLLER_BUSY; 2063 } 2064 2065 void 2066 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 2067 { 2068 *opts = g_opts; 2069 } 2070 2071 int 2072 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 2073 { 2074 if (g_bdev_nvme_init_thread != NULL) { 2075 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2076 return -EPERM; 2077 } 2078 } 2079 2080 g_opts = *opts; 2081 2082 return 0; 2083 } 2084 2085 struct set_nvme_hotplug_ctx { 2086 uint64_t period_us; 2087 bool enabled; 2088 spdk_msg_fn fn; 2089 void *fn_ctx; 2090 }; 2091 2092 static void 2093 set_nvme_hotplug_period_cb(void *_ctx) 2094 { 2095 struct set_nvme_hotplug_ctx *ctx = _ctx; 2096 2097 spdk_poller_unregister(&g_hotplug_poller); 2098 if (ctx->enabled) { 2099 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 2100 } 2101 2102 g_nvme_hotplug_poll_period_us = ctx->period_us; 2103 g_nvme_hotplug_enabled = ctx->enabled; 2104 if (ctx->fn) { 2105 ctx->fn(ctx->fn_ctx); 2106 } 2107 2108 free(ctx); 2109 } 2110 2111 int 2112 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 2113 { 2114 struct set_nvme_hotplug_ctx *ctx; 2115 2116 if (enabled == true && !spdk_process_is_primary()) { 2117 return -EPERM; 2118 } 2119 2120 ctx = calloc(1, sizeof(*ctx)); 2121 if (ctx == NULL) { 2122 return -ENOMEM; 2123 } 2124 2125 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 2126 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 2127 ctx->enabled = enabled; 2128 ctx->fn = cb; 2129 ctx->fn_ctx = cb_ctx; 2130 2131 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 2132 return 0; 2133 } 2134 2135 static void 2136 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2137 struct nvme_async_probe_ctx *ctx) 2138 { 2139 struct nvme_bdev_ns *nvme_ns; 2140 struct nvme_bdev *nvme_bdev; 2141 uint32_t i, nsid; 2142 size_t j; 2143 2144 assert(nvme_bdev_ctrlr != NULL); 2145 2146 /* 2147 * Report the new bdevs that were created in this call. 2148 * There can be more than one bdev per NVMe controller. 2149 */ 2150 j = 0; 2151 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 2152 nsid = i + 1; 2153 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 2154 if (!nvme_ns->populated) { 2155 continue; 2156 } 2157 assert(nvme_ns->id == nsid); 2158 nvme_bdev = nvme_ns->bdev; 2159 if (nvme_bdev == NULL) { 2160 assert(nvme_ns->type == NVME_BDEV_NS_OCSSD); 2161 continue; 2162 } 2163 if (j < ctx->count) { 2164 ctx->names[j] = nvme_bdev->disk.name; 2165 j++; 2166 } else { 2167 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. 
Unable to return all names of created bdevs\n", 2168 ctx->count); 2169 populate_namespaces_cb(ctx, 0, -ERANGE); 2170 return; 2171 } 2172 } 2173 2174 populate_namespaces_cb(ctx, j, 0); 2175 } 2176 2177 static int 2178 bdev_nvme_compare_trids(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2179 struct spdk_nvme_ctrlr *new_ctrlr, 2180 struct spdk_nvme_transport_id *trid) 2181 { 2182 struct nvme_bdev_ctrlr_trid *tmp_trid; 2183 2184 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2185 SPDK_ERRLOG("PCIe failover is not supported.\n"); 2186 return -ENOTSUP; 2187 } 2188 2189 /* Currently we only support failover to the same transport type. */ 2190 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 2191 return -EINVAL; 2192 } 2193 2194 /* Currently we only support failover to the same NQN. */ 2195 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 2196 return -EINVAL; 2197 } 2198 2199 /* Skip all the other checks if we've already registered this path. */ 2200 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 2201 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 2202 return -EEXIST; 2203 } 2204 } 2205 2206 return 0; 2207 } 2208 2209 static int 2210 bdev_nvme_compare_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2211 struct spdk_nvme_ctrlr *new_ctrlr) 2212 { 2213 uint32_t i, nsid; 2214 struct nvme_bdev_ns *nvme_ns; 2215 struct spdk_nvme_ns *new_ns; 2216 2217 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 2218 return -EINVAL; 2219 } 2220 2221 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 2222 nsid = i + 1; 2223 2224 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 2225 if (!nvme_ns->populated) { 2226 continue; 2227 } 2228 2229 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 2230 assert(new_ns != NULL); 2231 2232 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 2233 return -EINVAL; 2234 } 2235 } 2236 2237 return 0; 2238 } 2239 2240 static int 2241 _bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2242 struct spdk_nvme_transport_id *trid) 2243 { 2244 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 2245 2246 new_trid = calloc(1, sizeof(*new_trid)); 2247 if (new_trid == NULL) { 2248 return -ENOMEM; 2249 } 2250 new_trid->trid = *trid; 2251 new_trid->is_failed = false; 2252 2253 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 2254 if (tmp_trid->is_failed) { 2255 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 2256 return 0; 2257 } 2258 } 2259 2260 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 2261 return 0; 2262 } 2263 2264 /* This is the case that a secondary path is added to an existing 2265 * nvme_bdev_ctrlr for failover. After checking if it can access the same 2266 * namespaces as the primary path, it is disconnected until failover occurs. 
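 * The new controller handle passed in is used only to validate the path: its
 * transport ID and namespaces are compared against the primary path, the trid is
 * recorded in the controller's trid list on success, and the temporary controller
 * is then detached. The caller, if any, is notified through populate_namespaces_cb().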
2267 */ 2268 static void 2269 bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2270 struct spdk_nvme_ctrlr *new_ctrlr, 2271 struct spdk_nvme_transport_id *trid, 2272 struct nvme_async_probe_ctx *ctx) 2273 { 2274 int rc; 2275 2276 assert(nvme_bdev_ctrlr != NULL); 2277 2278 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2279 2280 rc = bdev_nvme_compare_trids(nvme_bdev_ctrlr, new_ctrlr, trid); 2281 if (rc != 0) { 2282 goto exit; 2283 } 2284 2285 rc = bdev_nvme_compare_namespaces(nvme_bdev_ctrlr, new_ctrlr); 2286 if (rc != 0) { 2287 goto exit; 2288 } 2289 2290 rc = _bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, trid); 2291 2292 exit: 2293 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2294 2295 spdk_nvme_detach(new_ctrlr); 2296 2297 if (ctx != NULL) { 2298 populate_namespaces_cb(ctx, 0, rc); 2299 } 2300 } 2301 2302 static void 2303 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2304 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2305 { 2306 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2307 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2308 struct nvme_async_probe_ctx *ctx; 2309 2310 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2311 ctx->ctrlr_attached = true; 2312 2313 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 2314 if (nvme_bdev_ctrlr) { 2315 bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid, ctx); 2316 return; 2317 } 2318 2319 nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx); 2320 } 2321 2322 static int 2323 bdev_nvme_async_poll(void *arg) 2324 { 2325 struct nvme_async_probe_ctx *ctx = arg; 2326 int rc; 2327 2328 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2329 if (spdk_unlikely(rc != -EAGAIN)) { 2330 ctx->probe_done = true; 2331 spdk_poller_unregister(&ctx->poller); 2332 if (!ctx->ctrlr_attached) { 2333 /* The probe is done, but no controller was attached. 2334 * That means we had a failure, so report -EIO back to 2335 * the caller (usually the RPC). populate_namespaces_cb() 2336 * will take care of freeing the nvme_async_probe_ctx. 2337 */ 2338 populate_namespaces_cb(ctx, 0, -EIO); 2339 } else if (ctx->namespaces_populated) { 2340 /* The namespaces for the attached controller were all 2341 * populated and the response was already sent to the 2342 * caller (usually the RPC). So free the context here. 2343 */ 2344 free(ctx); 2345 } 2346 } 2347 2348 return SPDK_POLLER_BUSY; 2349 } 2350 2351 int 2352 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2353 struct spdk_nvme_host_id *hostid, 2354 const char *base_name, 2355 const char **names, 2356 uint32_t count, 2357 const char *hostnqn, 2358 uint32_t prchk_flags, 2359 spdk_bdev_create_nvme_fn cb_fn, 2360 void *cb_ctx, 2361 struct spdk_nvme_ctrlr_opts *opts) 2362 { 2363 struct nvme_probe_skip_entry *entry, *tmp; 2364 struct nvme_async_probe_ctx *ctx; 2365 2366 /* TODO expand this check to include both the host and target TRIDs. 2367 * Only if both are the same should we fail. 
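 * For now the lookup below is keyed on the target TRID alone, so a duplicate
 * target address is rejected with -EEXIST even when the host side differs.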
2368 */ 2369 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2370 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2371 return -EEXIST; 2372 } 2373 2374 ctx = calloc(1, sizeof(*ctx)); 2375 if (!ctx) { 2376 return -ENOMEM; 2377 } 2378 ctx->base_name = base_name; 2379 ctx->names = names; 2380 ctx->count = count; 2381 ctx->cb_fn = cb_fn; 2382 ctx->cb_ctx = cb_ctx; 2383 ctx->prchk_flags = prchk_flags; 2384 ctx->trid = *trid; 2385 2386 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2387 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2388 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2389 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2390 free(entry); 2391 break; 2392 } 2393 } 2394 } 2395 2396 if (opts) { 2397 memcpy(&ctx->opts, opts, sizeof(*opts)); 2398 } else { 2399 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2400 } 2401 2402 ctx->opts.transport_retry_count = g_opts.retry_count; 2403 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2404 2405 if (hostnqn) { 2406 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2407 } 2408 2409 if (hostid->hostaddr[0] != '\0') { 2410 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2411 } 2412 2413 if (hostid->hostsvcid[0] != '\0') { 2414 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2415 } 2416 2417 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2418 if (ctx->probe_ctx == NULL) { 2419 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2420 free(ctx); 2421 return -ENODEV; 2422 } 2423 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2424 2425 return 0; 2426 } 2427 2428 static int 2429 bdev_nvme_delete_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2430 const struct spdk_nvme_transport_id *trid) 2431 { 2432 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2433 2434 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2435 return -EBUSY; 2436 } 2437 2438 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2439 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2440 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2441 free(ctrlr_trid); 2442 return 0; 2443 } 2444 } 2445 2446 return -ENXIO; 2447 } 2448 2449 int 2450 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2451 { 2452 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2453 struct nvme_bdev_ctrlr_trid *ctrlr_trid; 2454 2455 if (name == NULL) { 2456 return -EINVAL; 2457 } 2458 2459 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2460 if (nvme_bdev_ctrlr == NULL) { 2461 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2462 return -ENODEV; 2463 } 2464 2465 /* case 1: remove the controller itself. */ 2466 if (trid == NULL) { 2467 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2468 } 2469 2470 /* case 2: we are currently using the path to be removed. */ 2471 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2472 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2473 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2474 /* case 2A: the current path is the only path. */ 2475 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2476 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2477 } 2478 2479 /* case 2B: there is an alternative path. 
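Fail over to it instead of deleting the whole controller.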
*/ 2480 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2481 } 2482 2483 /* case 3: We are not using the specified path. */ 2484 return bdev_nvme_delete_secondary_trid(nvme_bdev_ctrlr, trid); 2485 } 2486 2487 static int 2488 bdev_nvme_library_init(void) 2489 { 2490 g_bdev_nvme_init_thread = spdk_get_thread(); 2491 2492 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2493 bdev_nvme_poll_group_destroy_cb, 2494 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2495 2496 return 0; 2497 } 2498 2499 static void 2500 bdev_nvme_library_fini(void) 2501 { 2502 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2503 struct nvme_probe_skip_entry *entry, *entry_tmp; 2504 2505 spdk_poller_unregister(&g_hotplug_poller); 2506 free(g_hotplug_probe_ctx); 2507 g_hotplug_probe_ctx = NULL; 2508 2509 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2510 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2511 free(entry); 2512 } 2513 2514 pthread_mutex_lock(&g_bdev_nvme_mutex); 2515 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2516 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2517 if (nvme_bdev_ctrlr->destruct) { 2518 /* This controller's destruction was already started 2519 * before the application started shutting down 2520 */ 2521 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2522 continue; 2523 } 2524 nvme_bdev_ctrlr->destruct = true; 2525 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2526 2527 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2528 nvme_bdev_ctrlr); 2529 } 2530 2531 g_bdev_nvme_module_finish = true; 2532 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2533 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2534 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2535 spdk_bdev_module_finish_done(); 2536 return; 2537 } 2538 2539 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2540 } 2541 2542 static void 2543 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 2544 { 2545 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2546 struct spdk_bdev *bdev = bdev_io->bdev; 2547 struct spdk_dif_ctx dif_ctx; 2548 struct spdk_dif_error err_blk = {}; 2549 int rc; 2550 2551 rc = spdk_dif_ctx_init(&dif_ctx, 2552 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2553 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2554 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2555 if (rc != 0) { 2556 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2557 return; 2558 } 2559 2560 if (bdev->md_interleave) { 2561 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2562 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2563 } else { 2564 struct iovec md_iov = { 2565 .iov_base = bdev_io->u.bdev.md_buf, 2566 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2567 }; 2568 2569 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2570 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2571 } 2572 2573 if (rc != 0) { 2574 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2575 err_blk.err_type, err_blk.err_offset); 2576 } else { 2577 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2578 } 2579 } 2580 2581 static void 2582 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2583 { 2584 struct nvme_bdev_io *bio = ref; 2585 2586 if (spdk_nvme_cpl_is_success(cpl)) { 2587 /* Run PI verification for read data buffer. 
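The retried read without PI checking succeeded, so locate and log the block whose protection information is inconsistent.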
*/ 2588 bdev_nvme_verify_pi_error(bio); 2589 } 2590 2591 /* Return original completion status */ 2592 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2593 } 2594 2595 static void 2596 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2597 { 2598 struct nvme_bdev_io *bio = ref; 2599 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2600 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2601 struct nvme_io_path *io_path; 2602 struct spdk_nvme_ns *ns; 2603 struct spdk_nvme_qpair *qpair; 2604 int ret; 2605 2606 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2607 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2608 cpl->status.sct, cpl->status.sc); 2609 2610 /* Save completion status to use after verifying PI error. */ 2611 bio->cpl = *cpl; 2612 2613 io_path = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2614 2615 if (spdk_likely(bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair))) { 2616 /* Read without PI checking to verify PI error. */ 2617 ret = bdev_nvme_no_pi_readv(ns, 2618 qpair, 2619 bio, 2620 bdev_io->u.bdev.iovs, 2621 bdev_io->u.bdev.iovcnt, 2622 bdev_io->u.bdev.md_buf, 2623 bdev_io->u.bdev.num_blocks, 2624 bdev_io->u.bdev.offset_blocks); 2625 if (ret == 0) { 2626 return; 2627 } 2628 } 2629 } 2630 2631 bdev_nvme_io_complete_nvme_status(bio, cpl); 2632 } 2633 2634 static void 2635 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2636 { 2637 struct nvme_bdev_io *bio = ref; 2638 2639 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2640 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2641 cpl->status.sct, cpl->status.sc); 2642 /* Run PI verification for write data buffer if PI error is detected. */ 2643 bdev_nvme_verify_pi_error(bio); 2644 } 2645 2646 bdev_nvme_io_complete_nvme_status(bio, cpl); 2647 } 2648 2649 static void 2650 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2651 { 2652 struct nvme_bdev_io *bio = ref; 2653 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2654 2655 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 2656 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 2657 */ 2658 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 2659 2660 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2661 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 2662 cpl->status.sct, cpl->status.sc); 2663 /* Run PI verification for zone append data buffer if PI error is detected. */ 2664 bdev_nvme_verify_pi_error(bio); 2665 } 2666 2667 bdev_nvme_io_complete_nvme_status(bio, cpl); 2668 } 2669 2670 static void 2671 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2672 { 2673 struct nvme_bdev_io *bio = ref; 2674 2675 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2676 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2677 cpl->status.sct, cpl->status.sc); 2678 /* Run PI verification for compare data buffer if PI error is detected. 
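This only logs the offending block; the original completion status is still returned below.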
*/ 2679 bdev_nvme_verify_pi_error(bio); 2680 } 2681 2682 bdev_nvme_io_complete_nvme_status(bio, cpl); 2683 } 2684 2685 static void 2686 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2687 { 2688 struct nvme_bdev_io *bio = ref; 2689 2690 /* Compare operation completion */ 2691 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2692 /* Save compare result for write callback */ 2693 bio->cpl = *cpl; 2694 return; 2695 } 2696 2697 /* Write operation completion */ 2698 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2699 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2700 * complete the IO with the compare operation's status. 2701 */ 2702 if (!spdk_nvme_cpl_is_error(cpl)) { 2703 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2704 } 2705 2706 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2707 } else { 2708 bdev_nvme_io_complete_nvme_status(bio, cpl); 2709 } 2710 } 2711 2712 static void 2713 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2714 { 2715 struct nvme_bdev_io *bio = ref; 2716 2717 bdev_nvme_io_complete_nvme_status(bio, cpl); 2718 } 2719 2720 static int 2721 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 2722 { 2723 switch (desc->zs) { 2724 case SPDK_NVME_ZONE_STATE_EMPTY: 2725 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 2726 break; 2727 case SPDK_NVME_ZONE_STATE_IOPEN: 2728 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 2729 break; 2730 case SPDK_NVME_ZONE_STATE_EOPEN: 2731 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 2732 break; 2733 case SPDK_NVME_ZONE_STATE_CLOSED: 2734 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 2735 break; 2736 case SPDK_NVME_ZONE_STATE_RONLY: 2737 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 2738 break; 2739 case SPDK_NVME_ZONE_STATE_FULL: 2740 info->state = SPDK_BDEV_ZONE_STATE_FULL; 2741 break; 2742 case SPDK_NVME_ZONE_STATE_OFFLINE: 2743 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 2744 break; 2745 default: 2746 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 2747 return -EIO; 2748 } 2749 2750 info->zone_id = desc->zslba; 2751 info->write_pointer = desc->wp; 2752 info->capacity = desc->zcap; 2753 2754 return 0; 2755 } 2756 2757 static void 2758 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 2759 { 2760 struct nvme_bdev_io *bio = ref; 2761 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2762 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2763 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 2764 struct nvme_io_path *io_path = spdk_io_channel_get_ctx(ch); 2765 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 2766 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 2767 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 2768 uint64_t max_zones_per_buf, i; 2769 uint32_t zone_report_bufsize; 2770 struct spdk_nvme_ns *ns; 2771 struct spdk_nvme_qpair *qpair; 2772 int ret; 2773 2774 if (spdk_nvme_cpl_is_error(cpl)) { 2775 goto out_complete_io_nvme_cpl; 2776 } 2777 2778 if (!bdev_nvme_find_io_path(nbdev, io_path, &ns, &qpair)) { 2779 ret = -ENXIO; 2780 goto out_complete_io_ret; 2781 } 2782 2783 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 2784 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 2785 sizeof(bio->zone_report_buf->descs[0]); 2786 2787 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 2788 ret = -EINVAL; 2789 goto out_complete_io_ret; 2790 } 2791 2792 if 
(!bio->zone_report_buf->nr_zones) { 2793 ret = -EINVAL; 2794 goto out_complete_io_ret; 2795 } 2796 2797 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 2798 ret = fill_zone_from_report(&info[bio->handled_zones], 2799 &bio->zone_report_buf->descs[i]); 2800 if (ret) { 2801 goto out_complete_io_ret; 2802 } 2803 bio->handled_zones++; 2804 } 2805 2806 if (bio->handled_zones < zones_to_copy) { 2807 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 2808 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 2809 2810 memset(bio->zone_report_buf, 0, zone_report_bufsize); 2811 ret = spdk_nvme_zns_report_zones(ns, qpair, 2812 bio->zone_report_buf, zone_report_bufsize, 2813 slba, SPDK_NVME_ZRA_LIST_ALL, true, 2814 bdev_nvme_get_zone_info_done, bio); 2815 if (!ret) { 2816 return; 2817 } else { 2818 goto out_complete_io_ret; 2819 } 2820 } 2821 2822 out_complete_io_nvme_cpl: 2823 free(bio->zone_report_buf); 2824 bio->zone_report_buf = NULL; 2825 bdev_nvme_io_complete_nvme_status(bio, cpl); 2826 return; 2827 2828 out_complete_io_ret: 2829 free(bio->zone_report_buf); 2830 bio->zone_report_buf = NULL; 2831 bdev_nvme_io_complete(bio, ret); 2832 } 2833 2834 static void 2835 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 2836 { 2837 struct nvme_bdev_io *bio = ref; 2838 2839 bdev_nvme_io_complete_nvme_status(bio, cpl); 2840 } 2841 2842 static void 2843 bdev_nvme_admin_passthru_completion(void *ctx) 2844 { 2845 struct nvme_bdev_io *bio = ctx; 2846 2847 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2848 } 2849 2850 static void 2851 bdev_nvme_abort_completion(void *ctx) 2852 { 2853 struct nvme_bdev_io *bio = ctx; 2854 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2855 2856 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2857 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2858 } else { 2859 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2860 } 2861 } 2862 2863 static void 2864 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2865 { 2866 struct nvme_bdev_io *bio = ref; 2867 2868 bio->cpl = *cpl; 2869 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2870 } 2871 2872 static void 2873 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2874 { 2875 struct nvme_bdev_io *bio = ref; 2876 2877 bio->cpl = *cpl; 2878 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2879 } 2880 2881 static void 2882 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2883 { 2884 struct nvme_bdev_io *bio = ref; 2885 struct iovec *iov; 2886 2887 bio->iov_offset = sgl_offset; 2888 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2889 iov = &bio->iovs[bio->iovpos]; 2890 if (bio->iov_offset < iov->iov_len) { 2891 break; 2892 } 2893 2894 bio->iov_offset -= iov->iov_len; 2895 } 2896 } 2897 2898 static int 2899 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2900 { 2901 struct nvme_bdev_io *bio = ref; 2902 struct iovec *iov; 2903 2904 assert(bio->iovpos < bio->iovcnt); 2905 2906 iov = &bio->iovs[bio->iovpos]; 2907 2908 *address = iov->iov_base; 2909 *length = iov->iov_len; 2910 2911 if (bio->iov_offset) { 2912 assert(bio->iov_offset <= iov->iov_len); 2913 *address += bio->iov_offset; 2914 *length -= bio->iov_offset; 2915 } 2916 2917 bio->iov_offset += *length; 2918 if (bio->iov_offset == iov->iov_len) { 2919 bio->iovpos++; 2920 bio->iov_offset = 0; 2921 } 2922 2923 
return 0; 2924 } 2925 2926 static void 2927 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2928 { 2929 struct nvme_bdev_io *bio = ref; 2930 struct iovec *iov; 2931 2932 bio->fused_iov_offset = sgl_offset; 2933 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2934 iov = &bio->fused_iovs[bio->fused_iovpos]; 2935 if (bio->fused_iov_offset < iov->iov_len) { 2936 break; 2937 } 2938 2939 bio->fused_iov_offset -= iov->iov_len; 2940 } 2941 } 2942 2943 static int 2944 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2945 { 2946 struct nvme_bdev_io *bio = ref; 2947 struct iovec *iov; 2948 2949 assert(bio->fused_iovpos < bio->fused_iovcnt); 2950 2951 iov = &bio->fused_iovs[bio->fused_iovpos]; 2952 2953 *address = iov->iov_base; 2954 *length = iov->iov_len; 2955 2956 if (bio->fused_iov_offset) { 2957 assert(bio->fused_iov_offset <= iov->iov_len); 2958 *address += bio->fused_iov_offset; 2959 *length -= bio->fused_iov_offset; 2960 } 2961 2962 bio->fused_iov_offset += *length; 2963 if (bio->fused_iov_offset == iov->iov_len) { 2964 bio->fused_iovpos++; 2965 bio->fused_iov_offset = 0; 2966 } 2967 2968 return 0; 2969 } 2970 2971 static int 2972 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2973 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2974 void *md, uint64_t lba_count, uint64_t lba) 2975 { 2976 int rc; 2977 2978 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2979 lba_count, lba); 2980 2981 bio->iovs = iov; 2982 bio->iovcnt = iovcnt; 2983 bio->iovpos = 0; 2984 bio->iov_offset = 0; 2985 2986 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2987 bdev_nvme_no_pi_readv_done, bio, 0, 2988 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2989 md, 0, 0); 2990 2991 if (rc != 0 && rc != -ENOMEM) { 2992 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2993 } 2994 return rc; 2995 } 2996 2997 static int 2998 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2999 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3000 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 3001 { 3002 int rc; 3003 3004 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3005 lba_count, lba); 3006 3007 bio->iovs = iov; 3008 bio->iovcnt = iovcnt; 3009 bio->iovpos = 0; 3010 bio->iov_offset = 0; 3011 3012 if (iovcnt == 1) { 3013 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 3014 lba_count, 3015 bdev_nvme_readv_done, bio, 3016 flags, 3017 0, 0); 3018 } else { 3019 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3020 bdev_nvme_readv_done, bio, flags, 3021 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3022 md, 0, 0); 3023 } 3024 3025 if (rc != 0 && rc != -ENOMEM) { 3026 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 3027 } 3028 return rc; 3029 } 3030 3031 static int 3032 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3033 struct nvme_bdev_io *bio, 3034 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 3035 uint32_t flags) 3036 { 3037 int rc; 3038 3039 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3040 lba_count, lba); 3041 3042 bio->iovs = iov; 3043 bio->iovcnt = iovcnt; 3044 bio->iovpos = 0; 3045 bio->iov_offset = 0; 3046 3047 if (iovcnt == 1) { 3048 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 3049 lba_count, 3050 
bdev_nvme_writev_done, bio, 3051 flags, 3052 0, 0); 3053 } else { 3054 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3055 bdev_nvme_writev_done, bio, flags, 3056 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3057 md, 0, 0); 3058 } 3059 3060 if (rc != 0 && rc != -ENOMEM) { 3061 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 3062 } 3063 return rc; 3064 } 3065 3066 static int 3067 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3068 struct nvme_bdev_io *bio, 3069 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba, 3070 uint32_t flags) 3071 { 3072 int rc; 3073 3074 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 3075 lba_count, zslba); 3076 3077 bio->iovs = iov; 3078 bio->iovcnt = iovcnt; 3079 bio->iovpos = 0; 3080 bio->iov_offset = 0; 3081 3082 if (iovcnt == 1) { 3083 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 3084 lba_count, 3085 bdev_nvme_zone_appendv_done, bio, 3086 flags, 3087 0, 0); 3088 } else { 3089 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 3090 bdev_nvme_zone_appendv_done, bio, flags, 3091 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3092 md, 0, 0); 3093 } 3094 3095 if (rc != 0 && rc != -ENOMEM) { 3096 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 3097 } 3098 return rc; 3099 } 3100 3101 static int 3102 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3103 struct nvme_bdev_io *bio, 3104 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 3105 uint32_t flags) 3106 { 3107 int rc; 3108 3109 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3110 lba_count, lba); 3111 3112 bio->iovs = iov; 3113 bio->iovcnt = iovcnt; 3114 bio->iovpos = 0; 3115 bio->iov_offset = 0; 3116 3117 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3118 bdev_nvme_comparev_done, bio, flags, 3119 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3120 md, 0, 0); 3121 3122 if (rc != 0 && rc != -ENOMEM) { 3123 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 3124 } 3125 return rc; 3126 } 3127 3128 static int 3129 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3130 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 3131 struct iovec *write_iov, int write_iovcnt, 3132 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 3133 { 3134 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3135 int rc; 3136 3137 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3138 lba_count, lba); 3139 3140 bio->iovs = cmp_iov; 3141 bio->iovcnt = cmp_iovcnt; 3142 bio->iovpos = 0; 3143 bio->iov_offset = 0; 3144 bio->fused_iovs = write_iov; 3145 bio->fused_iovcnt = write_iovcnt; 3146 bio->fused_iovpos = 0; 3147 bio->fused_iov_offset = 0; 3148 3149 if (bdev_io->num_retries == 0) { 3150 bio->first_fused_submitted = false; 3151 } 3152 3153 if (!bio->first_fused_submitted) { 3154 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3155 memset(&bio->cpl, 0, sizeof(bio->cpl)); 3156 3157 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3158 bdev_nvme_comparev_and_writev_done, bio, flags, 3159 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 3160 if (rc == 0) { 3161 bio->first_fused_submitted = true; 3162 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3163 } else { 3164 if (rc != -ENOMEM) { 3165 SPDK_ERRLOG("compare failed: rc = %d\n", 
rc); 3166 } 3167 return rc; 3168 } 3169 } 3170 3171 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 3172 3173 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3174 bdev_nvme_comparev_and_writev_done, bio, flags, 3175 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 3176 if (rc != 0 && rc != -ENOMEM) { 3177 SPDK_ERRLOG("write failed: rc = %d\n", rc); 3178 rc = 0; 3179 } 3180 3181 return rc; 3182 } 3183 3184 static int 3185 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3186 struct nvme_bdev_io *bio, 3187 uint64_t offset_blocks, 3188 uint64_t num_blocks) 3189 { 3190 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 3191 struct spdk_nvme_dsm_range *range; 3192 uint64_t offset, remaining; 3193 uint64_t num_ranges_u64; 3194 uint16_t num_ranges; 3195 int rc; 3196 3197 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 3198 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3199 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 3200 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 3201 return -EINVAL; 3202 } 3203 num_ranges = (uint16_t)num_ranges_u64; 3204 3205 offset = offset_blocks; 3206 remaining = num_blocks; 3207 range = &dsm_ranges[0]; 3208 3209 /* Fill max-size ranges until the remaining blocks fit into one range */ 3210 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 3211 range->attributes.raw = 0; 3212 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3213 range->starting_lba = offset; 3214 3215 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3216 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3217 range++; 3218 } 3219 3220 /* Final range describes the remaining blocks */ 3221 range->attributes.raw = 0; 3222 range->length = remaining; 3223 range->starting_lba = offset; 3224 3225 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 3226 SPDK_NVME_DSM_ATTR_DEALLOCATE, 3227 dsm_ranges, num_ranges, 3228 bdev_nvme_queued_done, bio); 3229 3230 return rc; 3231 } 3232 3233 static int 3234 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3235 struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 3236 struct spdk_bdev_zone_info *info) 3237 { 3238 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3239 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3240 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 3241 3242 if (zone_id % zone_size != 0) { 3243 return -EINVAL; 3244 } 3245 3246 if (num_zones > total_zones || !num_zones) { 3247 return -EINVAL; 3248 } 3249 3250 assert(!bio->zone_report_buf); 3251 bio->zone_report_buf = calloc(1, zone_report_bufsize); 3252 if (!bio->zone_report_buf) { 3253 return -ENOMEM; 3254 } 3255 3256 bio->handled_zones = 0; 3257 3258 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 3259 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 3260 bdev_nvme_get_zone_info_done, bio); 3261 } 3262 3263 static int 3264 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3265 struct nvme_bdev_io *bio, uint64_t zone_id, 3266 enum spdk_bdev_zone_action action) 3267 { 3268 switch (action) { 3269 case SPDK_BDEV_ZONE_CLOSE: 3270 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 3271 bdev_nvme_zone_management_done, bio); 3272 case SPDK_BDEV_ZONE_FINISH: 3273 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 3274 
bdev_nvme_zone_management_done, bio); 3275 case SPDK_BDEV_ZONE_OPEN: 3276 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 3277 bdev_nvme_zone_management_done, bio); 3278 case SPDK_BDEV_ZONE_RESET: 3279 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 3280 bdev_nvme_zone_management_done, bio); 3281 case SPDK_BDEV_ZONE_OFFLINE: 3282 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 3283 bdev_nvme_zone_management_done, bio); 3284 default: 3285 return -EINVAL; 3286 } 3287 } 3288 3289 static int 3290 bdev_nvme_admin_passthru(struct nvme_io_path *io_path, struct nvme_bdev_io *bio, 3291 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3292 { 3293 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3294 uint32_t max_xfer_size; 3295 3296 if (!bdev_nvme_find_admin_path(io_path, &nvme_bdev_ctrlr)) { 3297 return -EINVAL; 3298 } 3299 3300 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_bdev_ctrlr->ctrlr); 3301 3302 if (nbytes > max_xfer_size) { 3303 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3304 return -EINVAL; 3305 } 3306 3307 bio->orig_thread = spdk_get_thread(); 3308 3309 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_bdev_ctrlr->ctrlr, cmd, buf, 3310 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 3311 } 3312 3313 static int 3314 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3315 struct nvme_bdev_io *bio, 3316 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3317 { 3318 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3319 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3320 3321 if (nbytes > max_xfer_size) { 3322 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3323 return -EINVAL; 3324 } 3325 3326 /* 3327 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3328 * so fill it out automatically. 3329 */ 3330 cmd->nsid = spdk_nvme_ns_get_id(ns); 3331 3332 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 3333 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 3334 } 3335 3336 static int 3337 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3338 struct nvme_bdev_io *bio, 3339 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 3340 { 3341 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 3342 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3343 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3344 3345 if (nbytes > max_xfer_size) { 3346 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3347 return -EINVAL; 3348 } 3349 3350 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 3351 SPDK_ERRLOG("invalid meta data buffer size\n"); 3352 return -EINVAL; 3353 } 3354 3355 /* 3356 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3357 * so fill it out automatically. 
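 * Any nsid supplied by the caller is overwritten with the ID of the namespace
 * backing this bdev.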
3358 */ 3359 cmd->nsid = spdk_nvme_ns_get_id(ns); 3360 3361 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 3362 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 3363 } 3364 3365 static int 3366 bdev_nvme_abort(struct nvme_io_path *io_path, struct nvme_bdev_io *bio, 3367 struct nvme_bdev_io *bio_to_abort) 3368 { 3369 int rc; 3370 3371 bio->orig_thread = spdk_get_thread(); 3372 3373 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->ctrlr->ctrlr, 3374 io_path->qpair, 3375 bio_to_abort, 3376 bdev_nvme_abort_done, bio); 3377 if (rc == -ENOENT) { 3378 /* If no command was found in I/O qpair, the target command may be 3379 * admin command. 3380 */ 3381 rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->ctrlr->ctrlr, 3382 NULL, 3383 bio_to_abort, 3384 bdev_nvme_abort_done, bio); 3385 } 3386 3387 if (rc == -ENOENT) { 3388 /* If no command was found, complete the abort request with failure. */ 3389 bio->cpl.cdw0 |= 1U; 3390 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3391 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3392 3393 bdev_nvme_abort_completion(bio); 3394 3395 rc = 0; 3396 } 3397 3398 return rc; 3399 } 3400 3401 static void 3402 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 3403 struct nvme_bdev_ns *nvme_ns) 3404 { 3405 /* nop */ 3406 } 3407 3408 static void 3409 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 3410 { 3411 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 3412 } 3413 3414 static void 3415 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 3416 { 3417 const char *action; 3418 3419 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 3420 action = "reset"; 3421 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 3422 action = "abort"; 3423 } else { 3424 action = "none"; 3425 } 3426 3427 spdk_json_write_object_begin(w); 3428 3429 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 3430 3431 spdk_json_write_named_object_begin(w, "params"); 3432 spdk_json_write_named_string(w, "action_on_timeout", action); 3433 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 3434 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 3435 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 3436 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 3437 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 3438 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 3439 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 3440 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 3441 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 3442 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3443 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3444 spdk_json_write_object_end(w); 3445 3446 spdk_json_write_object_end(w); 3447 } 3448 3449 static void 3450 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 3451 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 3452 { 3453 struct spdk_nvme_transport_id *trid; 3454 3455 trid = nvme_bdev_ctrlr->connected_trid; 3456 3457 spdk_json_write_object_begin(w); 3458 3459 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3460 3461 spdk_json_write_named_object_begin(w, 
"params"); 3462 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 3463 nvme_bdev_dump_trid_json(trid, w); 3464 spdk_json_write_named_bool(w, "prchk_reftag", 3465 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3466 spdk_json_write_named_bool(w, "prchk_guard", 3467 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3468 3469 spdk_json_write_object_end(w); 3470 3471 spdk_json_write_object_end(w); 3472 } 3473 3474 static void 3475 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 3476 { 3477 spdk_json_write_object_begin(w); 3478 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3479 3480 spdk_json_write_named_object_begin(w, "params"); 3481 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3482 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 3483 spdk_json_write_object_end(w); 3484 3485 spdk_json_write_object_end(w); 3486 } 3487 3488 static int 3489 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3490 { 3491 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3492 uint32_t nsid; 3493 3494 bdev_nvme_opts_config_json(w); 3495 3496 pthread_mutex_lock(&g_bdev_nvme_mutex); 3497 3498 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 3499 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 3500 3501 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 3502 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 3503 continue; 3504 } 3505 3506 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 3507 } 3508 } 3509 3510 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3511 * before enabling hotplug poller. 3512 */ 3513 bdev_nvme_hotplug_config_json(w); 3514 3515 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3516 return 0; 3517 } 3518 3519 struct spdk_nvme_ctrlr * 3520 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3521 { 3522 if (!bdev || bdev->module != &nvme_if) { 3523 return NULL; 3524 } 3525 3526 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3527 } 3528 3529 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3530