/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;
	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;
};
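/*
 * Per-I/O context stored in each spdk_bdev_io's driver_ctx area (see bdev_nvme_get_ctx_size()).
 * The iovpos/iov_offset pair (and the fused_* counterparts for compare-and-write) track the
 * driver's position in the caller's iovec array while the SGL for the NVMe command is built
 * through the reset_sgl/next_sge callbacks defined later in this file.
 */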
struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};
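/*
 * Module-wide defaults applied when controllers are attached and I/O qpairs are created.
 * They are typically adjusted through the bdev_nvme_set_options RPC before the first
 * controller is created.
 */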
#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
		uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
		struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t zone_id,
		enum spdk_bdev_zone_action action);
static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
		struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
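/*
 * Resolve the namespace and qpair that an I/O on this bdev channel should use. While the
 * controller is being reset the channel's qpair is NULL, so this returns false and the
 * caller fails the I/O instead of submitting it.
 */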
static inline bool
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
		struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
{
	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
		/* The device is currently resetting. */
		return false;
	}

	*_ns = nbdev_ch->nvme_ns->ns;
	*_qpair = nbdev_ch->ctrlr_ch->qpair;
	return true;
}

static inline bool
bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
		struct nvme_ctrlr **_nvme_ctrlr)
{
	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
		const struct spdk_nvme_cpl *cpl)
{
	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
			cpl->status.sct, cpl->status.sc);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	enum spdk_bdev_io_status io_status;

	if (rc == 0) {
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else if (rc == -ENOMEM) {
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
	} else {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
}

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
	/*
	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
	 * reconnect a qpair and we will stop getting a callback for this one.
	 */
	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	if (rc != 0) {
		SPDK_DEBUGLOG(bdev_nvme, "Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}
static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		bdev_nvme_failover(nvme_ctrlr, false);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;

	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

	nvme_ns->bdev = NULL;

	if (!nvme_ns->populated) {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

		nvme_ctrlr_release(nvme_ns->ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
	}

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	bdev_nvme_io_complete(bio, 0);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	assert(ctrlr_ch->group != NULL);

	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	ctrlr_ch->qpair = qpair;

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	if (ctrlr_ch->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
		ctrlr_ch->qpair = NULL;
	}
}

static void
_bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct_after_reset) {
		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
		pthread_mutex_unlock(&nvme_ctrlr->mutex);

		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
				nvme_ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	}
}

static void
bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
}

static void
_bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
		enum spdk_bdev_io_status status)
{
	struct spdk_bdev_io *bdev_io;

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}
}
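/*
 * Reset requests that arrive while another reset is already in progress are queued on the
 * per-channel pending_resets list (see bdev_nvme_reset_io()). Once the in-flight reset
 * finishes, the two channel iterators below drain that list, completing the queued resets
 * with success or failure depending on how the reset that actually ran turned out.
 */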
static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);

	spdk_for_each_channel_continue(i, 0);
}
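/*
 * Final step of a controller reset: clear the resetting/failover flags, record whether the
 * currently connected trid failed, invoke the requester's callback, and then walk every
 * channel to flush queued resets. The channel iteration ends in
 * bdev_nvme_check_pending_destruct(), which unregisters the controller if a destruct was
 * deferred while the reset was running.
 */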
static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
{
	struct nvme_ctrlr_trid *curr_trid;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (rc) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;
	nvme_ctrlr->failover_in_progress = false;

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid != NULL);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);

	curr_trid->is_failed = rc != 0 ? true : false;

	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
		/* Destruct ctrlr after clearing pending resets. */
		nvme_ctrlr->destruct_after_reset = true;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, rc);
	}

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			rc == 0 ? bdev_nvme_complete_pending_resets :
			bdev_nvme_abort_pending_resets,
			NULL,
			bdev_nvme_check_pending_destruct);
}

static void
bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	bdev_nvme_reset_complete(nvme_ctrlr, status);
}

static void
bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(ctrlr_ch);

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_ctrlr_reset_poll(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
	if (rc == -EAGAIN) {
		return SPDK_POLLER_BUSY;
	}

	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
	if (rc == 0) {
		/* Recreate all of the I/O queue pairs */
		spdk_for_each_channel(nvme_ctrlr,
				bdev_nvme_reset_create_qpair,
				NULL,
				bdev_nvme_reset_create_qpairs_done);
	} else {
		bdev_nvme_reset_complete(nvme_ctrlr, rc);
	}
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	int rc;

	if (status) {
		rc = status;
		goto err;
	}

	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Create controller reset context failed\n");
		goto err;
	}
	assert(nvme_ctrlr->reset_poller == NULL);
	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
			nvme_ctrlr, 0);

	return;

err:
	bdev_nvme_reset_complete(nvme_ctrlr, rc);
}

static void
bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);

	bdev_nvme_destroy_qpair(ctrlr_ch);
	spdk_for_each_channel_continue(i, 0);
}
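/*
 * Controller reset flow:
 *   1. bdev_nvme_reset() marks the controller as resetting and iterates every channel with
 *      bdev_nvme_reset_destroy_qpair() to tear down the I/O qpairs.
 *   2. bdev_nvme_reset_ctrlr() starts an asynchronous controller reset, which
 *      bdev_nvme_ctrlr_reset_poll() polls to completion.
 *   3. On success the qpairs are recreated on every channel, and bdev_nvme_reset_complete()
 *      reports the result and drains any queued resets.
 */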
static int
bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return -EBUSY;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EAGAIN;
	}

	nvme_ctrlr->resetting = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	spdk_nvme_ctrlr_prepare_for_reset(nvme_ctrlr->ctrlr);

	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_ctrlr,
			bdev_nvme_reset_destroy_qpair,
			NULL,
			bdev_nvme_reset_ctrlr);

	return 0;
}

int
bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		nvme_ctrlr->reset_cb_fn = cb_fn;
		nvme_ctrlr->reset_cb_arg = cb_arg;
	}
	return rc;
}

static void
bdev_nvme_reset_io_complete(void *cb_arg, int rc)
{
	struct nvme_bdev_io *bio = cb_arg;

	bdev_nvme_io_complete(bio, rc);
}

static int
bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
{
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
	if (rc == 0) {
		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
		ctrlr_ch->ctrlr->reset_cb_arg = bio;
	} else if (rc == -EAGAIN) {
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		bdev_io = spdk_bdev_io_from_ctx(bio);
		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}
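/*
 * Failover begins like a reset, but first rotates the controller to the next configured
 * transport ID (an alternate path). The current trid is marked failed and is either moved to
 * the tail of the list, so paths are retried round robin, or freed when the failover was
 * triggered by removal of that path.
 */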
static int
bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_ctrlr->resetting) {
		if (next_trid && !nvme_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
				curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return 0;
}

static int
bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	int rc;

	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
	if (rc == 0) {
		/* First, delete all NVMe I/O queue pairs. */
		spdk_for_each_channel(nvme_ctrlr,
				bdev_nvme_reset_destroy_qpair,
				NULL,
				bdev_nvme_reset_ctrlr);
	} else if (rc != -EBUSY) {
		return rc;
	}

	return 0;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static int
bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static void
bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		bool success)
{
	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int ret;

	if (!success) {
		ret = -EINVAL;
		goto exit;
	}

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		ret = -ENXIO;
		goto exit;
	}

	ret = bdev_nvme_readv(ns, qpair, bio,
			bdev_io->u.bdev.iovs,
			bdev_io->u.bdev.iovcnt,
			bdev_io->u.bdev.md_buf,
			bdev_io->u.bdev.num_blocks,
			bdev_io->u.bdev.offset_blocks,
			bdev->dif_check_flags,
			bdev_io->internal.ext_opts);

exit:
	if (spdk_unlikely(ret != 0)) {
		bdev_nvme_io_complete(bio, ret);
	}
}
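/*
 * Entry point for every I/O submitted to an NVMe bdev: dispatch on the bdev_io type and
 * translate the request into the corresponding NVMe command. A nonzero return from any
 * helper is converted into a bdev completion here; -ENOMEM becomes
 * SPDK_BDEV_IO_STATUS_NOMEM so the bdev layer will retry the I/O later.
 */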
static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int rc = 0;

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		rc = -ENXIO;
		goto exit;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			rc = bdev_nvme_readv(ns, qpair, nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					bdev->dif_check_flags,
					bdev_io->internal.ext_opts);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					bdev_io->u.bdev.num_blocks * bdev->blocklen);
			rc = 0;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		rc = bdev_nvme_writev(ns, qpair, nbdev_io,
				bdev_io->u.bdev.iovs,
				bdev_io->u.bdev.iovcnt,
				bdev_io->u.bdev.md_buf,
				bdev_io->u.bdev.num_blocks,
				bdev_io->u.bdev.offset_blocks,
				bdev->dif_check_flags,
				bdev_io->internal.ext_opts);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE:
		rc = bdev_nvme_comparev(ns, qpair, nbdev_io,
				bdev_io->u.bdev.iovs,
				bdev_io->u.bdev.iovcnt,
				bdev_io->u.bdev.md_buf,
				bdev_io->u.bdev.num_blocks,
				bdev_io->u.bdev.offset_blocks,
				bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		rc = bdev_nvme_comparev_and_writev(ns, qpair, nbdev_io,
				bdev_io->u.bdev.iovs,
				bdev_io->u.bdev.iovcnt,
				bdev_io->u.bdev.fused_iovs,
				bdev_io->u.bdev.fused_iovcnt,
				bdev_io->u.bdev.md_buf,
				bdev_io->u.bdev.num_blocks,
				bdev_io->u.bdev.offset_blocks,
				bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = bdev_nvme_unmap(ns, qpair, nbdev_io,
				bdev_io->u.bdev.offset_blocks,
				bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = bdev_nvme_write_zeroes(ns, qpair, nbdev_io,
				bdev_io->u.bdev.offset_blocks,
				bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = bdev_nvme_flush(ns, qpair, nbdev_io,
				bdev_io->u.bdev.offset_blocks,
				bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = bdev_nvme_zone_appendv(ns, qpair, nbdev_io,
				bdev_io->u.bdev.iovs,
				bdev_io->u.bdev.iovcnt,
				bdev_io->u.bdev.md_buf,
				bdev_io->u.bdev.num_blocks,
				bdev_io->u.bdev.offset_blocks,
				bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = bdev_nvme_get_zone_info(ns, qpair, nbdev_io,
				bdev_io->u.zone_mgmt.zone_id,
				bdev_io->u.zone_mgmt.num_zones,
				bdev_io->u.zone_mgmt.buf);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = bdev_nvme_zone_management(ns, qpair, nbdev_io,
				bdev_io->u.zone_mgmt.zone_id,
				bdev_io->u.zone_mgmt.zone_action);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		rc = bdev_nvme_admin_passthru(nbdev_ch, nbdev_io,
				&bdev_io->u.nvme_passthru.cmd,
				bdev_io->u.nvme_passthru.buf,
				bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO:
		rc = bdev_nvme_io_passthru(ns, qpair, nbdev_io,
				&bdev_io->u.nvme_passthru.cmd,
				bdev_io->u.nvme_passthru.buf,
				bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		rc = bdev_nvme_io_passthru_md(ns, qpair, nbdev_io,
				&bdev_io->u.nvme_passthru.cmd,
				bdev_io->u.nvme_passthru.buf,
				bdev_io->u.nvme_passthru.nbytes,
				bdev_io->u.nvme_passthru.md_buf,
				bdev_io->u.nvme_passthru.md_len);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		rc = bdev_nvme_abort(nbdev_ch, nbdev_io, nbdev_io_to_abort);
		break;
	default:
		rc = -EINVAL;
		break;
	}

exit:
	if (spdk_unlikely(rc != 0)) {
		bdev_nvme_io_complete(nbdev_io, rc);
	}
}
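/*
 * Report which bdev I/O types this namespace can service. Optional features are detected
 * from the identify data and controller flags: ONCS bits for unmap/write-zeroes, the
 * compare-and-write controller flag for fused commands, and the ZNS command set identifier
 * for zone operations.
 */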
static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	nvme_ns = nbdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return spdk_nvme_ns_supports_compare(ns);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return spdk_nvme_ns_get_md_size(ns) ? true : false;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.dsm;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.write_zeroes;

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
			return true;
		}
		return false;

	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;

	default:
		return false;
	}
}

static int
bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
	struct spdk_io_channel *pg_ch;
	int rc;

	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
	if (!pg_ch) {
		return -1;
	}

	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);

#ifdef SPDK_CONFIG_VTUNE
	ctrlr_ch->group->collect_spin_stat = true;
#else
	ctrlr_ch->group->collect_spin_stat = false;
#endif

	TAILQ_INIT(&ctrlr_ch->pending_resets);

	ctrlr_ch->ctrlr = nvme_ctrlr;

	rc = bdev_nvme_create_qpair(ctrlr_ch);
	if (rc != 0) {
		goto err_qpair;
	}

	return 0;

err_qpair:
	spdk_put_io_channel(pg_ch);

	return rc;
}

static void
bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;

	assert(ctrlr_ch->group != NULL);

	bdev_nvme_destroy_qpair(ctrlr_ch);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
}

static void
bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
		uint32_t iov_cnt, uint32_t seed,
		spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct nvme_poll_group *group = ctx;
	int rc;

	assert(group->accel_channel != NULL);
	assert(cb_fn != NULL);

	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
	if (rc) {
		/* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */
		if (rc == -ENOMEM || rc == -EINVAL) {
			cb_fn(cb_arg, rc);
		}
		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
	}
}

static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
};
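/*
 * Each poll group is created with the accel function table above, which lets the NVMe driver
 * offload CRC32C calculations (used, for example, by NVMe/TCP data digests) to the accel
 * engine channel owned by the group.
 */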
static int
bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
	if (group->group == NULL) {
		return -1;
	}

	group->accel_channel = spdk_accel_engine_get_io_channel();
	if (!group->accel_channel) {
		spdk_nvme_poll_group_destroy(group->group);
		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
				group);
		return -1;
	}

	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	if (group->poller == NULL) {
		spdk_put_io_channel(group->accel_channel);
		spdk_nvme_poll_group_destroy(group->group);
		return -1;
	}

	return 0;
}

static void
bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	if (group->accel_channel) {
		spdk_put_io_channel(group->accel_channel);
	}

	spdk_poller_unregister(&group->poller);
	if (spdk_nvme_poll_group_destroy(group->group)) {
		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
		assert(false);
	}
}

static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return spdk_get_io_channel(nvme_bdev);
}

static void *
bdev_nvme_get_module_ctx(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
}

static const char *
_nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
{
	switch (ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
		return "optimized";
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return "non_optimized";
	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
		return "inaccessible";
	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
		return "persistent_loss";
	case SPDK_NVME_ANA_CHANGE_STATE:
		return "change";
	default:
		return NULL;
	}
}

static int
bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct nvme_bdev *nbdev = ctx;
	struct spdk_memory_domain *domain;

	domain = spdk_nvme_ctrlr_get_memory_domain(nbdev->nvme_ns->ctrlr->ctrlr);

	if (domain) {
		if (array_size > 0 && domains) {
			domains[0] = domain;
		}
		return 1;
	}

	return 0;
}

static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct nvme_bdev *nvme_bdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	union spdk_nvme_vs_register vs;
	union spdk_nvme_csts_register csts;
	char buf[128];

	nvme_ns = nvme_bdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);

	spdk_json_write_named_object_begin(w, "nvme");

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		spdk_json_write_named_string(w, "pci_address", trid->traddr);
	}

	spdk_json_write_named_object_begin(w, "trid");

	nvme_bdev_dump_trid_json(trid, w);

	spdk_json_write_object_end(w);

#ifdef SPDK_CONFIG_NVME_CUSE
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
			cuse_name, &cuse_name_size);
	if (rc == 0) {
		spdk_json_write_named_string(w, "cuse_device", cuse_name);
	}
#endif
	spdk_json_write_named_object_begin(w, "ctrlr_data");

	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);

	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "model_number", buf);

	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "serial_number", buf);

	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "firmware_revision", buf);

	if (cdata->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
	}

	spdk_json_write_named_object_begin(w, "oacs");

	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "vs");

	spdk_json_write_name(w, "nvme_version");
	if (vs.bits.ter) {
		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
	} else {
		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "csts");

	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "ns_data");

	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));

	if (cdata->cmic.ana_reporting) {
		spdk_json_write_named_string(w, "ana_state",
				_nvme_ana_state_str(nvme_ns->ana_state));
	}

	spdk_json_write_object_end(w);

	if (cdata->oacs.security) {
		spdk_json_write_named_object_begin(w, "security");

		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);

		spdk_json_write_object_end(w);
	}

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static uint64_t
bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct nvme_poll_group *group = ctrlr_ch->group;
	uint64_t spin_time;

	if (!group || !group->collect_spin_stat) {
		return 0;
	}

	if (group->end_ticks != 0) {
		group->spin_ticks += (group->end_ticks - group->start_ticks);
		group->end_ticks = 0;
	}

	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
	group->start_ticks = 0;
	group->spin_ticks = 0;

	return spin_time;
}
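/*
 * Function table registered with every NVMe bdev; the generic bdev layer calls through these
 * pointers for I/O submission, channel management, and JSON info/config output.
 */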
static const struct spdk_bdev_fn_table nvmelib_fn_table = {
	.destruct		= bdev_nvme_destruct,
	.submit_request		= bdev_nvme_submit_request,
	.io_type_supported	= bdev_nvme_io_type_supported,
	.get_io_channel		= bdev_nvme_get_io_channel,
	.dump_info_json		= bdev_nvme_dump_info_json,
	.write_config_json	= bdev_nvme_write_config_json,
	.get_spin_time		= bdev_nvme_get_spin_time,
	.get_module_ctx		= bdev_nvme_get_module_ctx,
	.get_memory_domains	= bdev_nvme_get_memory_domains,
};

typedef int (*bdev_nvme_parse_ana_log_page_cb)(
	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);

static int
bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
		bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_ana_group_descriptor *copied_desc;
	uint8_t *orig_desc;
	uint32_t i, desc_size, copy_len;
	int rc = 0;

	if (nvme_ctrlr->ana_log_page == NULL) {
		return -EINVAL;
	}

	copied_desc = nvme_ctrlr->copied_ana_desc;

	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);

	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
		memcpy(copied_desc, orig_desc, copy_len);

		rc = cb_fn(copied_desc, cb_arg);
		if (rc != 0) {
			break;
		}

		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
				copied_desc->num_of_nsid * sizeof(uint32_t);
		orig_desc += desc_size;
		copy_len -= desc_size;
	}

	return rc;
}

static int
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
{
	struct nvme_ns *nvme_ns = cb_arg;
	uint32_t i;

	for (i = 0; i < desc->num_of_nsid; i++) {
		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
			continue;
		}
		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
		return 1;
	}

	return 0;
}
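/*
 * Fill in the spdk_bdev structure for a namespace: block size, capacity, optimal I/O
 * boundary, UUID/NGUID, end-to-end protection settings and, for ZNS namespaces, the zone
 * geometry are all derived from the controller and namespace identify data.
 */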
static int
nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
		struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
		uint32_t prchk_flags, void *ctx)
{
	const struct spdk_uuid *uuid;
	const uint8_t *nguid;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_ns_data *nsdata;
	enum spdk_nvme_csi csi;
	uint32_t atomic_bs, phys_bs, bs;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	csi = spdk_nvme_ns_get_csi(ns);

	switch (csi) {
	case SPDK_NVME_CSI_NVM:
		disk->product_name = "NVMe disk";
		break;
	case SPDK_NVME_CSI_ZNS:
		disk->product_name = "NVMe ZNS disk";
		disk->zoned = true;
		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
				spdk_nvme_ns_get_extended_sector_size(ns);
		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
		break;
	default:
		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
		return -ENOTSUP;
	}

	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
	if (!disk->name) {
		return -ENOMEM;
	}

	disk->write_cache = 0;
	if (cdata->vwc.present) {
		/* Enable if the Volatile Write Cache exists */
		disk->write_cache = 1;
	}
	if (cdata->oncs.write_zeroes) {
		disk->max_write_zeroes = UINT16_MAX + 1;
	}
	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);

	nguid = spdk_nvme_ns_get_nguid(ns);
	if (!nguid) {
		uuid = spdk_nvme_ns_get_uuid(ns);
		if (uuid) {
			disk->uuid = *uuid;
		}
	} else {
		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
	}

	nsdata = spdk_nvme_ns_get_data(ns);
	bs = spdk_nvme_ns_get_sector_size(ns);
	atomic_bs = bs;
	phys_bs = bs;
	if (nsdata->nabo == 0) {
		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
			atomic_bs = bs * (1 + nsdata->nawupf);
		} else {
			atomic_bs = bs * (1 + cdata->awupf);
		}
	}
	if (nsdata->nsfeat.optperf) {
		phys_bs = bs * (1 + nsdata->npwg);
	}
	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);

	disk->md_len = spdk_nvme_ns_get_md_size(ns);
	if (disk->md_len != 0) {
		disk->md_interleave = nsdata->flbas.extended;
		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
		if (disk->dif_type != SPDK_DIF_DISABLE) {
			disk->dif_is_head_of_md = nsdata->dps.md_start;
			disk->dif_check_flags = prchk_flags;
		}
	}

	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
		disk->acwu = 0;
	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
		disk->acwu = nsdata->nacwu;
	} else {
		disk->acwu = cdata->acwu;
	}

	disk->ctxt = ctx;
	disk->fn_table = &nvmelib_fn_table;
	disk->module = &nvme_if;

	return 0;
}

static int
nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;
	int rc;

	bdev = calloc(1, sizeof(*bdev));
	if (!bdev) {
		SPDK_ERRLOG("bdev calloc() failed\n");
		return -ENOMEM;
	}

	bdev->nvme_ns = nvme_ns;
	bdev->opal = nvme_ctrlr->opal_dev != NULL;

	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
			nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create NVMe disk\n");
		free(bdev);
		return rc;
	}

	spdk_io_device_register(bdev,
			bdev_nvme_create_bdev_channel_cb,
			bdev_nvme_destroy_bdev_channel_cb,
			sizeof(struct nvme_bdev_channel),
			bdev->disk.name);

	rc = spdk_bdev_register(&bdev->disk);
	if (rc != 0) {
		SPDK_ERRLOG("spdk_bdev_register() failed\n");
		spdk_io_device_unregister(bdev, NULL);
		free(bdev->disk.name);
		free(bdev);
		return rc;
	}

	nvme_ns->bdev = bdev;

	return 0;
}

static bool
bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
{
	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
	const struct spdk_uuid *uuid1, *uuid2;

	nsdata1 = spdk_nvme_ns_get_data(ns1);
	nsdata2 = spdk_nvme_ns_get_data(ns2);
	uuid1 = spdk_nvme_ns_get_uuid(ns1);
	uuid2 = spdk_nvme_ns_get_uuid(ns2);

	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
	       nsdata1->eui64 == nsdata2->eui64 &&
	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
}

static bool
hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
		struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_probe_skip_entry *entry;

	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
			return false;
		}
	}

	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
	opts->disable_read_ana_log_page = true;

	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);

	return true;
}

static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
				cpl->status.sct);
		bdev_nvme_reset(nvme_ctrlr);
	} else if (cpl->cdw0 & 0x1) {
		SPDK_WARNLOG("Specified command could not be aborted.\n");
		bdev_nvme_reset(nvme_ctrlr);
	}
}
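/*
 * Command timeout handler registered with the NVMe driver. A Controller Fatal Status always
 * triggers a reset; otherwise the behavior follows g_opts.action_on_timeout: try to abort
 * the timed-out command first (falling back to a reset if the abort cannot be sent or
 * fails), reset immediately, or only log the event.
 */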
static void
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
		struct spdk_nvme_qpair *qpair, uint16_t cid)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	union spdk_nvme_csts_register csts;
	int rc;

	assert(nvme_ctrlr->ctrlr == ctrlr);

	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);

	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
	 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
	 * completion recursively.
	 */
	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
		if (csts.bits.cfs) {
			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
			bdev_nvme_reset(nvme_ctrlr);
			return;
		}
	}

	switch (g_opts.action_on_timeout) {
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
		if (qpair) {
			/* Don't send abort to ctrlr when reset is running. */
			pthread_mutex_lock(&nvme_ctrlr->mutex);
			if (nvme_ctrlr->resetting) {
				pthread_mutex_unlock(&nvme_ctrlr->mutex);
				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
				return;
			}
			pthread_mutex_unlock(&nvme_ctrlr->mutex);

			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
					nvme_abort_cpl, nvme_ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
		}

	/* FALLTHROUGH */
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
		bdev_nvme_reset(nvme_ctrlr);
		break;
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
		break;
	default:
		SPDK_ERRLOG("An invalid timeout action value is found.\n");
		break;
	}
}

static void
nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns,
		struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct spdk_nvme_ns *ns;
	int rc = 0;

	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
	if (!ns) {
		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
		rc = -EINVAL;
		goto done;
	}

	nvme_ns->ns = ns;
	nvme_ns->populated = true;
	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;

	if (nvme_ctrlr->ana_log_page != NULL) {
		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
	}

	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);

done:
	if (rc == 0) {
		pthread_mutex_lock(&nvme_ctrlr->mutex);
		nvme_ctrlr->ref++;
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	} else {
		memset(nvme_ns, 0, sizeof(*nvme_ns));
	}

	if (ctx) {
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
		}
	}
}

static void
nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;

	bdev = nvme_ns->bdev;
	if (bdev != NULL) {
		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);

	nvme_ns->populated = false;

	if (nvme_ns->bdev != NULL) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return;
	}
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	nvme_ctrlr_release(nvme_ctrlr);
}
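/*
 * Reconcile the controller's active namespace list with the bdevs we expose: resize bdevs
 * whose namespace changed capacity, create bdevs for namespaces that became active, and tear
 * down bdevs for namespaces that are no longer active. When called from the attach path,
 * ctx counts how many populate operations are still outstanding.
 */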
static void
nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct nvme_bdev *bdev;
	uint32_t i;
	int rc;
	uint64_t num_sectors;
	bool ns_is_active;

	if (ctx) {
		/* Initialize this count to 1 to handle the populate functions
		 * calling nvme_ctrlr_populate_namespace_done() immediately.
		 */
		ctx->populates_in_progress = 1;
	}

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[i];
		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);

		if (nvme_ns->populated && ns_is_active) {
			/* NS is still there but attributes may have changed */
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
			bdev = nvme_ns->bdev;
			assert(bdev != NULL);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
						nsid,
						bdev->disk.name,
						bdev->disk.blockcnt,
						num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
							bdev->disk.name, rc);
				}
			}
		}

		if (!nvme_ns->populated && ns_is_active) {
			nvme_ns->id = nsid;
			nvme_ns->ctrlr = nvme_ctrlr;

			nvme_ns->bdev = NULL;

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns, ctx);
		}

		if (nvme_ns->populated && !ns_is_active) {
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
		 */
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
		}
	}

}

static void
nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t i;
	struct nvme_ns *nvme_ns;

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		if (nvme_ns->populated) {
			assert(nvme_ns->id == nsid);
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}
	}
}

static bool
nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return false;
	}
	nvme_ctrlr->ref++;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return true;
}

static int
nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
		void *cb_arg)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	struct nvme_ns *nvme_ns;
	uint32_t i, nsid;

	for (i = 0; i < desc->num_of_nsid; i++) {
		nsid = desc->nsid[i];
		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
			continue;
		}

		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		assert(nvme_ns != NULL);

		if (!nvme_ns->populated) {
			continue;
		}

		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
	}

	return 0;
}

static void
nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	if (spdk_nvme_cpl_is_success(cpl)) {
		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
				nvme_ctrlr);
	}
nvme_ctrlr_release(nvme_ctrlr); 1893 } 1894 1895 static void 1896 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 1897 { 1898 int rc; 1899 1900 if (nvme_ctrlr->ana_log_page == NULL) { 1901 return; 1902 } 1903 1904 if (!nvme_ctrlr_acquire(nvme_ctrlr)) { 1905 return; 1906 } 1907 1908 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 1909 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 1910 SPDK_NVME_GLOBAL_NS_TAG, 1911 nvme_ctrlr->ana_log_page, 1912 nvme_ctrlr->ana_log_page_size, 0, 1913 nvme_ctrlr_read_ana_log_page_done, 1914 nvme_ctrlr); 1915 if (rc != 0) { 1916 nvme_ctrlr_release(nvme_ctrlr); 1917 } 1918 } 1919 1920 static void 1921 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1922 { 1923 struct nvme_ctrlr *nvme_ctrlr = arg; 1924 union spdk_nvme_async_event_completion event; 1925 1926 if (spdk_nvme_cpl_is_error(cpl)) { 1927 SPDK_WARNLOG("AER request execute failed"); 1928 return; 1929 } 1930 1931 event.raw = cpl->cdw0; 1932 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1933 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1934 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 1935 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1936 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 1937 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 1938 } 1939 } 1940 1941 static void 1942 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1943 { 1944 if (ctx->cb_fn) { 1945 ctx->cb_fn(ctx->cb_ctx, count, rc); 1946 } 1947 1948 ctx->namespaces_populated = true; 1949 if (ctx->probe_done) { 1950 /* The probe was already completed, so we need to free the context 1951 * here. This can happen for cases like OCSSD, where we need to 1952 * send additional commands to the SSD after attach. 
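 * Otherwise bdev_nvme_async_poll() still owns the context and frees it once the probe
 * completes.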
1953 */ 1954 free(ctx); 1955 } 1956 } 1957 1958 static void 1959 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 1960 struct nvme_async_probe_ctx *ctx) 1961 { 1962 spdk_io_device_register(nvme_ctrlr, 1963 bdev_nvme_create_ctrlr_channel_cb, 1964 bdev_nvme_destroy_ctrlr_channel_cb, 1965 sizeof(struct nvme_ctrlr_channel), 1966 nvme_ctrlr->name); 1967 1968 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 1969 } 1970 1971 static void 1972 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 1973 { 1974 struct nvme_ctrlr *nvme_ctrlr = _ctx; 1975 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 1976 1977 nvme_ctrlr->probe_ctx = NULL; 1978 1979 if (spdk_nvme_cpl_is_error(cpl)) { 1980 nvme_ctrlr_delete(nvme_ctrlr); 1981 1982 if (ctx != NULL) { 1983 populate_namespaces_cb(ctx, 0, -1); 1984 } 1985 return; 1986 } 1987 1988 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 1989 } 1990 1991 static int 1992 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 1993 struct nvme_async_probe_ctx *ctx) 1994 { 1995 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 1996 const struct spdk_nvme_ctrlr_data *cdata; 1997 uint32_t ana_log_page_size; 1998 1999 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2000 2001 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 2002 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn * 2003 sizeof(uint32_t); 2004 2005 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 2006 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2007 if (nvme_ctrlr->ana_log_page == NULL) { 2008 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 2009 return -ENXIO; 2010 } 2011 2012 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 2013 * Hence copy each descriptor to a temporary area when parsing it. 2014 * 2015 * Allocate a buffer whose size is as large as ANA log page buffer because 2016 * we do not know the size of a descriptor until actually reading it. 
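 * Each descriptor's size depends on its num_of_nsid field, so the worst case is a single
 * descriptor covering the entire log page.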
2017 */ 2018 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 2019 if (nvme_ctrlr->copied_ana_desc == NULL) { 2020 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 2021 return -ENOMEM; 2022 } 2023 2024 nvme_ctrlr->ana_log_page_size = ana_log_page_size; 2025 2026 nvme_ctrlr->probe_ctx = ctx; 2027 2028 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 2029 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 2030 SPDK_NVME_GLOBAL_NS_TAG, 2031 nvme_ctrlr->ana_log_page, 2032 nvme_ctrlr->ana_log_page_size, 0, 2033 nvme_ctrlr_init_ana_log_page_done, 2034 nvme_ctrlr); 2035 } 2036 2037 static int 2038 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 2039 const char *name, 2040 const struct spdk_nvme_transport_id *trid, 2041 uint32_t prchk_flags, 2042 struct nvme_async_probe_ctx *ctx) 2043 { 2044 struct nvme_ctrlr *nvme_ctrlr; 2045 struct nvme_ctrlr_trid *trid_entry; 2046 uint32_t i, num_ns; 2047 const struct spdk_nvme_ctrlr_data *cdata; 2048 int rc; 2049 2050 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 2051 if (nvme_ctrlr == NULL) { 2052 SPDK_ERRLOG("Failed to allocate device struct\n"); 2053 return -ENOMEM; 2054 } 2055 2056 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 2057 if (rc != 0) { 2058 free(nvme_ctrlr); 2059 return rc; 2060 } 2061 2062 TAILQ_INIT(&nvme_ctrlr->trids); 2063 2064 num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 2065 if (num_ns != 0) { 2066 nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *)); 2067 if (!nvme_ctrlr->namespaces) { 2068 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 2069 rc = -ENOMEM; 2070 goto err; 2071 } 2072 2073 for (i = 0; i < num_ns; i++) { 2074 nvme_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_ns)); 2075 if (nvme_ctrlr->namespaces[i] == NULL) { 2076 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 2077 rc = -ENOMEM; 2078 goto err; 2079 } 2080 nvme_ctrlr->num_ns++; 2081 } 2082 2083 assert(num_ns == nvme_ctrlr->num_ns); 2084 } 2085 2086 trid_entry = calloc(1, sizeof(*trid_entry)); 2087 if (trid_entry == NULL) { 2088 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 2089 rc = -ENOMEM; 2090 goto err; 2091 } 2092 2093 trid_entry->trid = *trid; 2094 nvme_ctrlr->connected_trid = &trid_entry->trid; 2095 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link); 2096 2097 nvme_ctrlr->thread = spdk_get_thread(); 2098 nvme_ctrlr->ctrlr = ctrlr; 2099 nvme_ctrlr->ref = 1; 2100 nvme_ctrlr->name = strdup(name); 2101 if (nvme_ctrlr->name == NULL) { 2102 rc = -ENOMEM; 2103 goto err; 2104 } 2105 2106 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 2107 SPDK_ERRLOG("OCSSDs are not supported"); 2108 rc = -ENOTSUP; 2109 goto err; 2110 } 2111 2112 nvme_ctrlr->prchk_flags = prchk_flags; 2113 2114 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 2115 g_opts.nvme_adminq_poll_period_us); 2116 2117 pthread_mutex_lock(&g_bdev_nvme_mutex); 2118 TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq); 2119 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2120 2121 if (g_opts.timeout_us > 0) { 2122 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 2123 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 2124 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
2125 g_opts.timeout_us : g_opts.timeout_admin_us; 2126 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 2127 adm_timeout_us, timeout_cb, nvme_ctrlr); 2128 } 2129 2130 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 2131 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 2132 2133 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2134 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 2135 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 2136 } 2137 2138 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2139 2140 if (cdata->cmic.ana_reporting) { 2141 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 2142 if (rc == 0) { 2143 return 0; 2144 } 2145 } else { 2146 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 2147 return 0; 2148 } 2149 2150 err: 2151 nvme_ctrlr_delete(nvme_ctrlr); 2152 return rc; 2153 } 2154 2155 static void 2156 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2157 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2158 { 2159 struct nvme_probe_ctx *ctx = cb_ctx; 2160 char *name = NULL; 2161 uint32_t prchk_flags = 0; 2162 size_t i; 2163 2164 if (ctx) { 2165 for (i = 0; i < ctx->count; i++) { 2166 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 2167 prchk_flags = ctx->prchk_flags[i]; 2168 name = strdup(ctx->names[i]); 2169 break; 2170 } 2171 } 2172 } else { 2173 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 2174 } 2175 if (!name) { 2176 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 2177 return; 2178 } 2179 2180 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 2181 2182 nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL); 2183 2184 free(name); 2185 } 2186 2187 static void 2188 _nvme_ctrlr_destruct(void *ctx) 2189 { 2190 struct nvme_ctrlr *nvme_ctrlr = ctx; 2191 2192 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 2193 nvme_ctrlr_release(nvme_ctrlr); 2194 } 2195 2196 static int 2197 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 2198 { 2199 struct nvme_probe_skip_entry *entry; 2200 2201 pthread_mutex_lock(&nvme_ctrlr->mutex); 2202 2203 /* The controller's destruction was already started */ 2204 if (nvme_ctrlr->destruct) { 2205 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2206 return 0; 2207 } 2208 2209 if (!hotplug && 2210 nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2211 entry = calloc(1, sizeof(*entry)); 2212 if (!entry) { 2213 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2214 return -ENOMEM; 2215 } 2216 entry->trid = *nvme_ctrlr->connected_trid; 2217 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2218 } 2219 2220 nvme_ctrlr->destruct = true; 2221 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2222 2223 _nvme_ctrlr_destruct(nvme_ctrlr); 2224 2225 return 0; 2226 } 2227 2228 static void 2229 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 2230 { 2231 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 2232 2233 _bdev_nvme_delete(nvme_ctrlr, true); 2234 } 2235 2236 static int 2237 bdev_nvme_hotplug_probe(void *arg) 2238 { 2239 if (g_hotplug_probe_ctx == NULL) { 2240 spdk_poller_unregister(&g_hotplug_probe_poller); 2241 return SPDK_POLLER_IDLE; 2242 } 2243 2244 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 2245 g_hotplug_probe_ctx = NULL; 2246 spdk_poller_unregister(&g_hotplug_probe_poller); 2247 } 2248 2249 return SPDK_POLLER_BUSY; 2250 } 2251 2252 static int 2253 bdev_nvme_hotplug(void *arg) 2254 { 2255 struct spdk_nvme_transport_id trid_pcie; 2256 2257 if (g_hotplug_probe_ctx) { 
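		/* A previous PCIe probe is still in progress; let bdev_nvme_hotplug_probe()
		 * finish it before starting a new one.
		 */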
2258 return SPDK_POLLER_BUSY; 2259 } 2260 2261 memset(&trid_pcie, 0, sizeof(trid_pcie)); 2262 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 2263 2264 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 2265 hotplug_probe_cb, attach_cb, NULL); 2266 2267 if (g_hotplug_probe_ctx) { 2268 assert(g_hotplug_probe_poller == NULL); 2269 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 2270 } 2271 2272 return SPDK_POLLER_BUSY; 2273 } 2274 2275 void 2276 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 2277 { 2278 *opts = g_opts; 2279 } 2280 2281 static int 2282 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 2283 { 2284 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 2285 /* Can't set timeout_admin_us without also setting timeout_us */ 2286 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 2287 return -EINVAL; 2288 } 2289 2290 return 0; 2291 } 2292 2293 int 2294 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 2295 { 2296 int ret = bdev_nvme_validate_opts(opts); 2297 if (ret) { 2298 SPDK_WARNLOG("Failed to set nvme opts.\n"); 2299 return ret; 2300 } 2301 2302 if (g_bdev_nvme_init_thread != NULL) { 2303 if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) { 2304 return -EPERM; 2305 } 2306 } 2307 2308 g_opts = *opts; 2309 2310 return 0; 2311 } 2312 2313 struct set_nvme_hotplug_ctx { 2314 uint64_t period_us; 2315 bool enabled; 2316 spdk_msg_fn fn; 2317 void *fn_ctx; 2318 }; 2319 2320 static void 2321 set_nvme_hotplug_period_cb(void *_ctx) 2322 { 2323 struct set_nvme_hotplug_ctx *ctx = _ctx; 2324 2325 spdk_poller_unregister(&g_hotplug_poller); 2326 if (ctx->enabled) { 2327 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 2328 } 2329 2330 g_nvme_hotplug_poll_period_us = ctx->period_us; 2331 g_nvme_hotplug_enabled = ctx->enabled; 2332 if (ctx->fn) { 2333 ctx->fn(ctx->fn_ctx); 2334 } 2335 2336 free(ctx); 2337 } 2338 2339 int 2340 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 2341 { 2342 struct set_nvme_hotplug_ctx *ctx; 2343 2344 if (enabled == true && !spdk_process_is_primary()) { 2345 return -EPERM; 2346 } 2347 2348 ctx = calloc(1, sizeof(*ctx)); 2349 if (ctx == NULL) { 2350 return -ENOMEM; 2351 } 2352 2353 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 2354 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 2355 ctx->enabled = enabled; 2356 ctx->fn = cb; 2357 ctx->fn_ctx = cb_ctx; 2358 2359 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 2360 return 0; 2361 } 2362 2363 static void 2364 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 2365 struct nvme_async_probe_ctx *ctx) 2366 { 2367 struct nvme_ns *nvme_ns; 2368 struct nvme_bdev *nvme_bdev; 2369 uint32_t i, nsid; 2370 size_t j; 2371 2372 assert(nvme_ctrlr != NULL); 2373 2374 /* 2375 * Report the new bdevs that were created in this call. 2376 * There can be more than one bdev per NVMe controller. 2377 */ 2378 j = 0; 2379 for (i = 0; i < nvme_ctrlr->num_ns; i++) { 2380 nsid = i + 1; 2381 nvme_ns = nvme_ctrlr->namespaces[nsid - 1]; 2382 if (!nvme_ns->populated) { 2383 continue; 2384 } 2385 assert(nvme_ns->id == nsid); 2386 nvme_bdev = nvme_ns->bdev; 2387 if (j < ctx->count) { 2388 ctx->names[j] = nvme_bdev->disk.name; 2389 j++; 2390 } else { 2391 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. 
Unable to return all names of created bdevs\n", 2392 ctx->count); 2393 populate_namespaces_cb(ctx, 0, -ERANGE); 2394 return; 2395 } 2396 } 2397 2398 populate_namespaces_cb(ctx, j, 0); 2399 } 2400 2401 static int 2402 bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr, 2403 struct spdk_nvme_ctrlr *new_ctrlr, 2404 struct spdk_nvme_transport_id *trid) 2405 { 2406 struct nvme_ctrlr_trid *tmp_trid; 2407 2408 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2409 SPDK_ERRLOG("PCIe failover is not supported.\n"); 2410 return -ENOTSUP; 2411 } 2412 2413 /* Currently we only support failover to the same transport type. */ 2414 if (nvme_ctrlr->connected_trid->trtype != trid->trtype) { 2415 return -EINVAL; 2416 } 2417 2418 /* Currently we only support failover to the same NQN. */ 2419 if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 2420 return -EINVAL; 2421 } 2422 2423 /* Skip all the other checks if we've already registered this path. */ 2424 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 2425 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 2426 return -EEXIST; 2427 } 2428 } 2429 2430 return 0; 2431 } 2432 2433 static int 2434 bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr, 2435 struct spdk_nvme_ctrlr *new_ctrlr) 2436 { 2437 uint32_t i, nsid; 2438 struct nvme_ns *nvme_ns; 2439 struct spdk_nvme_ns *new_ns; 2440 2441 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) { 2442 return -EINVAL; 2443 } 2444 2445 for (i = 0; i < nvme_ctrlr->num_ns; i++) { 2446 nsid = i + 1; 2447 2448 nvme_ns = nvme_ctrlr->namespaces[i]; 2449 if (!nvme_ns->populated) { 2450 continue; 2451 } 2452 2453 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 2454 assert(new_ns != NULL); 2455 2456 if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) { 2457 return -EINVAL; 2458 } 2459 } 2460 2461 return 0; 2462 } 2463 2464 static int 2465 _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2466 struct spdk_nvme_transport_id *trid) 2467 { 2468 struct nvme_ctrlr_trid *new_trid, *tmp_trid; 2469 2470 new_trid = calloc(1, sizeof(*new_trid)); 2471 if (new_trid == NULL) { 2472 return -ENOMEM; 2473 } 2474 new_trid->trid = *trid; 2475 new_trid->is_failed = false; 2476 2477 TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) { 2478 if (tmp_trid->is_failed) { 2479 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 2480 return 0; 2481 } 2482 } 2483 2484 TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link); 2485 return 0; 2486 } 2487 2488 /* This is the case that a secondary path is added to an existing 2489 * nvme_ctrlr for failover. After checking if it can access the same 2490 * namespaces as the primary path, it is disconnected until failover occurs. 
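 * Only the transport ID is recorded; the temporary controller handle is detached again
 * below via spdk_nvme_detach().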
2491 */ 2492 static int 2493 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2494 struct spdk_nvme_ctrlr *new_ctrlr, 2495 struct spdk_nvme_transport_id *trid) 2496 { 2497 int rc; 2498 2499 assert(nvme_ctrlr != NULL); 2500 2501 pthread_mutex_lock(&nvme_ctrlr->mutex); 2502 2503 rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid); 2504 if (rc != 0) { 2505 goto exit; 2506 } 2507 2508 rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr); 2509 if (rc != 0) { 2510 goto exit; 2511 } 2512 2513 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 2514 2515 exit: 2516 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2517 2518 spdk_nvme_detach(new_ctrlr); 2519 2520 return rc; 2521 } 2522 2523 static void 2524 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2525 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2526 { 2527 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2528 struct nvme_async_probe_ctx *ctx; 2529 int rc; 2530 2531 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2532 ctx->ctrlr_attached = true; 2533 2534 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx); 2535 if (rc != 0) { 2536 populate_namespaces_cb(ctx, 0, rc); 2537 } 2538 } 2539 2540 static void 2541 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2542 struct spdk_nvme_ctrlr *ctrlr, 2543 const struct spdk_nvme_ctrlr_opts *opts) 2544 { 2545 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2546 struct nvme_ctrlr *nvme_ctrlr; 2547 struct nvme_async_probe_ctx *ctx; 2548 int rc; 2549 2550 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2551 ctx->ctrlr_attached = true; 2552 2553 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 2554 if (nvme_ctrlr) { 2555 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 2556 } else { 2557 rc = -ENODEV; 2558 } 2559 2560 populate_namespaces_cb(ctx, 0, rc); 2561 } 2562 2563 static int 2564 bdev_nvme_async_poll(void *arg) 2565 { 2566 struct nvme_async_probe_ctx *ctx = arg; 2567 int rc; 2568 2569 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2570 if (spdk_unlikely(rc != -EAGAIN)) { 2571 ctx->probe_done = true; 2572 spdk_poller_unregister(&ctx->poller); 2573 if (!ctx->ctrlr_attached) { 2574 /* The probe is done, but no controller was attached. 2575 * That means we had a failure, so report -EIO back to 2576 * the caller (usually the RPC). populate_namespaces_cb() 2577 * will take care of freeing the nvme_async_probe_ctx. 2578 */ 2579 populate_namespaces_cb(ctx, 0, -EIO); 2580 } else if (ctx->namespaces_populated) { 2581 /* The namespaces for the attached controller were all 2582 * populated and the response was already sent to the 2583 * caller (usually the RPC). So free the context here. 2584 */ 2585 free(ctx); 2586 } 2587 } 2588 2589 return SPDK_POLLER_BUSY; 2590 } 2591 2592 int 2593 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2594 struct spdk_nvme_host_id *hostid, 2595 const char *base_name, 2596 const char **names, 2597 uint32_t count, 2598 uint32_t prchk_flags, 2599 spdk_bdev_create_nvme_fn cb_fn, 2600 void *cb_ctx, 2601 struct spdk_nvme_ctrlr_opts *opts) 2602 { 2603 struct nvme_probe_skip_entry *entry, *tmp; 2604 struct nvme_async_probe_ctx *ctx; 2605 spdk_nvme_attach_cb attach_cb; 2606 2607 /* TODO expand this check to include both the host and target TRIDs. 2608 * Only if both are the same should we fail. 
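 * Today only the target TRID is compared, so a connect request that differs only in the
 * host identifiers is still rejected with -EEXIST.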
2609 */ 2610 if (nvme_ctrlr_get(trid) != NULL) { 2611 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2612 return -EEXIST; 2613 } 2614 2615 ctx = calloc(1, sizeof(*ctx)); 2616 if (!ctx) { 2617 return -ENOMEM; 2618 } 2619 ctx->base_name = base_name; 2620 ctx->names = names; 2621 ctx->count = count; 2622 ctx->cb_fn = cb_fn; 2623 ctx->cb_ctx = cb_ctx; 2624 ctx->prchk_flags = prchk_flags; 2625 ctx->trid = *trid; 2626 2627 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2628 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2629 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2630 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2631 free(entry); 2632 break; 2633 } 2634 } 2635 } 2636 2637 if (opts) { 2638 memcpy(&ctx->opts, opts, sizeof(*opts)); 2639 } else { 2640 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2641 } 2642 2643 ctx->opts.transport_retry_count = g_opts.retry_count; 2644 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2645 ctx->opts.disable_read_ana_log_page = true; 2646 2647 if (hostid->hostaddr[0] != '\0') { 2648 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2649 } 2650 2651 if (hostid->hostsvcid[0] != '\0') { 2652 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2653 } 2654 2655 if (nvme_ctrlr_get_by_name(base_name) == NULL) { 2656 attach_cb = connect_attach_cb; 2657 } else { 2658 attach_cb = connect_set_failover_cb; 2659 } 2660 2661 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb); 2662 if (ctx->probe_ctx == NULL) { 2663 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2664 free(ctx); 2665 return -ENODEV; 2666 } 2667 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2668 2669 return 0; 2670 } 2671 2672 static int 2673 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2674 const struct spdk_nvme_transport_id *trid) 2675 { 2676 struct nvme_ctrlr_trid *ctrlr_trid, *tmp_trid; 2677 2678 if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) { 2679 return -EBUSY; 2680 } 2681 2682 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) { 2683 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2684 TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link); 2685 free(ctrlr_trid); 2686 return 0; 2687 } 2688 } 2689 2690 return -ENXIO; 2691 } 2692 2693 int 2694 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2695 { 2696 struct nvme_ctrlr *nvme_ctrlr; 2697 struct nvme_ctrlr_trid *ctrlr_trid; 2698 2699 if (name == NULL) { 2700 return -EINVAL; 2701 } 2702 2703 nvme_ctrlr = nvme_ctrlr_get_by_name(name); 2704 if (nvme_ctrlr == NULL) { 2705 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2706 return -ENODEV; 2707 } 2708 2709 /* case 1: remove the controller itself. */ 2710 if (trid == NULL) { 2711 return _bdev_nvme_delete(nvme_ctrlr, false); 2712 } 2713 2714 /* case 2: we are currently using the path to be removed. */ 2715 if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) { 2716 ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids); 2717 assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid); 2718 /* case 2A: the current path is the only path. */ 2719 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2720 return _bdev_nvme_delete(nvme_ctrlr, false); 2721 } 2722 2723 /* case 2B: there is an alternative path. 
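 * Fail over to the alternative path instead of deleting the whole controller.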
*/ 2724 return bdev_nvme_failover(nvme_ctrlr, true); 2725 } 2726 2727 /* case 3: We are not using the specified path. */ 2728 return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid); 2729 } 2730 2731 static int 2732 bdev_nvme_library_init(void) 2733 { 2734 g_bdev_nvme_init_thread = spdk_get_thread(); 2735 2736 spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb, 2737 bdev_nvme_destroy_poll_group_cb, 2738 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 2739 2740 return 0; 2741 } 2742 2743 static void 2744 bdev_nvme_library_fini(void) 2745 { 2746 struct nvme_ctrlr *nvme_ctrlr, *tmp; 2747 struct nvme_probe_skip_entry *entry, *entry_tmp; 2748 2749 spdk_poller_unregister(&g_hotplug_poller); 2750 free(g_hotplug_probe_ctx); 2751 g_hotplug_probe_ctx = NULL; 2752 2753 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2754 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2755 free(entry); 2756 } 2757 2758 pthread_mutex_lock(&g_bdev_nvme_mutex); 2759 TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) { 2760 pthread_mutex_lock(&nvme_ctrlr->mutex); 2761 if (nvme_ctrlr->destruct) { 2762 /* This controller's destruction was already started 2763 * before the application started shutting down 2764 */ 2765 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2766 continue; 2767 } 2768 nvme_ctrlr->destruct = true; 2769 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2770 2771 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 2772 nvme_ctrlr); 2773 } 2774 2775 g_bdev_nvme_module_finish = true; 2776 if (TAILQ_EMPTY(&g_nvme_ctrlrs)) { 2777 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2778 spdk_io_device_unregister(&g_nvme_ctrlrs, NULL); 2779 spdk_bdev_module_fini_done(); 2780 return; 2781 } 2782 2783 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2784 } 2785 2786 static void 2787 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 2788 { 2789 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2790 struct spdk_bdev *bdev = bdev_io->bdev; 2791 struct spdk_dif_ctx dif_ctx; 2792 struct spdk_dif_error err_blk = {}; 2793 int rc; 2794 2795 rc = spdk_dif_ctx_init(&dif_ctx, 2796 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2797 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2798 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2799 if (rc != 0) { 2800 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2801 return; 2802 } 2803 2804 if (bdev->md_interleave) { 2805 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2806 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2807 } else { 2808 struct iovec md_iov = { 2809 .iov_base = bdev_io->u.bdev.md_buf, 2810 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2811 }; 2812 2813 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2814 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2815 } 2816 2817 if (rc != 0) { 2818 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2819 err_blk.err_type, err_blk.err_offset); 2820 } else { 2821 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2822 } 2823 } 2824 2825 static void 2826 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2827 { 2828 struct nvme_bdev_io *bio = ref; 2829 2830 if (spdk_nvme_cpl_is_success(cpl)) { 2831 /* Run PI verification for read data buffer. 
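 * The data was re-read with PI checking disabled, so a software DIF/DIX verify can report
 * the exact failing block and error type.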
*/ 2832 bdev_nvme_verify_pi_error(bio); 2833 } 2834 2835 /* Return original completion status */ 2836 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2837 } 2838 2839 static void 2840 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2841 { 2842 struct nvme_bdev_io *bio = ref; 2843 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2844 struct nvme_bdev_channel *nbdev_ch; 2845 struct spdk_nvme_ns *ns; 2846 struct spdk_nvme_qpair *qpair; 2847 int ret; 2848 2849 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2850 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2851 cpl->status.sct, cpl->status.sc); 2852 2853 /* Save completion status to use after verifying PI error. */ 2854 bio->cpl = *cpl; 2855 2856 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2857 2858 if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) { 2859 /* Read without PI checking to verify PI error. */ 2860 ret = bdev_nvme_no_pi_readv(ns, 2861 qpair, 2862 bio, 2863 bdev_io->u.bdev.iovs, 2864 bdev_io->u.bdev.iovcnt, 2865 bdev_io->u.bdev.md_buf, 2866 bdev_io->u.bdev.num_blocks, 2867 bdev_io->u.bdev.offset_blocks); 2868 if (ret == 0) { 2869 return; 2870 } 2871 } 2872 } 2873 2874 bdev_nvme_io_complete_nvme_status(bio, cpl); 2875 } 2876 2877 static void 2878 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2879 { 2880 struct nvme_bdev_io *bio = ref; 2881 2882 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2883 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2884 cpl->status.sct, cpl->status.sc); 2885 /* Run PI verification for write data buffer if PI error is detected. */ 2886 bdev_nvme_verify_pi_error(bio); 2887 } 2888 2889 bdev_nvme_io_complete_nvme_status(bio, cpl); 2890 } 2891 2892 static void 2893 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2894 { 2895 struct nvme_bdev_io *bio = ref; 2896 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2897 2898 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 2899 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 2900 */ 2901 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 2902 2903 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2904 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 2905 cpl->status.sct, cpl->status.sc); 2906 /* Run PI verification for zone append data buffer if PI error is detected. */ 2907 bdev_nvme_verify_pi_error(bio); 2908 } 2909 2910 bdev_nvme_io_complete_nvme_status(bio, cpl); 2911 } 2912 2913 static void 2914 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2915 { 2916 struct nvme_bdev_io *bio = ref; 2917 2918 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2919 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2920 cpl->status.sct, cpl->status.sc); 2921 /* Run PI verification for compare data buffer if PI error is detected. 
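 * As with reads and writes, verification only logs the failing block; the original
 * completion status is still returned to the bdev layer below.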
*/ 2922 bdev_nvme_verify_pi_error(bio); 2923 } 2924 2925 bdev_nvme_io_complete_nvme_status(bio, cpl); 2926 } 2927 2928 static void 2929 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2930 { 2931 struct nvme_bdev_io *bio = ref; 2932 2933 /* Compare operation completion */ 2934 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2935 /* Save compare result for write callback */ 2936 bio->cpl = *cpl; 2937 return; 2938 } 2939 2940 /* Write operation completion */ 2941 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2942 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2943 * complete the IO with the compare operation's status. 2944 */ 2945 if (!spdk_nvme_cpl_is_error(cpl)) { 2946 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2947 } 2948 2949 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2950 } else { 2951 bdev_nvme_io_complete_nvme_status(bio, cpl); 2952 } 2953 } 2954 2955 static void 2956 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2957 { 2958 struct nvme_bdev_io *bio = ref; 2959 2960 bdev_nvme_io_complete_nvme_status(bio, cpl); 2961 } 2962 2963 static int 2964 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 2965 { 2966 switch (desc->zs) { 2967 case SPDK_NVME_ZONE_STATE_EMPTY: 2968 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 2969 break; 2970 case SPDK_NVME_ZONE_STATE_IOPEN: 2971 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 2972 break; 2973 case SPDK_NVME_ZONE_STATE_EOPEN: 2974 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 2975 break; 2976 case SPDK_NVME_ZONE_STATE_CLOSED: 2977 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 2978 break; 2979 case SPDK_NVME_ZONE_STATE_RONLY: 2980 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 2981 break; 2982 case SPDK_NVME_ZONE_STATE_FULL: 2983 info->state = SPDK_BDEV_ZONE_STATE_FULL; 2984 break; 2985 case SPDK_NVME_ZONE_STATE_OFFLINE: 2986 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 2987 break; 2988 default: 2989 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 2990 return -EIO; 2991 } 2992 2993 info->zone_id = desc->zslba; 2994 info->write_pointer = desc->wp; 2995 info->capacity = desc->zcap; 2996 2997 return 0; 2998 } 2999 3000 static void 3001 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 3002 { 3003 struct nvme_bdev_io *bio = ref; 3004 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3005 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 3006 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3007 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 3008 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 3009 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 3010 uint64_t max_zones_per_buf, i; 3011 uint32_t zone_report_bufsize; 3012 struct spdk_nvme_ns *ns; 3013 struct spdk_nvme_qpair *qpair; 3014 int ret; 3015 3016 if (spdk_nvme_cpl_is_error(cpl)) { 3017 goto out_complete_io_nvme_cpl; 3018 } 3019 3020 if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) { 3021 ret = -ENXIO; 3022 goto out_complete_io_ret; 3023 } 3024 3025 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3026 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 3027 sizeof(bio->zone_report_buf->descs[0]); 3028 3029 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 3030 ret = -EINVAL; 3031 goto out_complete_io_ret; 3032 } 3033 3034 if (!bio->zone_report_buf->nr_zones) { 3035 ret = -EINVAL; 3036 goto 
out_complete_io_ret; 3037 } 3038 3039 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 3040 ret = fill_zone_from_report(&info[bio->handled_zones], 3041 &bio->zone_report_buf->descs[i]); 3042 if (ret) { 3043 goto out_complete_io_ret; 3044 } 3045 bio->handled_zones++; 3046 } 3047 3048 if (bio->handled_zones < zones_to_copy) { 3049 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3050 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 3051 3052 memset(bio->zone_report_buf, 0, zone_report_bufsize); 3053 ret = spdk_nvme_zns_report_zones(ns, qpair, 3054 bio->zone_report_buf, zone_report_bufsize, 3055 slba, SPDK_NVME_ZRA_LIST_ALL, true, 3056 bdev_nvme_get_zone_info_done, bio); 3057 if (!ret) { 3058 return; 3059 } else { 3060 goto out_complete_io_ret; 3061 } 3062 } 3063 3064 out_complete_io_nvme_cpl: 3065 free(bio->zone_report_buf); 3066 bio->zone_report_buf = NULL; 3067 bdev_nvme_io_complete_nvme_status(bio, cpl); 3068 return; 3069 3070 out_complete_io_ret: 3071 free(bio->zone_report_buf); 3072 bio->zone_report_buf = NULL; 3073 bdev_nvme_io_complete(bio, ret); 3074 } 3075 3076 static void 3077 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 3078 { 3079 struct nvme_bdev_io *bio = ref; 3080 3081 bdev_nvme_io_complete_nvme_status(bio, cpl); 3082 } 3083 3084 static void 3085 bdev_nvme_admin_passthru_completion(void *ctx) 3086 { 3087 struct nvme_bdev_io *bio = ctx; 3088 3089 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 3090 } 3091 3092 static void 3093 bdev_nvme_abort_completion(void *ctx) 3094 { 3095 struct nvme_bdev_io *bio = ctx; 3096 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3097 3098 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 3099 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3100 } else { 3101 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3102 } 3103 } 3104 3105 static void 3106 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 3107 { 3108 struct nvme_bdev_io *bio = ref; 3109 3110 bio->cpl = *cpl; 3111 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 3112 } 3113 3114 static void 3115 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 3116 { 3117 struct nvme_bdev_io *bio = ref; 3118 3119 bio->cpl = *cpl; 3120 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 3121 } 3122 3123 static void 3124 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 3125 { 3126 struct nvme_bdev_io *bio = ref; 3127 struct iovec *iov; 3128 3129 bio->iov_offset = sgl_offset; 3130 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 3131 iov = &bio->iovs[bio->iovpos]; 3132 if (bio->iov_offset < iov->iov_len) { 3133 break; 3134 } 3135 3136 bio->iov_offset -= iov->iov_len; 3137 } 3138 } 3139 3140 static int 3141 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 3142 { 3143 struct nvme_bdev_io *bio = ref; 3144 struct iovec *iov; 3145 3146 assert(bio->iovpos < bio->iovcnt); 3147 3148 iov = &bio->iovs[bio->iovpos]; 3149 3150 *address = iov->iov_base; 3151 *length = iov->iov_len; 3152 3153 if (bio->iov_offset) { 3154 assert(bio->iov_offset <= iov->iov_len); 3155 *address += bio->iov_offset; 3156 *length -= bio->iov_offset; 3157 } 3158 3159 bio->iov_offset += *length; 3160 if (bio->iov_offset == iov->iov_len) { 3161 bio->iovpos++; 3162 bio->iov_offset = 0; 3163 } 3164 3165 return 0; 3166 } 3167 3168 static void 3169 
bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 3170 { 3171 struct nvme_bdev_io *bio = ref; 3172 struct iovec *iov; 3173 3174 bio->fused_iov_offset = sgl_offset; 3175 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 3176 iov = &bio->fused_iovs[bio->fused_iovpos]; 3177 if (bio->fused_iov_offset < iov->iov_len) { 3178 break; 3179 } 3180 3181 bio->fused_iov_offset -= iov->iov_len; 3182 } 3183 } 3184 3185 static int 3186 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 3187 { 3188 struct nvme_bdev_io *bio = ref; 3189 struct iovec *iov; 3190 3191 assert(bio->fused_iovpos < bio->fused_iovcnt); 3192 3193 iov = &bio->fused_iovs[bio->fused_iovpos]; 3194 3195 *address = iov->iov_base; 3196 *length = iov->iov_len; 3197 3198 if (bio->fused_iov_offset) { 3199 assert(bio->fused_iov_offset <= iov->iov_len); 3200 *address += bio->fused_iov_offset; 3201 *length -= bio->fused_iov_offset; 3202 } 3203 3204 bio->fused_iov_offset += *length; 3205 if (bio->fused_iov_offset == iov->iov_len) { 3206 bio->fused_iovpos++; 3207 bio->fused_iov_offset = 0; 3208 } 3209 3210 return 0; 3211 } 3212 3213 static int 3214 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3215 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3216 void *md, uint64_t lba_count, uint64_t lba) 3217 { 3218 int rc; 3219 3220 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 3221 lba_count, lba); 3222 3223 bio->iovs = iov; 3224 bio->iovcnt = iovcnt; 3225 bio->iovpos = 0; 3226 bio->iov_offset = 0; 3227 3228 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3229 bdev_nvme_no_pi_readv_done, bio, 0, 3230 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3231 md, 0, 0); 3232 3233 if (rc != 0 && rc != -ENOMEM) { 3234 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 3235 } 3236 return rc; 3237 } 3238 3239 static int 3240 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3241 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3242 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 3243 struct spdk_bdev_ext_io_opts *ext_opts) 3244 { 3245 int rc; 3246 3247 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3248 lba_count, lba); 3249 3250 bio->iovs = iov; 3251 bio->iovcnt = iovcnt; 3252 bio->iovpos = 0; 3253 bio->iov_offset = 0; 3254 3255 if (ext_opts) { 3256 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 3257 bio->ext_opts.memory_domain = ext_opts->memory_domain; 3258 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 3259 bio->ext_opts.io_flags = flags; 3260 bio->ext_opts.metadata = md; 3261 3262 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 3263 bdev_nvme_readv_done, bio, 3264 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3265 &bio->ext_opts); 3266 } else if (iovcnt == 1) { 3267 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 3268 lba_count, 3269 bdev_nvme_readv_done, bio, 3270 flags, 3271 0, 0); 3272 } else { 3273 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3274 bdev_nvme_readv_done, bio, flags, 3275 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3276 md, 0, 0); 3277 } 3278 3279 if (rc != 0 && rc != -ENOMEM) { 3280 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 3281 } 3282 return rc; 3283 } 3284 3285 static int 3286 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3287 struct nvme_bdev_io 
*bio,
		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (ext_opts) {
		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
		bio->ext_opts.memory_domain = ext_opts->memory_domain;
		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
		bio->ext_opts.io_flags = flags;
		bio->ext_opts.metadata = md;

		/* Use the write completion callback so PI errors are handled the same
		 * way as in the non-extended path.
		 */
		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
						 bdev_nvme_writev_done, bio,
						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						 &bio->ext_opts);
	} else if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
						    lba_count,
						    bdev_nvme_writev_done, bio,
						    flags,
						    0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
						     bdev_nvme_writev_done, bio, flags,
						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						     md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		       struct nvme_bdev_io *bio,
		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
		       uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
		      lba_count, zslba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (iovcnt == 1) {
		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
						       lba_count,
						       bdev_nvme_zone_appendv_done, bio,
						       flags,
						       0, 0);
	} else {
		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
							bdev_nvme_zone_appendv_done, bio, flags,
							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
							md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		   struct nvme_bdev_io *bio,
		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		   uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
					       bdev_nvme_comparev_done, bio, flags,
					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					       md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int
rc; 3401 3402 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3403 lba_count, lba); 3404 3405 bio->iovs = cmp_iov; 3406 bio->iovcnt = cmp_iovcnt; 3407 bio->iovpos = 0; 3408 bio->iov_offset = 0; 3409 bio->fused_iovs = write_iov; 3410 bio->fused_iovcnt = write_iovcnt; 3411 bio->fused_iovpos = 0; 3412 bio->fused_iov_offset = 0; 3413 3414 if (bdev_io->num_retries == 0) { 3415 bio->first_fused_submitted = false; 3416 } 3417 3418 if (!bio->first_fused_submitted) { 3419 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3420 memset(&bio->cpl, 0, sizeof(bio->cpl)); 3421 3422 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3423 bdev_nvme_comparev_and_writev_done, bio, flags, 3424 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 3425 if (rc == 0) { 3426 bio->first_fused_submitted = true; 3427 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3428 } else { 3429 if (rc != -ENOMEM) { 3430 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 3431 } 3432 return rc; 3433 } 3434 } 3435 3436 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 3437 3438 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3439 bdev_nvme_comparev_and_writev_done, bio, flags, 3440 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 3441 if (rc != 0 && rc != -ENOMEM) { 3442 SPDK_ERRLOG("write failed: rc = %d\n", rc); 3443 rc = 0; 3444 } 3445 3446 return rc; 3447 } 3448 3449 static int 3450 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3451 struct nvme_bdev_io *bio, 3452 uint64_t offset_blocks, 3453 uint64_t num_blocks) 3454 { 3455 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 3456 struct spdk_nvme_dsm_range *range; 3457 uint64_t offset, remaining; 3458 uint64_t num_ranges_u64; 3459 uint16_t num_ranges; 3460 int rc; 3461 3462 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 3463 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3464 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 3465 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 3466 return -EINVAL; 3467 } 3468 num_ranges = (uint16_t)num_ranges_u64; 3469 3470 offset = offset_blocks; 3471 remaining = num_blocks; 3472 range = &dsm_ranges[0]; 3473 3474 /* Fill max-size ranges until the remaining blocks fit into one range */ 3475 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 3476 range->attributes.raw = 0; 3477 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3478 range->starting_lba = offset; 3479 3480 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3481 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3482 range++; 3483 } 3484 3485 /* Final range describes the remaining blocks */ 3486 range->attributes.raw = 0; 3487 range->length = remaining; 3488 range->starting_lba = offset; 3489 3490 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 3491 SPDK_NVME_DSM_ATTR_DEALLOCATE, 3492 dsm_ranges, num_ranges, 3493 bdev_nvme_queued_done, bio); 3494 3495 return rc; 3496 } 3497 3498 static int 3499 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3500 struct nvme_bdev_io *bio, 3501 uint64_t offset_blocks, 3502 uint64_t num_blocks) 3503 { 3504 if (num_blocks > UINT16_MAX + 1) { 3505 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 3506 return -EINVAL; 3507 } 3508 3509 return spdk_nvme_ns_cmd_write_zeroes(ns, qpair, 3510 offset_blocks, num_blocks, 3511 bdev_nvme_queued_done, bio, 3512 
0); 3513 } 3514 3515 static int 3516 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3517 struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 3518 struct spdk_bdev_zone_info *info) 3519 { 3520 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3521 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3522 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 3523 3524 if (zone_id % zone_size != 0) { 3525 return -EINVAL; 3526 } 3527 3528 if (num_zones > total_zones || !num_zones) { 3529 return -EINVAL; 3530 } 3531 3532 assert(!bio->zone_report_buf); 3533 bio->zone_report_buf = calloc(1, zone_report_bufsize); 3534 if (!bio->zone_report_buf) { 3535 return -ENOMEM; 3536 } 3537 3538 bio->handled_zones = 0; 3539 3540 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 3541 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 3542 bdev_nvme_get_zone_info_done, bio); 3543 } 3544 3545 static int 3546 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3547 struct nvme_bdev_io *bio, uint64_t zone_id, 3548 enum spdk_bdev_zone_action action) 3549 { 3550 switch (action) { 3551 case SPDK_BDEV_ZONE_CLOSE: 3552 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 3553 bdev_nvme_zone_management_done, bio); 3554 case SPDK_BDEV_ZONE_FINISH: 3555 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 3556 bdev_nvme_zone_management_done, bio); 3557 case SPDK_BDEV_ZONE_OPEN: 3558 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 3559 bdev_nvme_zone_management_done, bio); 3560 case SPDK_BDEV_ZONE_RESET: 3561 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 3562 bdev_nvme_zone_management_done, bio); 3563 case SPDK_BDEV_ZONE_OFFLINE: 3564 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 3565 bdev_nvme_zone_management_done, bio); 3566 default: 3567 return -EINVAL; 3568 } 3569 } 3570 3571 static int 3572 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 3573 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3574 { 3575 struct nvme_ctrlr *nvme_ctrlr; 3576 uint32_t max_xfer_size; 3577 3578 if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) { 3579 return -EINVAL; 3580 } 3581 3582 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 3583 3584 if (nbytes > max_xfer_size) { 3585 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3586 return -EINVAL; 3587 } 3588 3589 bio->orig_thread = spdk_get_thread(); 3590 3591 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, 3592 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 3593 } 3594 3595 static int 3596 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3597 struct nvme_bdev_io *bio, 3598 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3599 { 3600 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3601 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3602 3603 if (nbytes > max_xfer_size) { 3604 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3605 return -EINVAL; 3606 } 3607 3608 /* 3609 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3610 * so fill it out automatically. 
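 * Passthru callers do not know which namespace this bdev maps to, so the nsid is taken
 * from the namespace handle.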
3611 */ 3612 cmd->nsid = spdk_nvme_ns_get_id(ns); 3613 3614 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 3615 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 3616 } 3617 3618 static int 3619 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3620 struct nvme_bdev_io *bio, 3621 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 3622 { 3623 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 3624 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3625 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3626 3627 if (nbytes > max_xfer_size) { 3628 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3629 return -EINVAL; 3630 } 3631 3632 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 3633 SPDK_ERRLOG("invalid meta data buffer size\n"); 3634 return -EINVAL; 3635 } 3636 3637 /* 3638 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3639 * so fill it out automatically. 3640 */ 3641 cmd->nsid = spdk_nvme_ns_get_id(ns); 3642 3643 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 3644 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 3645 } 3646 3647 static int 3648 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 3649 struct nvme_bdev_io *bio_to_abort) 3650 { 3651 struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch; 3652 int rc; 3653 3654 bio->orig_thread = spdk_get_thread(); 3655 3656 rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr, 3657 ctrlr_ch->qpair, 3658 bio_to_abort, 3659 bdev_nvme_abort_done, bio); 3660 if (rc == -ENOENT) { 3661 /* If no command was found in I/O qpair, the target command may be 3662 * admin command. 3663 */ 3664 rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr, 3665 NULL, 3666 bio_to_abort, 3667 bdev_nvme_abort_done, bio); 3668 } 3669 3670 if (rc == -ENOENT) { 3671 /* If no command was found, complete the abort request with failure. 
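 * Bit 0 of cdw0 set to 1 is how an NVMe Abort completion reports that the command was
 * not aborted, so spdk_nvme_cpl_is_abort_success() will treat this as a failed abort.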
*/ 3672 bio->cpl.cdw0 |= 1U; 3673 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3674 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3675 3676 bdev_nvme_abort_completion(bio); 3677 3678 rc = 0; 3679 } 3680 3681 return rc; 3682 } 3683 3684 static void 3685 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 3686 { 3687 const char *action; 3688 3689 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 3690 action = "reset"; 3691 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 3692 action = "abort"; 3693 } else { 3694 action = "none"; 3695 } 3696 3697 spdk_json_write_object_begin(w); 3698 3699 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 3700 3701 spdk_json_write_named_object_begin(w, "params"); 3702 spdk_json_write_named_string(w, "action_on_timeout", action); 3703 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 3704 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 3705 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 3706 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 3707 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 3708 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 3709 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 3710 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 3711 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 3712 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 3713 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3714 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3715 spdk_json_write_object_end(w); 3716 3717 spdk_json_write_object_end(w); 3718 } 3719 3720 static void 3721 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 3722 struct nvme_ctrlr *nvme_ctrlr) 3723 { 3724 struct spdk_nvme_transport_id *trid; 3725 3726 trid = nvme_ctrlr->connected_trid; 3727 3728 spdk_json_write_object_begin(w); 3729 3730 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3731 3732 spdk_json_write_named_object_begin(w, "params"); 3733 spdk_json_write_named_string(w, "name", nvme_ctrlr->name); 3734 nvme_bdev_dump_trid_json(trid, w); 3735 spdk_json_write_named_bool(w, "prchk_reftag", 3736 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3737 spdk_json_write_named_bool(w, "prchk_guard", 3738 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3739 3740 spdk_json_write_object_end(w); 3741 3742 spdk_json_write_object_end(w); 3743 } 3744 3745 static void 3746 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 3747 { 3748 spdk_json_write_object_begin(w); 3749 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3750 3751 spdk_json_write_named_object_begin(w, "params"); 3752 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3753 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 3754 spdk_json_write_object_end(w); 3755 3756 spdk_json_write_object_end(w); 3757 } 3758 3759 static int 3760 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3761 { 3762 struct nvme_ctrlr *nvme_ctrlr; 3763 3764 bdev_nvme_opts_config_json(w); 3765 3766 pthread_mutex_lock(&g_bdev_nvme_mutex); 3767 3768 TAILQ_FOREACH(nvme_ctrlr, 
&g_nvme_ctrlrs, tailq) {
		nvme_ctrlr_config_json(w, nvme_ctrlr);
	}

	/* Dump as last parameter to give all NVMe bdevs chance to be constructed
	 * before enabling hotplug poller.
	 */
	bdev_nvme_hotplug_config_json(w);

	pthread_mutex_unlock(&g_bdev_nvme_mutex);
	return 0;
}

struct spdk_nvme_ctrlr *
bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
{
	if (!bdev || bdev->module != &nvme_if) {
		return NULL;
	}

	return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr;
}

SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)