/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

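/* Per-I/O context for this module. The bdev layer reserves space for this structure in
 * every spdk_bdev_io it hands to bdev_nvme (see bdev_nvme_get_ctx_size()).
 */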
struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** array of iovecs to transfer. */
	struct iovec *fused_iovs;

	/** Number of iovecs in iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;
	/** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
	struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;
};

struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			   struct nvme_bdev_io *bio,
			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			    struct nvme_bdev_io *bio,
			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts);
static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				  struct nvme_bdev_io *bio,
				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio,
			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
				   struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				     struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);

typedef void (*populate_namespace_fn)(struct nvme_ctrlr *nvme_ctrlr,
				      struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
};

typedef void (*depopulate_namespace_fn)(struct nvme_ns *nvme_ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
};

typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
		struct nvme_ns *nvme_ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_ns *nvme_ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
};

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

static inline bool
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
{
	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
		/* The device is currently resetting. */
		return false;
	}

	*_ns = nbdev_ch->nvme_ns->ns;
	*_qpair = nbdev_ch->ctrlr_ch->qpair;
	return true;
}

static inline bool
bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
			  struct nvme_ctrlr **_nvme_ctrlr)
{
	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
					  cpl->status.sct, cpl->status.sc);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	enum spdk_bdev_io_status io_status;

	if (rc == 0) {
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else if (rc == -ENOMEM) {
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
	} else {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
}

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
	/*
	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
	 * reconnect a qpair and we will stop getting a callback for this one.
	 */
	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	if (rc != 0) {
		SPDK_DEBUGLOG(bdev_nvme, "Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

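/* Admin queue poller. A negative return from spdk_nvme_ctrlr_process_admin_completions()
 * indicates the controller has failed, so trigger a failover/reset to recover it.
 */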
static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		bdev_nvme_failover(nvme_ctrlr, false);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;

	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

	nvme_ns->bdev = NULL;

	if (!nvme_ns->populated) {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

		nvme_ctrlr_release(nvme_ns->ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
	}

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	bdev_nvme_io_complete(bio, 0);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.async_mode = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	assert(ctrlr_ch->group != NULL);

	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	ctrlr_ch->qpair = qpair;

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	if (ctrlr_ch->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
		ctrlr_ch->qpair = NULL;
	}
}

static void
_bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct_after_reset) {
		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
		pthread_mutex_unlock(&nvme_ctrlr->mutex);

		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
				     nvme_ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	}
}

static void
bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
}

static void
_bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
				   enum spdk_bdev_io_status status)
{
	struct spdk_bdev_io *bdev_io;

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);

	spdk_for_each_channel_continue(i, 0);
}

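/* Final step of a controller reset: clear the resetting/failover flags, mark the
 * connected trid as failed or healthy, notify the reset requester, then complete or
 * abort any resets queued on the I/O channels before checking for a deferred destruct.
 */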
static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
{
	struct nvme_ctrlr_trid *curr_trid;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (rc) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;
	nvme_ctrlr->failover_in_progress = false;

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid != NULL);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);

	curr_trid->is_failed = rc != 0 ? true : false;

	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
		/* Destruct ctrlr after clearing pending resets. */
		nvme_ctrlr->destruct_after_reset = true;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, rc);
	}

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			      rc == 0 ? bdev_nvme_complete_pending_resets :
			      bdev_nvme_abort_pending_resets,
			      NULL,
			      bdev_nvme_check_pending_destruct);
}

static void
bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	bdev_nvme_reset_complete(nvme_ctrlr, status);
}

static void
bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(ctrlr_ch);

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_ctrlr_reset_poll(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
	if (rc == -EAGAIN) {
		return SPDK_POLLER_BUSY;
	}

	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
	if (rc == 0) {
		/* Recreate all of the I/O queue pairs */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_create_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_done);
	} else {
		bdev_nvme_reset_complete(nvme_ctrlr, rc);
	}
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	int rc;

	if (status) {
		rc = status;
		goto err;
	}

	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Create controller reset context failed\n");
		goto err;
	}
	assert(nvme_ctrlr->reset_poller == NULL);
	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
				   nvme_ctrlr, 0);

	return;

err:
	bdev_nvme_reset_complete(nvme_ctrlr, rc);
}

static void
bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);

	bdev_nvme_destroy_qpair(ctrlr_ch);
	spdk_for_each_channel_continue(i, 0);
}

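/* Kick off a controller reset: destroy the I/O qpair on every channel first, then reset
 * the controller asynchronously and recreate the qpairs when it completes. Returns
 * -EBUSY if the controller is being destructed and -EAGAIN if a reset is already in
 * progress.
 */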
static int
bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return -EBUSY;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EAGAIN;
	}

	nvme_ctrlr->resetting = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_reset_destroy_qpair,
			      NULL,
			      bdev_nvme_reset_ctrlr);

	return 0;
}

int
bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		nvme_ctrlr->reset_cb_fn = cb_fn;
		nvme_ctrlr->reset_cb_arg = cb_arg;
	}
	return rc;
}

static void
bdev_nvme_reset_io_complete(void *cb_arg, int rc)
{
	struct nvme_bdev_io *bio = cb_arg;

	bdev_nvme_io_complete(bio, rc);
}

static int
bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
{
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
	if (rc == 0) {
		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
		ctrlr_ch->ctrlr->reset_cb_arg = bio;
	} else if (rc == -EAGAIN) {
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		bdev_io = spdk_bdev_io_from_ctx(bio);
		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

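/* Prepare a failover: mark the current path as failed and, if an alternate trid is
 * available, switch the controller to it. The old trid is either rotated to the tail of
 * the list (round robin) or freed when 'remove' is set.
 */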
static int
bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_ctrlr->resetting) {
		if (next_trid && !nvme_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return 0;
}

static int
bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	int rc;

	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
	if (rc == 0) {
		/* First, delete all NVMe I/O queue pairs. */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_destroy_qpair,
				      NULL,
				      bdev_nvme_reset_ctrlr);
	} else if (rc != -EBUSY) {
		return rc;
	}

	return 0;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static int
bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		       struct nvme_bdev_io *bio,
		       uint64_t offset_blocks,
		       uint64_t num_blocks);

static void
bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		     bool success)
{
	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int ret;

	if (!success) {
		ret = -EINVAL;
		goto exit;
	}

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		ret = -ENXIO;
		goto exit;
	}

	ret = bdev_nvme_readv(ns,
			      qpair,
			      bio,
			      bdev_io->u.bdev.iovs,
			      bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.md_buf,
			      bdev_io->u.bdev.num_blocks,
			      bdev_io->u.bdev.offset_blocks,
			      bdev->dif_check_flags,
			      bdev_io->internal.ext_opts);

exit:
	if (spdk_unlikely(ret != 0)) {
		bdev_nvme_io_complete(bio, ret);
	}
}

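/* bdev-layer entry point for I/O submission: translate the generic spdk_bdev_io into
 * the matching NVMe command helper based on bdev_io->type. Submission errors are
 * reported through bdev_nvme_io_complete().
 */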
static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int rc = 0;

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		rc = -ENXIO;
		goto exit;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			rc = bdev_nvme_readv(ns,
					     qpair,
					     nbdev_io,
					     bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt,
					     bdev_io->u.bdev.md_buf,
					     bdev_io->u.bdev.num_blocks,
					     bdev_io->u.bdev.offset_blocks,
					     bdev->dif_check_flags,
					     bdev_io->internal.ext_opts);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
			rc = 0;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		rc = bdev_nvme_writev(ns,
				      qpair,
				      nbdev_io,
				      bdev_io->u.bdev.iovs,
				      bdev_io->u.bdev.iovcnt,
				      bdev_io->u.bdev.md_buf,
				      bdev_io->u.bdev.num_blocks,
				      bdev_io->u.bdev.offset_blocks,
				      bdev->dif_check_flags,
				      bdev_io->internal.ext_opts);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE:
		rc = bdev_nvme_comparev(ns,
					qpair,
					nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		rc = bdev_nvme_comparev_and_writev(ns,
						   qpair,
						   nbdev_io,
						   bdev_io->u.bdev.iovs,
						   bdev_io->u.bdev.iovcnt,
						   bdev_io->u.bdev.fused_iovs,
						   bdev_io->u.bdev.fused_iovcnt,
						   bdev_io->u.bdev.md_buf,
						   bdev_io->u.bdev.num_blocks,
						   bdev_io->u.bdev.offset_blocks,
						   bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = bdev_nvme_unmap(ns,
				     qpair,
				     nbdev_io,
				     bdev_io->u.bdev.offset_blocks,
				     bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = bdev_nvme_write_zeroes(ns, qpair,
					    nbdev_io,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = bdev_nvme_flush(ns,
				     qpair,
				     nbdev_io,
				     bdev_io->u.bdev.offset_blocks,
				     bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = bdev_nvme_zone_appendv(ns,
					    qpair,
					    nbdev_io,
					    bdev_io->u.bdev.iovs,
					    bdev_io->u.bdev.iovcnt,
					    bdev_io->u.bdev.md_buf,
					    bdev_io->u.bdev.num_blocks,
					    bdev_io->u.bdev.offset_blocks,
					    bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = bdev_nvme_get_zone_info(ns,
					     qpair,
					     nbdev_io,
					     bdev_io->u.zone_mgmt.zone_id,
					     bdev_io->u.zone_mgmt.num_zones,
					     bdev_io->u.zone_mgmt.buf);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = bdev_nvme_zone_management(ns,
					       qpair,
					       nbdev_io,
					       bdev_io->u.zone_mgmt.zone_id,
					       bdev_io->u.zone_mgmt.zone_action);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		rc = bdev_nvme_admin_passthru(nbdev_ch,
					      nbdev_io,
					      &bdev_io->u.nvme_passthru.cmd,
					      bdev_io->u.nvme_passthru.buf,
					      bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO:
		rc = bdev_nvme_io_passthru(ns,
					   qpair,
					   nbdev_io,
					   &bdev_io->u.nvme_passthru.cmd,
					   bdev_io->u.nvme_passthru.buf,
					   bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		rc = bdev_nvme_io_passthru_md(ns,
					      qpair,
					      nbdev_io,
					      &bdev_io->u.nvme_passthru.cmd,
					      bdev_io->u.nvme_passthru.buf,
					      bdev_io->u.nvme_passthru.nbytes,
					      bdev_io->u.nvme_passthru.md_buf,
					      bdev_io->u.nvme_passthru.md_len);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		rc = bdev_nvme_abort(nbdev_ch,
				     nbdev_io,
				     nbdev_io_to_abort);
		break;
	default:
		rc = -EINVAL;
		break;
	}

exit:
	if (spdk_unlikely(rc != 0)) {
		bdev_nvme_io_complete(nbdev_io, rc);
	}
}

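/* Report which bdev I/O types this namespace can service, based on namespace
 * capabilities (compare support, metadata size, CSI) and controller ONCS bits/flags.
 */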
static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	nvme_ns = nbdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return spdk_nvme_ns_supports_compare(ns);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return spdk_nvme_ns_get_md_size(ns) ? true : false;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.dsm;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.write_zeroes;

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
			return true;
		}
		return false;

	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;

	default:
		return false;
	}
}

static int
bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
	struct spdk_io_channel *pg_ch;
	int rc;

	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
	if (!pg_ch) {
		return -1;
	}

	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);

#ifdef SPDK_CONFIG_VTUNE
	ctrlr_ch->group->collect_spin_stat = true;
#else
	ctrlr_ch->group->collect_spin_stat = false;
#endif

	TAILQ_INIT(&ctrlr_ch->pending_resets);

	ctrlr_ch->ctrlr = nvme_ctrlr;

	rc = bdev_nvme_create_qpair(ctrlr_ch);
	if (rc != 0) {
		goto err_qpair;
	}

	return 0;

err_qpair:
	spdk_put_io_channel(pg_ch);

	return rc;
}

static void
bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;

	assert(ctrlr_ch->group != NULL);

	bdev_nvme_destroy_qpair(ctrlr_ch);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
}

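/* Hook handed to the NVMe driver (via g_bdev_nvme_accel_fn_table below) so that CRC-32C
 * computation can be offloaded to the accel framework through the poll group's accel
 * channel.
 */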
static void
bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
			      uint32_t iov_cnt, uint32_t seed,
			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct nvme_poll_group *group = ctx;
	int rc;

	assert(group->accel_channel != NULL);
	assert(cb_fn != NULL);

	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
	if (rc) {
		/* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */
		if (rc == -ENOMEM || rc == -EINVAL) {
			cb_fn(cb_arg, rc);
		}
		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
	}
}

static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
	.table_size = sizeof(struct spdk_nvme_accel_fn_table),
	.submit_accel_crc32c = bdev_nvme_submit_accel_crc32c,
};

static int
bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
	if (group->group == NULL) {
		return -1;
	}

	group->accel_channel = spdk_accel_engine_get_io_channel();
	if (!group->accel_channel) {
		spdk_nvme_poll_group_destroy(group->group);
		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
			    group);
		return -1;
	}

	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	if (group->poller == NULL) {
		spdk_put_io_channel(group->accel_channel);
		spdk_nvme_poll_group_destroy(group->group);
		return -1;
	}

	return 0;
}

static void
bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	if (group->accel_channel) {
		spdk_put_io_channel(group->accel_channel);
	}

	spdk_poller_unregister(&group->poller);
	if (spdk_nvme_poll_group_destroy(group->group)) {
		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
		assert(false);
	}
}

static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return spdk_get_io_channel(nvme_bdev);
}

static void *
bdev_nvme_get_module_ctx(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
}

static const char *
_nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
{
	switch (ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
		return "optimized";
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return "non_optimized";
	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
		return "inaccessible";
	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
		return "persistent_loss";
	case SPDK_NVME_ANA_CHANGE_STATE:
		return "change";
	default:
		return NULL;
	}
}

static int
bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
{
	struct nvme_bdev *nbdev = ctx;
	struct spdk_memory_domain *domain;

	domain = spdk_nvme_ctrlr_get_memory_domain(nbdev->nvme_ns->ctrlr->ctrlr);

	if (domain) {
		if (array_size > 0 && domains) {
			domains[0] = domain;
		}
		return 1;
	}

	return 0;
}

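/* Emit the per-bdev "nvme" JSON object (transport ID, controller data, version, status
 * and namespace data) used in bdev info dumps.
 */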
static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct nvme_bdev *nvme_bdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	union spdk_nvme_vs_register vs;
	union spdk_nvme_csts_register csts;
	char buf[128];

	nvme_ns = nvme_bdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);

	spdk_json_write_named_object_begin(w, "nvme");

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		spdk_json_write_named_string(w, "pci_address", trid->traddr);
	}

	spdk_json_write_named_object_begin(w, "trid");

	nvme_bdev_dump_trid_json(trid, w);

	spdk_json_write_object_end(w);

#ifdef SPDK_CONFIG_NVME_CUSE
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
					    cuse_name, &cuse_name_size);
	if (rc == 0) {
		spdk_json_write_named_string(w, "cuse_device", cuse_name);
	}
#endif

	spdk_json_write_named_object_begin(w, "ctrlr_data");

	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);

	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "model_number", buf);

	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "serial_number", buf);

	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "firmware_revision", buf);

	if (cdata->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
	}

	spdk_json_write_named_object_begin(w, "oacs");

	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "vs");

	spdk_json_write_name(w, "nvme_version");
	if (vs.bits.ter) {
		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
	} else {
		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "csts");

	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "ns_data");

	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));

	if (cdata->cmic.ana_reporting) {
		spdk_json_write_named_string(w, "ana_state",
					     _nvme_ana_state_str(nvme_ns->ana_state));
	}

	spdk_json_write_object_end(w);

	if (cdata->oacs.security) {
		spdk_json_write_named_object_begin(w, "security");

		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);

		spdk_json_write_object_end(w);
	}

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static uint64_t
bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct nvme_poll_group *group = ctrlr_ch->group;
	uint64_t spin_time;

	if (!group || !group->collect_spin_stat) {
		return 0;
	}

	if (group->end_ticks != 0) {
		group->spin_ticks += (group->end_ticks - group->start_ticks);
		group->end_ticks = 0;
	}

	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
	group->start_ticks = 0;
	group->spin_ticks = 0;

	return spin_time;
}

static const struct spdk_bdev_fn_table nvmelib_fn_table = {
	.destruct		= bdev_nvme_destruct,
	.submit_request		= bdev_nvme_submit_request,
	.io_type_supported	= bdev_nvme_io_type_supported,
	.get_io_channel		= bdev_nvme_get_io_channel,
	.dump_info_json		= bdev_nvme_dump_info_json,
	.write_config_json	= bdev_nvme_write_config_json,
	.get_spin_time		= bdev_nvme_get_spin_time,
	.get_module_ctx		= bdev_nvme_get_module_ctx,
	.get_memory_domains	= bdev_nvme_get_memory_domains,
};

typedef int (*bdev_nvme_parse_ana_log_page_cb)(
	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);

static int
bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_ana_group_descriptor *copied_desc;
	uint8_t *orig_desc;
	uint32_t i, desc_size, copy_len;
	int rc = 0;

	if (nvme_ctrlr->ana_log_page == NULL) {
		return -EINVAL;
	}

	copied_desc = nvme_ctrlr->copied_ana_desc;

	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);

	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
		memcpy(copied_desc, orig_desc, copy_len);

		rc = cb_fn(copied_desc, cb_arg);
		if (rc != 0) {
			break;
		}

		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
			    copied_desc->num_of_nsid * sizeof(uint32_t);
		orig_desc += desc_size;
		copy_len -= desc_size;
	}

	return rc;
}

static int
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
{
	struct nvme_ns *nvme_ns = cb_arg;
	uint32_t i;

	for (i = 0; i < desc->num_of_nsid; i++) {
		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
			continue;
		}
		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
		return 1;
	}

	return 0;
}

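/* Fill in the spdk_bdev structure for a namespace: product name and zoned parameters by
 * command set, block count and sizes, UUID/NGUID, atomic write and PI attributes.
 */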
static int
nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
		 uint32_t prchk_flags, void *ctx)
{
	const struct spdk_uuid *uuid;
	const uint8_t *nguid;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_ns_data *nsdata;
	enum spdk_nvme_csi csi;
	uint32_t atomic_bs, phys_bs, bs;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	csi = spdk_nvme_ns_get_csi(ns);

	switch (csi) {
	case SPDK_NVME_CSI_NVM:
		disk->product_name = "NVMe disk";
		break;
	case SPDK_NVME_CSI_ZNS:
		disk->product_name = "NVMe ZNS disk";
		disk->zoned = true;
		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
					     spdk_nvme_ns_get_extended_sector_size(ns);
		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
		break;
	default:
		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
		return -ENOTSUP;
	}

	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
	if (!disk->name) {
		return -ENOMEM;
	}

	disk->write_cache = 0;
	if (cdata->vwc.present) {
		/* Enable if the Volatile Write Cache exists */
		disk->write_cache = 1;
	}
	if (cdata->oncs.write_zeroes) {
		disk->max_write_zeroes = UINT16_MAX + 1;
	}
	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);

	nguid = spdk_nvme_ns_get_nguid(ns);
	if (!nguid) {
		uuid = spdk_nvme_ns_get_uuid(ns);
		if (uuid) {
			disk->uuid = *uuid;
		}
	} else {
		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
	}

	nsdata = spdk_nvme_ns_get_data(ns);
	bs = spdk_nvme_ns_get_sector_size(ns);
	atomic_bs = bs;
	phys_bs = bs;
	if (nsdata->nabo == 0) {
		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
			atomic_bs = bs * (1 + nsdata->nawupf);
		} else {
			atomic_bs = bs * (1 + cdata->awupf);
		}
	}
	if (nsdata->nsfeat.optperf) {
		phys_bs = bs * (1 + nsdata->npwg);
	}
	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);

	disk->md_len = spdk_nvme_ns_get_md_size(ns);
	if (disk->md_len != 0) {
		disk->md_interleave = nsdata->flbas.extended;
		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
		if (disk->dif_type != SPDK_DIF_DISABLE) {
			disk->dif_is_head_of_md = nsdata->dps.md_start;
			disk->dif_check_flags = prchk_flags;
		}
	}

	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
		disk->acwu = 0;
	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
		disk->acwu = nsdata->nacwu;
	} else {
		disk->acwu = cdata->acwu;
	}

	disk->ctxt = ctx;
	disk->fn_table = &nvmelib_fn_table;
	disk->module = &nvme_if;

	return 0;
}

static int
nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;
	int rc;

	bdev = calloc(1, sizeof(*bdev));
	if (!bdev) {
		SPDK_ERRLOG("bdev calloc() failed\n");
		return -ENOMEM;
	}

	bdev->nvme_ns = nvme_ns;
	bdev->opal = nvme_ctrlr->opal_dev != NULL;

	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create NVMe disk\n");
		free(bdev);
		return rc;
	}

	spdk_io_device_register(bdev,
				bdev_nvme_create_bdev_channel_cb,
				bdev_nvme_destroy_bdev_channel_cb,
				sizeof(struct nvme_bdev_channel),
				bdev->disk.name);

	rc = spdk_bdev_register(&bdev->disk);
	if (rc != 0) {
		SPDK_ERRLOG("spdk_bdev_register() failed\n");
		spdk_io_device_unregister(bdev, NULL);
		free(bdev->disk.name);
		free(bdev);
		return rc;
	}

	nvme_ns->bdev = bdev;

	return 0;
}

static bool
bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
{
	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
	const struct spdk_uuid *uuid1, *uuid2;

	nsdata1 = spdk_nvme_ns_get_data(ns1);
	nsdata2 = spdk_nvme_ns_get_data(ns2);
	uuid1 = spdk_nvme_ns_get_uuid(ns1);
	uuid2 = spdk_nvme_ns_get_uuid(ns2);

	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
	       nsdata1->eui64 == nsdata2->eui64 &&
	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
}

static void
nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
				       struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct spdk_nvme_ns *ns;
	int rc = 0;

	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
	if (!ns) {
		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
		rc = -EINVAL;
		goto done;
	}

	nvme_ns->ns = ns;
	nvme_ns->populated = true;
	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;

	if (nvme_ctrlr->ana_log_page != NULL) {
		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
	}

	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
done:
	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
}

static bool
hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
		 struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_probe_skip_entry *entry;

	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
			return false;
		}
	}

	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
	opts->disable_read_ana_log_page = true;

	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);

	return true;
}

static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
			     cpl->status.sct);
		bdev_nvme_reset(nvme_ctrlr);
	} else if (cpl->cdw0 & 0x1) {
		SPDK_WARNLOG("Specified command could not be aborted.\n");
		bdev_nvme_reset(nvme_ctrlr);
	}
}

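/* Command timeout handler registered with the driver: reset immediately on a fatal
 * controller status, otherwise apply the configured action (abort the command, reset
 * the controller, or do nothing).
 */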
static void
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
	   struct spdk_nvme_qpair *qpair, uint16_t cid)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	union spdk_nvme_csts_register csts;
	int rc;

	assert(nvme_ctrlr->ctrlr == ctrlr);

	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);

	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
	 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
	 * completion recursively.
	 */
	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
		if (csts.bits.cfs) {
			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
			bdev_nvme_reset(nvme_ctrlr);
			return;
		}
	}

	switch (g_opts.action_on_timeout) {
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
		if (qpair) {
			/* Don't send abort to ctrlr when reset is running. */
			pthread_mutex_lock(&nvme_ctrlr->mutex);
			if (nvme_ctrlr->resetting) {
				pthread_mutex_unlock(&nvme_ctrlr->mutex);
				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
				return;
			}
			pthread_mutex_unlock(&nvme_ctrlr->mutex);

			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
						       nvme_abort_cpl, nvme_ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
		}

	/* FALLTHROUGH */
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
		bdev_nvme_reset(nvme_ctrlr);
		break;
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
		break;
	default:
		SPDK_ERRLOG("An invalid timeout action value is found.\n");
		break;
	}
}

static void
nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;

	bdev = nvme_ns->bdev;
	if (bdev != NULL) {
		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
	}

	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
}

static void
nvme_ctrlr_populate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns,
			      struct nvme_async_probe_ctx *ctx)
{
	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
}

static void
nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns)
{
	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
}

void
nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
				   struct nvme_ns *nvme_ns, int rc)
{
	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;

	assert(nvme_ctrlr != NULL);

	if (rc == 0) {
		pthread_mutex_lock(&nvme_ctrlr->mutex);
		nvme_ctrlr->ref++;
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	} else {
		memset(nvme_ns, 0, sizeof(*nvme_ns));
	}

	if (ctx) {
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
		}
	}
}

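/* Walk every namespace ID on the controller: resize bdevs whose capacity changed,
 * create bdevs for newly active namespaces, and depopulate namespaces that went away.
 */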
static void
nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
			       struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct nvme_bdev *bdev;
	uint32_t i;
	int rc;
	uint64_t num_sectors;
	bool ns_is_active;

	if (ctx) {
		/* Initialize this count to 1 to handle the populate functions
		 * calling nvme_ctrlr_populate_namespace_done() immediately.
		 */
		ctx->populates_in_progress = 1;
	}

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[i];
		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);

		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_NS_STANDARD) {
			/* NS is still there but attributes may have changed */
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
			bdev = nvme_ns->bdev;
			assert(bdev != NULL);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
					       nsid,
					       bdev->disk.name,
					       bdev->disk.blockcnt,
					       num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
						    bdev->disk.name, rc);
				}
			}
		}

		if (!nvme_ns->populated && ns_is_active) {
			nvme_ns->id = nsid;
			nvme_ns->ctrlr = nvme_ctrlr;
			nvme_ns->type = NVME_NS_STANDARD;

			nvme_ns->bdev = NULL;

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns, ctx);
		}

		if (nvme_ns->populated && !ns_is_active) {
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
		 */
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
		}
	}

}

static void
nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t i;
	struct nvme_ns *nvme_ns;

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		if (nvme_ns->populated) {
			assert(nvme_ns->id == nsid);
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}
	}
}

static bool
nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return false;
	}
	nvme_ctrlr->ref++;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return true;
}

static int
nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
			  void *cb_arg)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	struct nvme_ns *nvme_ns;
	uint32_t i, nsid;

	for (i = 0; i < desc->num_of_nsid; i++) {
		nsid = desc->nsid[i];
		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
			continue;
		}

		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		assert(nvme_ns != NULL);

		if (!nvme_ns->populated) {
			continue;
		}

		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
	}

	return 0;
}

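/* Refresh ANA states: issue a Get Log Page for the ANA log and, on success, update the
 * ANA group ID and state of every populated namespace.
 */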
nvme_ctrlr_set_ana_states, 1930 nvme_ctrlr); 1931 } 1932 1933 nvme_ctrlr_release(nvme_ctrlr); 1934 } 1935 1936 static void 1937 nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr) 1938 { 1939 int rc; 1940 1941 if (nvme_ctrlr->ana_log_page == NULL) { 1942 return; 1943 } 1944 1945 if (!nvme_ctrlr_acquire(nvme_ctrlr)) { 1946 return; 1947 } 1948 1949 rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr, 1950 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 1951 SPDK_NVME_GLOBAL_NS_TAG, 1952 nvme_ctrlr->ana_log_page, 1953 nvme_ctrlr->ana_log_page_size, 0, 1954 nvme_ctrlr_read_ana_log_page_done, 1955 nvme_ctrlr); 1956 if (rc != 0) { 1957 nvme_ctrlr_release(nvme_ctrlr); 1958 } 1959 } 1960 1961 static void 1962 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1963 { 1964 struct nvme_ctrlr *nvme_ctrlr = arg; 1965 union spdk_nvme_async_event_completion event; 1966 1967 if (spdk_nvme_cpl_is_error(cpl)) { 1968 SPDK_WARNLOG("AER request execute failed"); 1969 return; 1970 } 1971 1972 event.raw = cpl->cdw0; 1973 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1974 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1975 nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL); 1976 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1977 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) { 1978 nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 1979 } 1980 } 1981 1982 static void 1983 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1984 { 1985 if (ctx->cb_fn) { 1986 ctx->cb_fn(ctx->cb_ctx, count, rc); 1987 } 1988 1989 ctx->namespaces_populated = true; 1990 if (ctx->probe_done) { 1991 /* The probe was already completed, so we need to free the context 1992 * here. This can happen for cases like OCSSD, where we need to 1993 * send additional commands to the SSD after attach. 
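		 * If the probe has not completed yet, bdev_nvme_async_poll() will see
		 * namespaces_populated set when the probe finishes and free the context
		 * there instead.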
1994 */ 1995 free(ctx); 1996 } 1997 } 1998 1999 static void 2000 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 2001 struct nvme_async_probe_ctx *ctx) 2002 { 2003 spdk_io_device_register(nvme_ctrlr, 2004 bdev_nvme_create_ctrlr_channel_cb, 2005 bdev_nvme_destroy_ctrlr_channel_cb, 2006 sizeof(struct nvme_ctrlr_channel), 2007 nvme_ctrlr->name); 2008 2009 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 2010 } 2011 2012 static void 2013 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 2014 { 2015 struct nvme_ctrlr *nvme_ctrlr = _ctx; 2016 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 2017 2018 nvme_ctrlr->probe_ctx = NULL; 2019 2020 if (spdk_nvme_cpl_is_error(cpl)) { 2021 nvme_ctrlr_delete(nvme_ctrlr); 2022 2023 if (ctx != NULL) { 2024 populate_namespaces_cb(ctx, 0, -1); 2025 } 2026 return; 2027 } 2028 2029 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 2030 } 2031 2032 static int 2033 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2034 struct nvme_async_probe_ctx *ctx) 2035 { 2036 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2037 const struct spdk_nvme_ctrlr_data *cdata; 2038 uint32_t ana_log_page_size; 2039 2040 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2041 2042 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 2043 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn * 2044 sizeof(uint32_t); 2045 2046 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 2047 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2048 if (nvme_ctrlr->ana_log_page == NULL) { 2049 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 2050 return -ENXIO; 2051 } 2052 2053 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 2054 * Hence copy each descriptor to a temporary area when parsing it. 2055 * 2056 * Allocate a buffer whose size is as large as ANA log page buffer because 2057 * we do not know the size of a descriptor until actually reading it. 
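	 * A descriptor for a group with N namespaces occupies
	 * sizeof(struct spdk_nvme_ana_group_descriptor) + N * sizeof(uint32_t) bytes,
	 * so the full log page size computed above is an upper bound for any single
	 * descriptor.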
2058 */ 2059 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 2060 if (nvme_ctrlr->copied_ana_desc == NULL) { 2061 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 2062 return -ENOMEM; 2063 } 2064 2065 nvme_ctrlr->ana_log_page_size = ana_log_page_size; 2066 2067 nvme_ctrlr->probe_ctx = ctx; 2068 2069 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 2070 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 2071 SPDK_NVME_GLOBAL_NS_TAG, 2072 nvme_ctrlr->ana_log_page, 2073 nvme_ctrlr->ana_log_page_size, 0, 2074 nvme_ctrlr_init_ana_log_page_done, 2075 nvme_ctrlr); 2076 } 2077 2078 static int 2079 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 2080 const char *name, 2081 const struct spdk_nvme_transport_id *trid, 2082 uint32_t prchk_flags, 2083 struct nvme_async_probe_ctx *ctx) 2084 { 2085 struct nvme_ctrlr *nvme_ctrlr; 2086 struct nvme_ctrlr_trid *trid_entry; 2087 uint32_t i, num_ns; 2088 const struct spdk_nvme_ctrlr_data *cdata; 2089 int rc; 2090 2091 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 2092 if (nvme_ctrlr == NULL) { 2093 SPDK_ERRLOG("Failed to allocate device struct\n"); 2094 return -ENOMEM; 2095 } 2096 2097 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 2098 if (rc != 0) { 2099 free(nvme_ctrlr); 2100 return rc; 2101 } 2102 2103 TAILQ_INIT(&nvme_ctrlr->trids); 2104 2105 num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 2106 if (num_ns != 0) { 2107 nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *)); 2108 if (!nvme_ctrlr->namespaces) { 2109 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 2110 rc = -ENOMEM; 2111 goto err; 2112 } 2113 2114 for (i = 0; i < num_ns; i++) { 2115 nvme_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_ns)); 2116 if (nvme_ctrlr->namespaces[i] == NULL) { 2117 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 2118 rc = -ENOMEM; 2119 goto err; 2120 } 2121 nvme_ctrlr->num_ns++; 2122 } 2123 2124 assert(num_ns == nvme_ctrlr->num_ns); 2125 } 2126 2127 trid_entry = calloc(1, sizeof(*trid_entry)); 2128 if (trid_entry == NULL) { 2129 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 2130 rc = -ENOMEM; 2131 goto err; 2132 } 2133 2134 trid_entry->trid = *trid; 2135 nvme_ctrlr->connected_trid = &trid_entry->trid; 2136 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link); 2137 2138 nvme_ctrlr->thread = spdk_get_thread(); 2139 nvme_ctrlr->ctrlr = ctrlr; 2140 nvme_ctrlr->ref = 1; 2141 nvme_ctrlr->name = strdup(name); 2142 if (nvme_ctrlr->name == NULL) { 2143 rc = -ENOMEM; 2144 goto err; 2145 } 2146 2147 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 2148 SPDK_ERRLOG("OCSSDs are not supported"); 2149 rc = -ENOTSUP; 2150 goto err; 2151 } 2152 2153 nvme_ctrlr->prchk_flags = prchk_flags; 2154 2155 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 2156 g_opts.nvme_adminq_poll_period_us); 2157 2158 pthread_mutex_lock(&g_bdev_nvme_mutex); 2159 TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq); 2160 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2161 2162 if (g_opts.timeout_us > 0) { 2163 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 2164 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 2165 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
2166 g_opts.timeout_us : g_opts.timeout_admin_us; 2167 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 2168 adm_timeout_us, timeout_cb, nvme_ctrlr); 2169 } 2170 2171 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 2172 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 2173 2174 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2175 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 2176 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 2177 } 2178 2179 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2180 2181 if (cdata->cmic.ana_reporting) { 2182 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 2183 if (rc == 0) { 2184 return 0; 2185 } 2186 } else { 2187 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 2188 return 0; 2189 } 2190 2191 err: 2192 nvme_ctrlr_delete(nvme_ctrlr); 2193 return rc; 2194 } 2195 2196 static void 2197 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2198 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2199 { 2200 struct nvme_probe_ctx *ctx = cb_ctx; 2201 char *name = NULL; 2202 uint32_t prchk_flags = 0; 2203 size_t i; 2204 2205 if (ctx) { 2206 for (i = 0; i < ctx->count; i++) { 2207 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 2208 prchk_flags = ctx->prchk_flags[i]; 2209 name = strdup(ctx->names[i]); 2210 break; 2211 } 2212 } 2213 } else { 2214 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 2215 } 2216 if (!name) { 2217 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 2218 return; 2219 } 2220 2221 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 2222 2223 nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL); 2224 2225 free(name); 2226 } 2227 2228 static void 2229 _nvme_ctrlr_destruct(void *ctx) 2230 { 2231 struct nvme_ctrlr *nvme_ctrlr = ctx; 2232 2233 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 2234 nvme_ctrlr_release(nvme_ctrlr); 2235 } 2236 2237 static int 2238 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 2239 { 2240 struct nvme_probe_skip_entry *entry; 2241 2242 pthread_mutex_lock(&nvme_ctrlr->mutex); 2243 2244 /* The controller's destruction was already started */ 2245 if (nvme_ctrlr->destruct) { 2246 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2247 return 0; 2248 } 2249 2250 if (!hotplug && 2251 nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2252 entry = calloc(1, sizeof(*entry)); 2253 if (!entry) { 2254 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2255 return -ENOMEM; 2256 } 2257 entry->trid = *nvme_ctrlr->connected_trid; 2258 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2259 } 2260 2261 nvme_ctrlr->destruct = true; 2262 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2263 2264 _nvme_ctrlr_destruct(nvme_ctrlr); 2265 2266 return 0; 2267 } 2268 2269 static void 2270 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 2271 { 2272 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 2273 2274 _bdev_nvme_delete(nvme_ctrlr, true); 2275 } 2276 2277 static int 2278 bdev_nvme_hotplug_probe(void *arg) 2279 { 2280 if (g_hotplug_probe_ctx == NULL) { 2281 spdk_poller_unregister(&g_hotplug_probe_poller); 2282 return SPDK_POLLER_IDLE; 2283 } 2284 2285 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 2286 g_hotplug_probe_ctx = NULL; 2287 spdk_poller_unregister(&g_hotplug_probe_poller); 2288 } 2289 2290 return SPDK_POLLER_BUSY; 2291 } 2292 2293 static int 2294 bdev_nvme_hotplug(void *arg) 2295 { 2296 struct spdk_nvme_transport_id trid_pcie; 2297 2298 if (g_hotplug_probe_ctx) { 
2299 return SPDK_POLLER_BUSY; 2300 } 2301 2302 memset(&trid_pcie, 0, sizeof(trid_pcie)); 2303 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 2304 2305 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 2306 hotplug_probe_cb, attach_cb, NULL); 2307 2308 if (g_hotplug_probe_ctx) { 2309 assert(g_hotplug_probe_poller == NULL); 2310 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 2311 } 2312 2313 return SPDK_POLLER_BUSY; 2314 } 2315 2316 void 2317 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 2318 { 2319 *opts = g_opts; 2320 } 2321 2322 static int 2323 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 2324 { 2325 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 2326 /* Can't set timeout_admin_us without also setting timeout_us */ 2327 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 2328 return -EINVAL; 2329 } 2330 2331 return 0; 2332 } 2333 2334 int 2335 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 2336 { 2337 int ret = bdev_nvme_validate_opts(opts); 2338 if (ret) { 2339 SPDK_WARNLOG("Failed to set nvme opts.\n"); 2340 return ret; 2341 } 2342 2343 if (g_bdev_nvme_init_thread != NULL) { 2344 if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) { 2345 return -EPERM; 2346 } 2347 } 2348 2349 g_opts = *opts; 2350 2351 return 0; 2352 } 2353 2354 struct set_nvme_hotplug_ctx { 2355 uint64_t period_us; 2356 bool enabled; 2357 spdk_msg_fn fn; 2358 void *fn_ctx; 2359 }; 2360 2361 static void 2362 set_nvme_hotplug_period_cb(void *_ctx) 2363 { 2364 struct set_nvme_hotplug_ctx *ctx = _ctx; 2365 2366 spdk_poller_unregister(&g_hotplug_poller); 2367 if (ctx->enabled) { 2368 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 2369 } 2370 2371 g_nvme_hotplug_poll_period_us = ctx->period_us; 2372 g_nvme_hotplug_enabled = ctx->enabled; 2373 if (ctx->fn) { 2374 ctx->fn(ctx->fn_ctx); 2375 } 2376 2377 free(ctx); 2378 } 2379 2380 int 2381 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 2382 { 2383 struct set_nvme_hotplug_ctx *ctx; 2384 2385 if (enabled == true && !spdk_process_is_primary()) { 2386 return -EPERM; 2387 } 2388 2389 ctx = calloc(1, sizeof(*ctx)); 2390 if (ctx == NULL) { 2391 return -ENOMEM; 2392 } 2393 2394 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 2395 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 2396 ctx->enabled = enabled; 2397 ctx->fn = cb; 2398 ctx->fn_ctx = cb_ctx; 2399 2400 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 2401 return 0; 2402 } 2403 2404 static void 2405 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 2406 struct nvme_async_probe_ctx *ctx) 2407 { 2408 struct nvme_ns *nvme_ns; 2409 struct nvme_bdev *nvme_bdev; 2410 uint32_t i, nsid; 2411 size_t j; 2412 2413 assert(nvme_ctrlr != NULL); 2414 2415 /* 2416 * Report the new bdevs that were created in this call. 2417 * There can be more than one bdev per NVMe controller. 
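	 * Bdev names are copied into the caller-provided ctx->names array, up to
	 * ctx->count entries; if more bdevs were created than the array can hold,
	 * the callback is invoked with -ERANGE below.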
 */
	j = 0;
	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		nsid = i + 1;
		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		if (!nvme_ns->populated) {
			continue;
		}
		assert(nvme_ns->id == nsid);
		nvme_bdev = nvme_ns->bdev;
		if (nvme_bdev == NULL) {
			assert(nvme_ns->type == NVME_NS_OCSSD);
			continue;
		}
		if (j < ctx->count) {
			ctx->names[j] = nvme_bdev->disk.name;
			j++;
		} else {
			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
				    ctx->count);
			populate_namespaces_cb(ctx, 0, -ERANGE);
			return;
		}
	}

	populate_namespaces_cb(ctx, j, 0);
}

static int
bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
			struct spdk_nvme_ctrlr *new_ctrlr,
			struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr_trid *tmp_trid;

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		SPDK_ERRLOG("PCIe failover is not supported.\n");
		return -ENOTSUP;
	}

	/* Currently we only support failover to the same transport type. */
	if (nvme_ctrlr->connected_trid->trtype != trid->trtype) {
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			return -EEXIST;
		}
	}

	return 0;
}

static int
bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
			     struct spdk_nvme_ctrlr *new_ctrlr)
{
	uint32_t i, nsid;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) {
		return -EINVAL;
	}

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[i];
		if (!nvme_ns->populated) {
			continue;
		}

		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr_trid *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;
	new_trid->is_failed = false;

	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->is_failed) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* This is the case that a secondary path is added to an existing
 * nvme_ctrlr for failover. After checking if it can access the same
 * namespaces as the primary path, it is disconnected until failover occurs.
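 * The new controller handle is detached unconditionally below; only its
 * transport ID is kept in the trid list so that a later failover can
 * reconnect through this path.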
2536 */ 2537 static int 2538 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2539 struct spdk_nvme_ctrlr *new_ctrlr, 2540 struct spdk_nvme_transport_id *trid) 2541 { 2542 int rc; 2543 2544 assert(nvme_ctrlr != NULL); 2545 2546 pthread_mutex_lock(&nvme_ctrlr->mutex); 2547 2548 rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid); 2549 if (rc != 0) { 2550 goto exit; 2551 } 2552 2553 rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr); 2554 if (rc != 0) { 2555 goto exit; 2556 } 2557 2558 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 2559 2560 exit: 2561 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2562 2563 spdk_nvme_detach(new_ctrlr); 2564 2565 return rc; 2566 } 2567 2568 static void 2569 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2570 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2571 { 2572 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2573 struct nvme_async_probe_ctx *ctx; 2574 int rc; 2575 2576 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2577 ctx->ctrlr_attached = true; 2578 2579 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx); 2580 if (rc != 0) { 2581 populate_namespaces_cb(ctx, 0, rc); 2582 } 2583 } 2584 2585 static void 2586 connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2587 struct spdk_nvme_ctrlr *ctrlr, 2588 const struct spdk_nvme_ctrlr_opts *opts) 2589 { 2590 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2591 struct nvme_ctrlr *nvme_ctrlr; 2592 struct nvme_async_probe_ctx *ctx; 2593 int rc; 2594 2595 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2596 ctx->ctrlr_attached = true; 2597 2598 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 2599 if (nvme_ctrlr) { 2600 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 2601 } else { 2602 rc = -ENODEV; 2603 } 2604 2605 populate_namespaces_cb(ctx, 0, rc); 2606 } 2607 2608 static int 2609 bdev_nvme_async_poll(void *arg) 2610 { 2611 struct nvme_async_probe_ctx *ctx = arg; 2612 int rc; 2613 2614 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2615 if (spdk_unlikely(rc != -EAGAIN)) { 2616 ctx->probe_done = true; 2617 spdk_poller_unregister(&ctx->poller); 2618 if (!ctx->ctrlr_attached) { 2619 /* The probe is done, but no controller was attached. 2620 * That means we had a failure, so report -EIO back to 2621 * the caller (usually the RPC). populate_namespaces_cb() 2622 * will take care of freeing the nvme_async_probe_ctx. 2623 */ 2624 populate_namespaces_cb(ctx, 0, -EIO); 2625 } else if (ctx->namespaces_populated) { 2626 /* The namespaces for the attached controller were all 2627 * populated and the response was already sent to the 2628 * caller (usually the RPC). So free the context here. 2629 */ 2630 free(ctx); 2631 } 2632 } 2633 2634 return SPDK_POLLER_BUSY; 2635 } 2636 2637 int 2638 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2639 struct spdk_nvme_host_id *hostid, 2640 const char *base_name, 2641 const char **names, 2642 uint32_t count, 2643 uint32_t prchk_flags, 2644 spdk_bdev_create_nvme_fn cb_fn, 2645 void *cb_ctx, 2646 struct spdk_nvme_ctrlr_opts *opts) 2647 { 2648 struct nvme_probe_skip_entry *entry, *tmp; 2649 struct nvme_async_probe_ctx *ctx; 2650 spdk_nvme_attach_cb attach_cb; 2651 2652 /* TODO expand this check to include both the host and target TRIDs. 2653 * Only if both are the same should we fail. 
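	 * Today the lookup below matches on the target TRID only, so two attach
	 * requests that differ only in host ID are rejected even though they
	 * could coexist.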
2654 */ 2655 if (nvme_ctrlr_get(trid) != NULL) { 2656 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2657 return -EEXIST; 2658 } 2659 2660 ctx = calloc(1, sizeof(*ctx)); 2661 if (!ctx) { 2662 return -ENOMEM; 2663 } 2664 ctx->base_name = base_name; 2665 ctx->names = names; 2666 ctx->count = count; 2667 ctx->cb_fn = cb_fn; 2668 ctx->cb_ctx = cb_ctx; 2669 ctx->prchk_flags = prchk_flags; 2670 ctx->trid = *trid; 2671 2672 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2673 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2674 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2675 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2676 free(entry); 2677 break; 2678 } 2679 } 2680 } 2681 2682 if (opts) { 2683 memcpy(&ctx->opts, opts, sizeof(*opts)); 2684 } else { 2685 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2686 } 2687 2688 ctx->opts.transport_retry_count = g_opts.retry_count; 2689 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2690 ctx->opts.disable_read_ana_log_page = true; 2691 2692 if (hostid->hostaddr[0] != '\0') { 2693 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2694 } 2695 2696 if (hostid->hostsvcid[0] != '\0') { 2697 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2698 } 2699 2700 if (nvme_ctrlr_get_by_name(base_name) == NULL) { 2701 attach_cb = connect_attach_cb; 2702 } else { 2703 attach_cb = connect_set_failover_cb; 2704 } 2705 2706 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, attach_cb); 2707 if (ctx->probe_ctx == NULL) { 2708 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2709 free(ctx); 2710 return -ENODEV; 2711 } 2712 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2713 2714 return 0; 2715 } 2716 2717 static int 2718 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2719 const struct spdk_nvme_transport_id *trid) 2720 { 2721 struct nvme_ctrlr_trid *ctrlr_trid, *tmp_trid; 2722 2723 if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) { 2724 return -EBUSY; 2725 } 2726 2727 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) { 2728 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2729 TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link); 2730 free(ctrlr_trid); 2731 return 0; 2732 } 2733 } 2734 2735 return -ENXIO; 2736 } 2737 2738 int 2739 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2740 { 2741 struct nvme_ctrlr *nvme_ctrlr; 2742 struct nvme_ctrlr_trid *ctrlr_trid; 2743 2744 if (name == NULL) { 2745 return -EINVAL; 2746 } 2747 2748 nvme_ctrlr = nvme_ctrlr_get_by_name(name); 2749 if (nvme_ctrlr == NULL) { 2750 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2751 return -ENODEV; 2752 } 2753 2754 /* case 1: remove the controller itself. */ 2755 if (trid == NULL) { 2756 return _bdev_nvme_delete(nvme_ctrlr, false); 2757 } 2758 2759 /* case 2: we are currently using the path to be removed. */ 2760 if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) { 2761 ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids); 2762 assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid); 2763 /* case 2A: the current path is the only path. */ 2764 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2765 return _bdev_nvme_delete(nvme_ctrlr, false); 2766 } 2767 2768 /* case 2B: there is an alternative path. 
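		 * Fail over to the alternative path instead of deleting the
		 * whole controller.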
*/ 2769 return bdev_nvme_failover(nvme_ctrlr, true); 2770 } 2771 2772 /* case 3: We are not using the specified path. */ 2773 return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid); 2774 } 2775 2776 static int 2777 bdev_nvme_library_init(void) 2778 { 2779 g_bdev_nvme_init_thread = spdk_get_thread(); 2780 2781 spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb, 2782 bdev_nvme_destroy_poll_group_cb, 2783 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 2784 2785 return 0; 2786 } 2787 2788 static void 2789 bdev_nvme_library_fini(void) 2790 { 2791 struct nvme_ctrlr *nvme_ctrlr, *tmp; 2792 struct nvme_probe_skip_entry *entry, *entry_tmp; 2793 2794 spdk_poller_unregister(&g_hotplug_poller); 2795 free(g_hotplug_probe_ctx); 2796 g_hotplug_probe_ctx = NULL; 2797 2798 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2799 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2800 free(entry); 2801 } 2802 2803 pthread_mutex_lock(&g_bdev_nvme_mutex); 2804 TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) { 2805 pthread_mutex_lock(&nvme_ctrlr->mutex); 2806 if (nvme_ctrlr->destruct) { 2807 /* This controller's destruction was already started 2808 * before the application started shutting down 2809 */ 2810 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2811 continue; 2812 } 2813 nvme_ctrlr->destruct = true; 2814 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2815 2816 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 2817 nvme_ctrlr); 2818 } 2819 2820 g_bdev_nvme_module_finish = true; 2821 if (TAILQ_EMPTY(&g_nvme_ctrlrs)) { 2822 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2823 spdk_io_device_unregister(&g_nvme_ctrlrs, NULL); 2824 spdk_bdev_module_fini_done(); 2825 return; 2826 } 2827 2828 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2829 } 2830 2831 static void 2832 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 2833 { 2834 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2835 struct spdk_bdev *bdev = bdev_io->bdev; 2836 struct spdk_dif_ctx dif_ctx; 2837 struct spdk_dif_error err_blk = {}; 2838 int rc; 2839 2840 rc = spdk_dif_ctx_init(&dif_ctx, 2841 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2842 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2843 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2844 if (rc != 0) { 2845 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2846 return; 2847 } 2848 2849 if (bdev->md_interleave) { 2850 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2851 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2852 } else { 2853 struct iovec md_iov = { 2854 .iov_base = bdev_io->u.bdev.md_buf, 2855 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2856 }; 2857 2858 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2859 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2860 } 2861 2862 if (rc != 0) { 2863 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2864 err_blk.err_type, err_blk.err_offset); 2865 } else { 2866 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2867 } 2868 } 2869 2870 static void 2871 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2872 { 2873 struct nvme_bdev_io *bio = ref; 2874 2875 if (spdk_nvme_cpl_is_success(cpl)) { 2876 /* Run PI verification for read data buffer. 
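		 * The retried no-PI read landed in the original request's buffers, so
		 * the DIF/DIX fields can be re-checked in software and the failing
		 * block logged.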
*/ 2877 bdev_nvme_verify_pi_error(bio); 2878 } 2879 2880 /* Return original completion status */ 2881 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2882 } 2883 2884 static void 2885 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2886 { 2887 struct nvme_bdev_io *bio = ref; 2888 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2889 struct nvme_bdev_channel *nbdev_ch; 2890 struct spdk_nvme_ns *ns; 2891 struct spdk_nvme_qpair *qpair; 2892 int ret; 2893 2894 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2895 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2896 cpl->status.sct, cpl->status.sc); 2897 2898 /* Save completion status to use after verifying PI error. */ 2899 bio->cpl = *cpl; 2900 2901 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2902 2903 if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) { 2904 /* Read without PI checking to verify PI error. */ 2905 ret = bdev_nvme_no_pi_readv(ns, 2906 qpair, 2907 bio, 2908 bdev_io->u.bdev.iovs, 2909 bdev_io->u.bdev.iovcnt, 2910 bdev_io->u.bdev.md_buf, 2911 bdev_io->u.bdev.num_blocks, 2912 bdev_io->u.bdev.offset_blocks); 2913 if (ret == 0) { 2914 return; 2915 } 2916 } 2917 } 2918 2919 bdev_nvme_io_complete_nvme_status(bio, cpl); 2920 } 2921 2922 static void 2923 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2924 { 2925 struct nvme_bdev_io *bio = ref; 2926 2927 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2928 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2929 cpl->status.sct, cpl->status.sc); 2930 /* Run PI verification for write data buffer if PI error is detected. */ 2931 bdev_nvme_verify_pi_error(bio); 2932 } 2933 2934 bdev_nvme_io_complete_nvme_status(bio, cpl); 2935 } 2936 2937 static void 2938 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2939 { 2940 struct nvme_bdev_io *bio = ref; 2941 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2942 2943 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 2944 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 2945 */ 2946 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 2947 2948 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2949 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 2950 cpl->status.sct, cpl->status.sc); 2951 /* Run PI verification for zone append data buffer if PI error is detected. */ 2952 bdev_nvme_verify_pi_error(bio); 2953 } 2954 2955 bdev_nvme_io_complete_nvme_status(bio, cpl); 2956 } 2957 2958 static void 2959 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2960 { 2961 struct nvme_bdev_io *bio = ref; 2962 2963 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2964 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2965 cpl->status.sct, cpl->status.sc); 2966 /* Run PI verification for compare data buffer if PI error is detected. 
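		 * As in the write path, the software verification result is only
		 * logged; the original completion status is still returned to the
		 * bdev layer below.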
*/ 2967 bdev_nvme_verify_pi_error(bio); 2968 } 2969 2970 bdev_nvme_io_complete_nvme_status(bio, cpl); 2971 } 2972 2973 static void 2974 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2975 { 2976 struct nvme_bdev_io *bio = ref; 2977 2978 /* Compare operation completion */ 2979 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2980 /* Save compare result for write callback */ 2981 bio->cpl = *cpl; 2982 return; 2983 } 2984 2985 /* Write operation completion */ 2986 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2987 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2988 * complete the IO with the compare operation's status. 2989 */ 2990 if (!spdk_nvme_cpl_is_error(cpl)) { 2991 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2992 } 2993 2994 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2995 } else { 2996 bdev_nvme_io_complete_nvme_status(bio, cpl); 2997 } 2998 } 2999 3000 static void 3001 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 3002 { 3003 struct nvme_bdev_io *bio = ref; 3004 3005 bdev_nvme_io_complete_nvme_status(bio, cpl); 3006 } 3007 3008 static int 3009 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 3010 { 3011 switch (desc->zs) { 3012 case SPDK_NVME_ZONE_STATE_EMPTY: 3013 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 3014 break; 3015 case SPDK_NVME_ZONE_STATE_IOPEN: 3016 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 3017 break; 3018 case SPDK_NVME_ZONE_STATE_EOPEN: 3019 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 3020 break; 3021 case SPDK_NVME_ZONE_STATE_CLOSED: 3022 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 3023 break; 3024 case SPDK_NVME_ZONE_STATE_RONLY: 3025 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 3026 break; 3027 case SPDK_NVME_ZONE_STATE_FULL: 3028 info->state = SPDK_BDEV_ZONE_STATE_FULL; 3029 break; 3030 case SPDK_NVME_ZONE_STATE_OFFLINE: 3031 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 3032 break; 3033 default: 3034 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 3035 return -EIO; 3036 } 3037 3038 info->zone_id = desc->zslba; 3039 info->write_pointer = desc->wp; 3040 info->capacity = desc->zcap; 3041 3042 return 0; 3043 } 3044 3045 static void 3046 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 3047 { 3048 struct nvme_bdev_io *bio = ref; 3049 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3050 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 3051 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3052 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 3053 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 3054 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 3055 uint64_t max_zones_per_buf, i; 3056 uint32_t zone_report_bufsize; 3057 struct spdk_nvme_ns *ns; 3058 struct spdk_nvme_qpair *qpair; 3059 int ret; 3060 3061 if (spdk_nvme_cpl_is_error(cpl)) { 3062 goto out_complete_io_nvme_cpl; 3063 } 3064 3065 if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) { 3066 ret = -ENXIO; 3067 goto out_complete_io_ret; 3068 } 3069 3070 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3071 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 3072 sizeof(bio->zone_report_buf->descs[0]); 3073 3074 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 3075 ret = -EINVAL; 3076 goto out_complete_io_ret; 3077 } 3078 3079 if (!bio->zone_report_buf->nr_zones) { 3080 ret = -EINVAL; 3081 goto 
out_complete_io_ret; 3082 } 3083 3084 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 3085 ret = fill_zone_from_report(&info[bio->handled_zones], 3086 &bio->zone_report_buf->descs[i]); 3087 if (ret) { 3088 goto out_complete_io_ret; 3089 } 3090 bio->handled_zones++; 3091 } 3092 3093 if (bio->handled_zones < zones_to_copy) { 3094 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3095 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 3096 3097 memset(bio->zone_report_buf, 0, zone_report_bufsize); 3098 ret = spdk_nvme_zns_report_zones(ns, qpair, 3099 bio->zone_report_buf, zone_report_bufsize, 3100 slba, SPDK_NVME_ZRA_LIST_ALL, true, 3101 bdev_nvme_get_zone_info_done, bio); 3102 if (!ret) { 3103 return; 3104 } else { 3105 goto out_complete_io_ret; 3106 } 3107 } 3108 3109 out_complete_io_nvme_cpl: 3110 free(bio->zone_report_buf); 3111 bio->zone_report_buf = NULL; 3112 bdev_nvme_io_complete_nvme_status(bio, cpl); 3113 return; 3114 3115 out_complete_io_ret: 3116 free(bio->zone_report_buf); 3117 bio->zone_report_buf = NULL; 3118 bdev_nvme_io_complete(bio, ret); 3119 } 3120 3121 static void 3122 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 3123 { 3124 struct nvme_bdev_io *bio = ref; 3125 3126 bdev_nvme_io_complete_nvme_status(bio, cpl); 3127 } 3128 3129 static void 3130 bdev_nvme_admin_passthru_completion(void *ctx) 3131 { 3132 struct nvme_bdev_io *bio = ctx; 3133 3134 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 3135 } 3136 3137 static void 3138 bdev_nvme_abort_completion(void *ctx) 3139 { 3140 struct nvme_bdev_io *bio = ctx; 3141 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3142 3143 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 3144 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3145 } else { 3146 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3147 } 3148 } 3149 3150 static void 3151 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 3152 { 3153 struct nvme_bdev_io *bio = ref; 3154 3155 bio->cpl = *cpl; 3156 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 3157 } 3158 3159 static void 3160 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 3161 { 3162 struct nvme_bdev_io *bio = ref; 3163 3164 bio->cpl = *cpl; 3165 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 3166 } 3167 3168 static void 3169 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 3170 { 3171 struct nvme_bdev_io *bio = ref; 3172 struct iovec *iov; 3173 3174 bio->iov_offset = sgl_offset; 3175 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 3176 iov = &bio->iovs[bio->iovpos]; 3177 if (bio->iov_offset < iov->iov_len) { 3178 break; 3179 } 3180 3181 bio->iov_offset -= iov->iov_len; 3182 } 3183 } 3184 3185 static int 3186 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 3187 { 3188 struct nvme_bdev_io *bio = ref; 3189 struct iovec *iov; 3190 3191 assert(bio->iovpos < bio->iovcnt); 3192 3193 iov = &bio->iovs[bio->iovpos]; 3194 3195 *address = iov->iov_base; 3196 *length = iov->iov_len; 3197 3198 if (bio->iov_offset) { 3199 assert(bio->iov_offset <= iov->iov_len); 3200 *address += bio->iov_offset; 3201 *length -= bio->iov_offset; 3202 } 3203 3204 bio->iov_offset += *length; 3205 if (bio->iov_offset == iov->iov_len) { 3206 bio->iovpos++; 3207 bio->iov_offset = 0; 3208 } 3209 3210 return 0; 3211 } 3212 3213 static void 3214 
bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 3215 { 3216 struct nvme_bdev_io *bio = ref; 3217 struct iovec *iov; 3218 3219 bio->fused_iov_offset = sgl_offset; 3220 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 3221 iov = &bio->fused_iovs[bio->fused_iovpos]; 3222 if (bio->fused_iov_offset < iov->iov_len) { 3223 break; 3224 } 3225 3226 bio->fused_iov_offset -= iov->iov_len; 3227 } 3228 } 3229 3230 static int 3231 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 3232 { 3233 struct nvme_bdev_io *bio = ref; 3234 struct iovec *iov; 3235 3236 assert(bio->fused_iovpos < bio->fused_iovcnt); 3237 3238 iov = &bio->fused_iovs[bio->fused_iovpos]; 3239 3240 *address = iov->iov_base; 3241 *length = iov->iov_len; 3242 3243 if (bio->fused_iov_offset) { 3244 assert(bio->fused_iov_offset <= iov->iov_len); 3245 *address += bio->fused_iov_offset; 3246 *length -= bio->fused_iov_offset; 3247 } 3248 3249 bio->fused_iov_offset += *length; 3250 if (bio->fused_iov_offset == iov->iov_len) { 3251 bio->fused_iovpos++; 3252 bio->fused_iov_offset = 0; 3253 } 3254 3255 return 0; 3256 } 3257 3258 static int 3259 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3260 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3261 void *md, uint64_t lba_count, uint64_t lba) 3262 { 3263 int rc; 3264 3265 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 3266 lba_count, lba); 3267 3268 bio->iovs = iov; 3269 bio->iovcnt = iovcnt; 3270 bio->iovpos = 0; 3271 bio->iov_offset = 0; 3272 3273 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3274 bdev_nvme_no_pi_readv_done, bio, 0, 3275 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3276 md, 0, 0); 3277 3278 if (rc != 0 && rc != -ENOMEM) { 3279 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 3280 } 3281 return rc; 3282 } 3283 3284 static int 3285 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3286 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3287 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags, 3288 struct spdk_bdev_ext_io_opts *ext_opts) 3289 { 3290 int rc; 3291 3292 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3293 lba_count, lba); 3294 3295 bio->iovs = iov; 3296 bio->iovcnt = iovcnt; 3297 bio->iovpos = 0; 3298 bio->iov_offset = 0; 3299 3300 if (ext_opts) { 3301 bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts); 3302 bio->ext_opts.memory_domain = ext_opts->memory_domain; 3303 bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx; 3304 bio->ext_opts.io_flags = flags; 3305 bio->ext_opts.metadata = md; 3306 3307 rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count, 3308 bdev_nvme_readv_done, bio, 3309 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3310 &bio->ext_opts); 3311 } else if (iovcnt == 1) { 3312 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 3313 lba_count, 3314 bdev_nvme_readv_done, bio, 3315 flags, 3316 0, 0); 3317 } else { 3318 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3319 bdev_nvme_readv_done, bio, flags, 3320 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3321 md, 0, 0); 3322 } 3323 3324 if (rc != 0 && rc != -ENOMEM) { 3325 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 3326 } 3327 return rc; 3328 } 3329 3330 static int 3331 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3332 struct nvme_bdev_io 
*bio,
		 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		 uint32_t flags, struct spdk_bdev_ext_io_opts *ext_opts)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (ext_opts) {
		bio->ext_opts.size = sizeof(struct spdk_nvme_ns_cmd_ext_io_opts);
		bio->ext_opts.memory_domain = ext_opts->memory_domain;
		bio->ext_opts.memory_domain_ctx = ext_opts->memory_domain_ctx;
		bio->ext_opts.io_flags = flags;
		bio->ext_opts.metadata = md;

		rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
						 bdev_nvme_writev_done, bio,
						 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						 &bio->ext_opts);
	} else if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
						    lba_count,
						    bdev_nvme_writev_done, bio,
						    flags,
						    0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
						     bdev_nvme_writev_done, bio, flags,
						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						     md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		       struct nvme_bdev_io *bio,
		       struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba,
		       uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
		      lba_count, zslba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	if (iovcnt == 1) {
		rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
						       lba_count,
						       bdev_nvme_zone_appendv_done, bio,
						       flags,
						       0, 0);
	} else {
		rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
							bdev_nvme_zone_appendv_done, bio, flags,
							bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
							md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		   struct nvme_bdev_io *bio,
		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		   uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
					       bdev_nvme_comparev_done, bio, flags,
					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					       md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int
rc; 3446 3447 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3448 lba_count, lba); 3449 3450 bio->iovs = cmp_iov; 3451 bio->iovcnt = cmp_iovcnt; 3452 bio->iovpos = 0; 3453 bio->iov_offset = 0; 3454 bio->fused_iovs = write_iov; 3455 bio->fused_iovcnt = write_iovcnt; 3456 bio->fused_iovpos = 0; 3457 bio->fused_iov_offset = 0; 3458 3459 if (bdev_io->num_retries == 0) { 3460 bio->first_fused_submitted = false; 3461 } 3462 3463 if (!bio->first_fused_submitted) { 3464 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3465 memset(&bio->cpl, 0, sizeof(bio->cpl)); 3466 3467 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3468 bdev_nvme_comparev_and_writev_done, bio, flags, 3469 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 3470 if (rc == 0) { 3471 bio->first_fused_submitted = true; 3472 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3473 } else { 3474 if (rc != -ENOMEM) { 3475 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 3476 } 3477 return rc; 3478 } 3479 } 3480 3481 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 3482 3483 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3484 bdev_nvme_comparev_and_writev_done, bio, flags, 3485 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 3486 if (rc != 0 && rc != -ENOMEM) { 3487 SPDK_ERRLOG("write failed: rc = %d\n", rc); 3488 rc = 0; 3489 } 3490 3491 return rc; 3492 } 3493 3494 static int 3495 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3496 struct nvme_bdev_io *bio, 3497 uint64_t offset_blocks, 3498 uint64_t num_blocks) 3499 { 3500 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 3501 struct spdk_nvme_dsm_range *range; 3502 uint64_t offset, remaining; 3503 uint64_t num_ranges_u64; 3504 uint16_t num_ranges; 3505 int rc; 3506 3507 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 3508 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3509 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 3510 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 3511 return -EINVAL; 3512 } 3513 num_ranges = (uint16_t)num_ranges_u64; 3514 3515 offset = offset_blocks; 3516 remaining = num_blocks; 3517 range = &dsm_ranges[0]; 3518 3519 /* Fill max-size ranges until the remaining blocks fit into one range */ 3520 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 3521 range->attributes.raw = 0; 3522 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3523 range->starting_lba = offset; 3524 3525 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3526 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3527 range++; 3528 } 3529 3530 /* Final range describes the remaining blocks */ 3531 range->attributes.raw = 0; 3532 range->length = remaining; 3533 range->starting_lba = offset; 3534 3535 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 3536 SPDK_NVME_DSM_ATTR_DEALLOCATE, 3537 dsm_ranges, num_ranges, 3538 bdev_nvme_queued_done, bio); 3539 3540 return rc; 3541 } 3542 3543 static int 3544 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3545 struct nvme_bdev_io *bio, 3546 uint64_t offset_blocks, 3547 uint64_t num_blocks) 3548 { 3549 if (num_blocks > UINT16_MAX + 1) { 3550 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 3551 return -EINVAL; 3552 } 3553 3554 return spdk_nvme_ns_cmd_write_zeroes(ns, qpair, 3555 offset_blocks, num_blocks, 3556 bdev_nvme_queued_done, bio, 3557 
0); 3558 } 3559 3560 static int 3561 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3562 struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 3563 struct spdk_bdev_zone_info *info) 3564 { 3565 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3566 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3567 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 3568 3569 if (zone_id % zone_size != 0) { 3570 return -EINVAL; 3571 } 3572 3573 if (num_zones > total_zones || !num_zones) { 3574 return -EINVAL; 3575 } 3576 3577 assert(!bio->zone_report_buf); 3578 bio->zone_report_buf = calloc(1, zone_report_bufsize); 3579 if (!bio->zone_report_buf) { 3580 return -ENOMEM; 3581 } 3582 3583 bio->handled_zones = 0; 3584 3585 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 3586 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 3587 bdev_nvme_get_zone_info_done, bio); 3588 } 3589 3590 static int 3591 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3592 struct nvme_bdev_io *bio, uint64_t zone_id, 3593 enum spdk_bdev_zone_action action) 3594 { 3595 switch (action) { 3596 case SPDK_BDEV_ZONE_CLOSE: 3597 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 3598 bdev_nvme_zone_management_done, bio); 3599 case SPDK_BDEV_ZONE_FINISH: 3600 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 3601 bdev_nvme_zone_management_done, bio); 3602 case SPDK_BDEV_ZONE_OPEN: 3603 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 3604 bdev_nvme_zone_management_done, bio); 3605 case SPDK_BDEV_ZONE_RESET: 3606 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 3607 bdev_nvme_zone_management_done, bio); 3608 case SPDK_BDEV_ZONE_OFFLINE: 3609 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 3610 bdev_nvme_zone_management_done, bio); 3611 default: 3612 return -EINVAL; 3613 } 3614 } 3615 3616 static int 3617 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 3618 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3619 { 3620 struct nvme_ctrlr *nvme_ctrlr; 3621 uint32_t max_xfer_size; 3622 3623 if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) { 3624 return -EINVAL; 3625 } 3626 3627 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 3628 3629 if (nbytes > max_xfer_size) { 3630 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3631 return -EINVAL; 3632 } 3633 3634 bio->orig_thread = spdk_get_thread(); 3635 3636 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, 3637 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 3638 } 3639 3640 static int 3641 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3642 struct nvme_bdev_io *bio, 3643 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3644 { 3645 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3646 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3647 3648 if (nbytes > max_xfer_size) { 3649 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3650 return -EINVAL; 3651 } 3652 3653 /* 3654 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3655 * so fill it out automatically. 
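	 * Any nsid the caller placed in the passthru command is overwritten with
	 * the namespace backing this bdev.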
3656 */ 3657 cmd->nsid = spdk_nvme_ns_get_id(ns); 3658 3659 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 3660 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 3661 } 3662 3663 static int 3664 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3665 struct nvme_bdev_io *bio, 3666 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 3667 { 3668 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 3669 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3670 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3671 3672 if (nbytes > max_xfer_size) { 3673 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3674 return -EINVAL; 3675 } 3676 3677 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 3678 SPDK_ERRLOG("invalid meta data buffer size\n"); 3679 return -EINVAL; 3680 } 3681 3682 /* 3683 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3684 * so fill it out automatically. 3685 */ 3686 cmd->nsid = spdk_nvme_ns_get_id(ns); 3687 3688 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 3689 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 3690 } 3691 3692 static int 3693 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 3694 struct nvme_bdev_io *bio_to_abort) 3695 { 3696 struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch; 3697 int rc; 3698 3699 bio->orig_thread = spdk_get_thread(); 3700 3701 rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr, 3702 ctrlr_ch->qpair, 3703 bio_to_abort, 3704 bdev_nvme_abort_done, bio); 3705 if (rc == -ENOENT) { 3706 /* If no command was found in I/O qpair, the target command may be 3707 * admin command. 3708 */ 3709 rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr, 3710 NULL, 3711 bio_to_abort, 3712 bdev_nvme_abort_done, bio); 3713 } 3714 3715 if (rc == -ENOENT) { 3716 /* If no command was found, complete the abort request with failure. 
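		 * Build a synthetic completion: bit 0 of CDW0 set to 1 means the command
		 * was not aborted, which bdev_nvme_abort_completion() reports as
		 * SPDK_BDEV_IO_STATUS_FAILED.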
*/ 3717 bio->cpl.cdw0 |= 1U; 3718 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3719 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3720 3721 bdev_nvme_abort_completion(bio); 3722 3723 rc = 0; 3724 } 3725 3726 return rc; 3727 } 3728 3729 static void 3730 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 3731 struct nvme_ns *nvme_ns) 3732 { 3733 /* nop */ 3734 } 3735 3736 static void 3737 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_ns *nvme_ns) 3738 { 3739 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 3740 } 3741 3742 static void 3743 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 3744 { 3745 const char *action; 3746 3747 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 3748 action = "reset"; 3749 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 3750 action = "abort"; 3751 } else { 3752 action = "none"; 3753 } 3754 3755 spdk_json_write_object_begin(w); 3756 3757 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 3758 3759 spdk_json_write_named_object_begin(w, "params"); 3760 spdk_json_write_named_string(w, "action_on_timeout", action); 3761 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 3762 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 3763 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 3764 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 3765 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 3766 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 3767 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 3768 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 3769 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 3770 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 3771 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3772 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3773 spdk_json_write_object_end(w); 3774 3775 spdk_json_write_object_end(w); 3776 } 3777 3778 static void 3779 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 3780 struct nvme_ctrlr *nvme_ctrlr) 3781 { 3782 struct spdk_nvme_transport_id *trid; 3783 3784 trid = nvme_ctrlr->connected_trid; 3785 3786 spdk_json_write_object_begin(w); 3787 3788 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3789 3790 spdk_json_write_named_object_begin(w, "params"); 3791 spdk_json_write_named_string(w, "name", nvme_ctrlr->name); 3792 nvme_bdev_dump_trid_json(trid, w); 3793 spdk_json_write_named_bool(w, "prchk_reftag", 3794 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3795 spdk_json_write_named_bool(w, "prchk_guard", 3796 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3797 3798 spdk_json_write_object_end(w); 3799 3800 spdk_json_write_object_end(w); 3801 } 3802 3803 static void 3804 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 3805 { 3806 spdk_json_write_object_begin(w); 3807 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3808 3809 spdk_json_write_named_object_begin(w, "params"); 3810 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3811 spdk_json_write_named_bool(w, "enable", 
g_nvme_hotplug_enabled); 3812 spdk_json_write_object_end(w); 3813 3814 spdk_json_write_object_end(w); 3815 } 3816 3817 static int 3818 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3819 { 3820 struct nvme_ctrlr *nvme_ctrlr; 3821 uint32_t nsid; 3822 3823 bdev_nvme_opts_config_json(w); 3824 3825 pthread_mutex_lock(&g_bdev_nvme_mutex); 3826 3827 TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { 3828 nvme_ctrlr_config_json(w, nvme_ctrlr); 3829 3830 for (nsid = 0; nsid < nvme_ctrlr->num_ns; ++nsid) { 3831 if (!nvme_ctrlr->namespaces[nsid]->populated) { 3832 continue; 3833 } 3834 3835 nvme_namespace_config_json(w, nvme_ctrlr->namespaces[nsid]); 3836 } 3837 } 3838 3839 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3840 * before enabling hotplug poller. 3841 */ 3842 bdev_nvme_hotplug_config_json(w); 3843 3844 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3845 return 0; 3846 } 3847 3848 struct spdk_nvme_ctrlr * 3849 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3850 { 3851 if (!bdev || bdev->module != &nvme_if) { 3852 return NULL; 3853 } 3854 3855 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3856 } 3857 3858 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3859