/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"
#include "bdev_ocssd.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** Array of iovecs to transfer for the fused command. */
	struct iovec *fused_iovs;

	/** Number of iovecs in the fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position in fused_iovs. */
	int fused_iovpos;

	/** Offset in the current fused iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Tracks whether the first of the fused commands has been submitted */
	bool first_fused_submitted;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Number of zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;
};

struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* Controllers deleted by users via RPC are skipped by the hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.timeout_admin_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			   struct nvme_bdev_io *bio,
			   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			   uint32_t flags);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			    struct nvme_bdev_io *bio,
			    struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			    uint32_t flags);
static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				  struct nvme_bdev_io *bio,
				  struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
				  uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio,
			      struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
			      uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				   struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
				   struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				     struct nvme_bdev_io *bio, uint64_t zone_id,
				     enum spdk_bdev_zone_action action);
static int bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				 struct nvme_bdev_io *bio,
				 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
				    struct nvme_bdev_io *bio,
				    struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
			   struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);

typedef void (*populate_namespace_fn)(struct nvme_ctrlr *nvme_ctrlr,
				      struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
		struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
	bdev_ocssd_populate_namespace,
};

typedef void (*depopulate_namespace_fn)(struct nvme_ns *nvme_ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
	bdev_ocssd_depopulate_namespace,
};

typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
		struct nvme_ns *nvme_ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_ns *nvme_ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
	bdev_ocssd_namespace_config_json,
};

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_ctrlr_channel *ctrlr_ch;

	assert(ctrlr_io_ch != NULL);

	ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return ctrlr_ch->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name		= "nvme",
	.async_fini	= true,
	.module_init	= bdev_nvme_library_init,
	.module_fini	= bdev_nvme_library_fini,
	.config_json	= bdev_nvme_config_json,
	.get_ctx_size	= bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

static inline bool
bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch,
		       struct spdk_nvme_ns **_ns, struct spdk_nvme_qpair **_qpair)
{
	if (spdk_unlikely(nbdev_ch->ctrlr_ch->qpair == NULL)) {
		/* The device is currently resetting. */
		return false;
	}

	*_ns = nbdev_ch->nvme_ns->ns;
	*_qpair = nbdev_ch->ctrlr_ch->qpair;
	return true;
}

static inline bool
bdev_nvme_find_admin_path(struct nvme_bdev_channel *nbdev_ch,
			  struct nvme_ctrlr **_nvme_ctrlr)
{
	*_nvme_ctrlr = nbdev_ch->ctrlr_ch->ctrlr;
	return true;
}

static inline void
bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
				  const struct spdk_nvme_cpl *cpl)
{
	spdk_bdev_io_complete_nvme_status(spdk_bdev_io_from_ctx(bio), cpl->cdw0,
					  cpl->status.sct, cpl->status.sc);
}

static inline void
bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
{
	enum spdk_bdev_io_status io_status;

	if (rc == 0) {
		io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
	} else if (rc == -ENOMEM) {
		io_status = SPDK_BDEV_IO_STATUS_NOMEM;
	} else {
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	}

	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
}

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
	/*
	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
	 * reconnect a qpair and we will stop getting a callback for this one.
	 */
	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	if (rc != 0) {
		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_ctrlr *nvme_ctrlr = arg;

	assert(nvme_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
	if (rc < 0) {
		bdev_nvme_failover(nvme_ctrlr, false);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

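/* Descriptive note (added): teardown helpers for an nvme_bdev. bdev_nvme_destruct() below
 * detaches the bdev from its namespace and releases the controller reference if the
 * namespace is already depopulated; this callback then frees the disk name and the
 * nvme_bdev itself once the io_device is fully unregistered.
 */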
static void
_bdev_nvme_unregister_dev_cb(void *io_device)
{
	struct nvme_bdev *nvme_disk = io_device;

	free(nvme_disk->disk.name);
	free(nvme_disk);
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_ns *nvme_ns = nvme_disk->nvme_ns;

	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

	nvme_ns->bdev = NULL;

	if (!nvme_ns->populated) {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

		nvme_ctrlr_release(nvme_ns->ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
	}

	spdk_io_device_unregister(nvme_disk, _bdev_nvme_unregister_dev_cb);

	return 0;
}

static int
bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	bdev_nvme_io_complete(bio, 0);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	struct spdk_nvme_ctrlr *ctrlr = ctrlr_ch->ctrlr->ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	assert(ctrlr_ch->group != NULL);

	rc = spdk_nvme_poll_group_add(ctrlr_ch->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	ctrlr_ch->qpair = qpair;

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static void
bdev_nvme_destroy_qpair(struct nvme_ctrlr_channel *ctrlr_ch)
{
	if (ctrlr_ch->qpair != NULL) {
		spdk_nvme_ctrlr_free_io_qpair(ctrlr_ch->qpair);
		ctrlr_ch->qpair = NULL;
	}
}

static void
_bdev_nvme_check_pending_destruct(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct_after_reset) {
		assert(nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct);
		pthread_mutex_unlock(&nvme_ctrlr->mutex);

		spdk_thread_send_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister,
				     nvme_ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	}
}

static void
bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	_bdev_nvme_check_pending_destruct(nvme_ctrlr);
}

static void
_bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel *ctrlr_ch,
				   enum spdk_bdev_io_status status)
{
	struct spdk_bdev_io *bdev_io;

	while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&ctrlr_ch->pending_resets);
		TAILQ_REMOVE(&ctrlr_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}
}

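/* Descriptive note (added): spdk_for_each_channel() callbacks used after a controller reset.
 * Resets that were queued on each channel while the reset was in progress are completed
 * with success when the reset worked, and failed (aborted) otherwise.
 */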
static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_SUCCESS);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(ctrlr_ch, SPDK_BDEV_IO_STATUS_FAILED);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_reset_complete(struct nvme_ctrlr *nvme_ctrlr, int rc)
{
	struct nvme_ctrlr_trid *curr_trid;
	bdev_nvme_reset_cb reset_cb_fn = nvme_ctrlr->reset_cb_fn;
	void *reset_cb_arg = nvme_ctrlr->reset_cb_arg;

	nvme_ctrlr->reset_cb_fn = NULL;
	nvme_ctrlr->reset_cb_arg = NULL;

	if (rc) {
		SPDK_ERRLOG("Resetting controller failed.\n");
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	nvme_ctrlr->resetting = false;
	nvme_ctrlr->failover_in_progress = false;

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid != NULL);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);

	curr_trid->is_failed = rc != 0 ? true : false;

	if (nvme_ctrlr->ref == 0 && nvme_ctrlr->destruct) {
		/* Destruct ctrlr after clearing pending resets. */
		nvme_ctrlr->destruct_after_reset = true;
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	if (reset_cb_fn) {
		reset_cb_fn(reset_cb_arg, rc);
	}

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_ctrlr,
			      rc == 0 ? bdev_nvme_complete_pending_resets :
			      bdev_nvme_abort_pending_resets,
			      NULL,
			      bdev_nvme_check_pending_destruct);
}

static void
bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);

	bdev_nvme_reset_complete(nvme_ctrlr, status);
}

static void
bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(ctrlr_ch);

	spdk_for_each_channel_continue(i, rc);
}

static int
bdev_nvme_ctrlr_reset_poll(void *arg)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	int rc;

	rc = spdk_nvme_ctrlr_reset_poll_async(nvme_ctrlr->reset_ctx);
	if (rc == -EAGAIN) {
		return SPDK_POLLER_BUSY;
	}

	spdk_poller_unregister(&nvme_ctrlr->reset_poller);
	if (rc == 0) {
		/* Recreate all of the I/O queue pairs */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_create_qpair,
				      NULL,
				      bdev_nvme_reset_create_qpairs_done);
	} else {
		bdev_nvme_reset_complete(nvme_ctrlr, rc);
	}
	return SPDK_POLLER_BUSY;
}

static void
bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
	int rc;

	if (status) {
		rc = status;
		goto err;
	}

	rc = spdk_nvme_ctrlr_reset_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->reset_ctx);
	if (rc != 0) {
		SPDK_ERRLOG("Create controller reset context failed\n");
		goto err;
	}
	assert(nvme_ctrlr->reset_poller == NULL);
	nvme_ctrlr->reset_poller = SPDK_POLLER_REGISTER(bdev_nvme_ctrlr_reset_poll,
				   nvme_ctrlr, 0);

	return;

err:
	bdev_nvme_reset_complete(nvme_ctrlr, rc);
}

static void
bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);

	bdev_nvme_destroy_qpair(ctrlr_ch);
	spdk_for_each_channel_continue(i, 0);
}

static int
bdev_nvme_reset(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return -EBUSY;
	}

	if (nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EAGAIN;
	}

	nvme_ctrlr->resetting = true;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);

	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_ctrlr,
			      bdev_nvme_reset_destroy_qpair,
			      NULL,
			      bdev_nvme_reset_ctrlr);

	return 0;
}

int
bdev_nvme_reset_rpc(struct nvme_ctrlr *nvme_ctrlr, bdev_nvme_reset_cb cb_fn, void *cb_arg)
{
	int rc;

	rc = bdev_nvme_reset(nvme_ctrlr);
	if (rc == 0) {
		nvme_ctrlr->reset_cb_fn = cb_fn;
		nvme_ctrlr->reset_cb_arg = cb_arg;
	}
	return rc;
}

static void
bdev_nvme_reset_io_complete(void *cb_arg, int rc)
{
	struct nvme_bdev_io *bio = cb_arg;

	bdev_nvme_io_complete(bio, rc);
}

static int
bdev_nvme_reset_io(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio)
{
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct spdk_bdev_io *bdev_io;
	int rc;

	rc = bdev_nvme_reset(ctrlr_ch->ctrlr);
	if (rc == 0) {
		assert(ctrlr_ch->ctrlr->reset_cb_fn == NULL);
		assert(ctrlr_ch->ctrlr->reset_cb_arg == NULL);
		ctrlr_ch->ctrlr->reset_cb_fn = bdev_nvme_reset_io_complete;
		ctrlr_ch->ctrlr->reset_cb_arg = bio;
	} else if (rc == -EAGAIN) {
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		bdev_io = spdk_bdev_io_from_ctx(bio);
		TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

static int
bdev_nvme_failover_start(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	struct nvme_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	curr_trid = TAILQ_FIRST(&nvme_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_ctrlr->resetting) {
		if (next_trid && !nvme_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
		nvme_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return 0;
}

static int
bdev_nvme_failover(struct nvme_ctrlr *nvme_ctrlr, bool remove)
{
	int rc;

	rc = bdev_nvme_failover_start(nvme_ctrlr, remove);
	if (rc == 0) {
		/* First, delete all NVMe I/O queue pairs. */
		spdk_for_each_channel(nvme_ctrlr,
				      bdev_nvme_reset_destroy_qpair,
				      NULL,
				      bdev_nvme_reset_ctrlr);
	} else if (rc != -EBUSY) {
		return rc;
	}

	return 0;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static int
bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		       struct nvme_bdev_io *bio,
		       uint64_t offset_blocks,
		       uint64_t num_blocks);

static void
bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		     bool success)
{
	struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int ret;

	if (!success) {
		ret = -EINVAL;
		goto exit;
	}

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		ret = -ENXIO;
		goto exit;
	}

	ret = bdev_nvme_readv(ns, qpair, bio,
			      bdev_io->u.bdev.iovs,
			      bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.md_buf,
			      bdev_io->u.bdev.num_blocks,
			      bdev_io->u.bdev.offset_blocks,
			      bdev->dif_check_flags);

exit:
	if (spdk_unlikely(ret != 0)) {
		bdev_nvme_io_complete(bio, ret);
	}
}

static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_qpair *qpair;
	int rc = 0;

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) {
		rc = -ENXIO;
		goto exit;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			rc = bdev_nvme_readv(ns, qpair, nbdev_io,
					     bdev_io->u.bdev.iovs,
					     bdev_io->u.bdev.iovcnt,
					     bdev_io->u.bdev.md_buf,
					     bdev_io->u.bdev.num_blocks,
					     bdev_io->u.bdev.offset_blocks,
					     bdev->dif_check_flags);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
			rc = 0;
		}
		break;
	case SPDK_BDEV_IO_TYPE_WRITE:
		rc = bdev_nvme_writev(ns, qpair, nbdev_io,
				      bdev_io->u.bdev.iovs,
				      bdev_io->u.bdev.iovcnt,
				      bdev_io->u.bdev.md_buf,
				      bdev_io->u.bdev.num_blocks,
				      bdev_io->u.bdev.offset_blocks,
				      bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE:
		rc = bdev_nvme_comparev(ns, qpair, nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		rc = bdev_nvme_comparev_and_writev(ns, qpair, nbdev_io,
						   bdev_io->u.bdev.iovs,
						   bdev_io->u.bdev.iovcnt,
						   bdev_io->u.bdev.fused_iovs,
						   bdev_io->u.bdev.fused_iovcnt,
						   bdev_io->u.bdev.md_buf,
						   bdev_io->u.bdev.num_blocks,
						   bdev_io->u.bdev.offset_blocks,
						   bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_UNMAP:
		rc = bdev_nvme_unmap(ns, qpair, nbdev_io,
				     bdev_io->u.bdev.offset_blocks,
				     bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		rc = bdev_nvme_write_zeroes(ns, qpair, nbdev_io,
					    bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_RESET:
		rc = bdev_nvme_reset_io(nbdev_ch, nbdev_io);
		break;
	case SPDK_BDEV_IO_TYPE_FLUSH:
		rc = bdev_nvme_flush(ns, qpair, nbdev_io,
				     bdev_io->u.bdev.offset_blocks,
				     bdev_io->u.bdev.num_blocks);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		rc = bdev_nvme_zone_appendv(ns, qpair, nbdev_io,
					    bdev_io->u.bdev.iovs,
					    bdev_io->u.bdev.iovcnt,
					    bdev_io->u.bdev.md_buf,
					    bdev_io->u.bdev.num_blocks,
					    bdev_io->u.bdev.offset_blocks,
					    bdev->dif_check_flags);
		break;
	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		rc = bdev_nvme_get_zone_info(ns, qpair, nbdev_io,
					     bdev_io->u.zone_mgmt.zone_id,
					     bdev_io->u.zone_mgmt.num_zones,
					     bdev_io->u.zone_mgmt.buf);
		break;
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		rc = bdev_nvme_zone_management(ns, qpair, nbdev_io,
					       bdev_io->u.zone_mgmt.zone_id,
					       bdev_io->u.zone_mgmt.zone_action);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		rc = bdev_nvme_admin_passthru(nbdev_ch, nbdev_io,
					      &bdev_io->u.nvme_passthru.cmd,
					      bdev_io->u.nvme_passthru.buf,
					      bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO:
		rc = bdev_nvme_io_passthru(ns, qpair, nbdev_io,
					   &bdev_io->u.nvme_passthru.cmd,
					   bdev_io->u.nvme_passthru.buf,
					   bdev_io->u.nvme_passthru.nbytes);
		break;
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		rc = bdev_nvme_io_passthru_md(ns, qpair, nbdev_io,
					      &bdev_io->u.nvme_passthru.cmd,
					      bdev_io->u.nvme_passthru.buf,
					      bdev_io->u.nvme_passthru.nbytes,
					      bdev_io->u.nvme_passthru.md_buf,
					      bdev_io->u.nvme_passthru.md_len);
		break;
	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		rc = bdev_nvme_abort(nbdev_ch, nbdev_io, nbdev_io_to_abort);
		break;
	default:
		rc = -EINVAL;
		break;
	}

exit:
	if (spdk_unlikely(rc != 0)) {
		bdev_nvme_io_complete(nbdev_io, rc);
	}
}

static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	nvme_ns = nbdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return spdk_nvme_ns_supports_compare(ns);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return spdk_nvme_ns_get_md_size(ns) ? true : false;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.dsm;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.write_zeroes;

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
			return true;
		}
		return false;

	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;

	default:
		return false;
	}
}

static int
bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr *nvme_ctrlr = io_device;
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
	struct spdk_io_channel *pg_ch;
	int rc;

	pg_ch = spdk_get_io_channel(&g_nvme_ctrlrs);
	if (!pg_ch) {
		return -1;
	}

	ctrlr_ch->group = spdk_io_channel_get_ctx(pg_ch);

#ifdef SPDK_CONFIG_VTUNE
	ctrlr_ch->group->collect_spin_stat = true;
#else
	ctrlr_ch->group->collect_spin_stat = false;
#endif

	TAILQ_INIT(&ctrlr_ch->pending_resets);

	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_ctrlr->ctrlr)) {
		rc = bdev_ocssd_create_io_channel(ctrlr_ch);
		if (rc != 0) {
			goto err_ocssd_ch;
		}
	}

	ctrlr_ch->ctrlr = nvme_ctrlr;

	rc = bdev_nvme_create_qpair(ctrlr_ch);
	if (rc != 0) {
		goto err_qpair;
	}

	return 0;

err_qpair:
	if (ctrlr_ch->ocssd_ch) {
		bdev_ocssd_destroy_io_channel(ctrlr_ch);
	}
err_ocssd_ch:
	spdk_put_io_channel(pg_ch);

	return rc;
}

static void
bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;

	assert(ctrlr_ch->group != NULL);

	if (ctrlr_ch->ocssd_ch != NULL) {
		bdev_ocssd_destroy_io_channel(ctrlr_ch);
	}

	bdev_nvme_destroy_qpair(ctrlr_ch);

	spdk_put_io_channel(spdk_io_channel_from_ctx(ctrlr_ch->group));
}

static void
bdev_nvme_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
			      uint32_t iov_cnt, uint32_t seed,
			      spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct nvme_poll_group *group = ctx;
	int rc;

	assert(group->accel_channel != NULL);
	assert(cb_fn != NULL);

	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
	if (rc) {
		/* In these two error cases, spdk_accel_submit_crc32cv() does not call the user's cb_fn,
		 * so call it here. */
		if (rc == -ENOMEM || rc == -EINVAL) {
			cb_fn(cb_arg, rc);
		}
		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
	}
}

static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
	.submit_accel_crc32c	= bdev_nvme_submit_accel_crc32c,
};

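/* Descriptive note (added): each SPDK thread gets its own nvme_poll_group, which holds an
 * NVMe poll group for the I/O qpairs, an accel framework channel used to offload CRC-32C
 * calculations for the NVMe driver, and a poller that runs bdev_nvme_poll() at the
 * configured nvme_ioq_poll_period_us.
 */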
static int
bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
	if (group->group == NULL) {
		return -1;
	}

	group->accel_channel = spdk_accel_engine_get_io_channel();
	if (!group->accel_channel) {
		spdk_nvme_poll_group_destroy(group->group);
		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
			    group);
		return -1;
	}

	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	if (group->poller == NULL) {
		spdk_put_io_channel(group->accel_channel);
		spdk_nvme_poll_group_destroy(group->group);
		return -1;
	}

	return 0;
}

static void
bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
{
	struct nvme_poll_group *group = ctx_buf;

	if (group->accel_channel) {
		spdk_put_io_channel(group->accel_channel);
	}

	spdk_poller_unregister(&group->poller);
	if (spdk_nvme_poll_group_destroy(group->group)) {
		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
		assert(false);
	}
}

static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return spdk_get_io_channel(nvme_bdev);
}

static void *
bdev_nvme_get_module_ctx(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
}

static const char *
_nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
{
	switch (ana_state) {
	case SPDK_NVME_ANA_OPTIMIZED_STATE:
		return "optimized";
	case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
		return "non_optimized";
	case SPDK_NVME_ANA_INACCESSIBLE_STATE:
		return "inaccessible";
	case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
		return "persistent_loss";
	case SPDK_NVME_ANA_CHANGE_STATE:
		return "change";
	default:
		return NULL;
	}
}

static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct nvme_bdev *nvme_bdev = ctx;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	union spdk_nvme_vs_register vs;
	union spdk_nvme_csts_register csts;
	char buf[128];

	nvme_ns = nvme_bdev->nvme_ns;
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);

	spdk_json_write_named_object_begin(w, "nvme");

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		spdk_json_write_named_string(w, "pci_address", trid->traddr);
	}

	spdk_json_write_named_object_begin(w, "trid");

	nvme_bdev_dump_trid_json(trid, w);

	spdk_json_write_object_end(w);

#ifdef SPDK_CONFIG_NVME_CUSE
	size_t cuse_name_size = 128;
	char cuse_name[cuse_name_size];

	int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
					    cuse_name, &cuse_name_size);
	if (rc == 0) {
		spdk_json_write_named_string(w, "cuse_device", cuse_name);
	}
#endif

	spdk_json_write_named_object_begin(w, "ctrlr_data");

	spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);

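	/* Descriptive note (added): the Identify fields mn, sn, and fr are fixed-size,
	 * space-padded and not necessarily NUL-terminated, so copy them into a local buffer
	 * and trim trailing spaces before emitting them as JSON strings.
	 */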
	snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "model_number", buf);

	snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "serial_number", buf);

	snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
	spdk_str_trim(buf);
	spdk_json_write_named_string(w, "firmware_revision", buf);

	if (cdata->subnqn[0] != '\0') {
		spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
	}

	spdk_json_write_named_object_begin(w, "oacs");

	spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
	spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
	spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
	spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);

	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "vs");

	spdk_json_write_name(w, "nvme_version");
	if (vs.bits.ter) {
		spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
	} else {
		spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
	}

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "csts");

	spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy);
	spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs);

	spdk_json_write_object_end(w);

	spdk_json_write_named_object_begin(w, "ns_data");

	spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));

	if (cdata->cmic.ana_reporting) {
		spdk_json_write_named_string(w, "ana_state",
					     _nvme_ana_state_str(nvme_ns->ana_state));
	}

	spdk_json_write_object_end(w);

	if (cdata->oacs.security) {
		spdk_json_write_named_object_begin(w, "security");

		spdk_json_write_named_bool(w, "opal", nvme_bdev->opal);

		spdk_json_write_object_end(w);
	}

	spdk_json_write_object_end(w);

	return 0;
}

static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	/* No config per bdev needed */
}

static uint64_t
bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
{
	struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch;
	struct nvme_poll_group *group = ctrlr_ch->group;
	uint64_t spin_time;

	if (!group || !group->collect_spin_stat) {
		return 0;
	}

	if (group->end_ticks != 0) {
		group->spin_ticks += (group->end_ticks - group->start_ticks);
		group->end_ticks = 0;
	}

	spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz();
	group->start_ticks = 0;
	group->spin_ticks = 0;

	return spin_time;
}

static const struct spdk_bdev_fn_table nvmelib_fn_table = {
	.destruct		= bdev_nvme_destruct,
	.submit_request		= bdev_nvme_submit_request,
	.io_type_supported	= bdev_nvme_io_type_supported,
	.get_io_channel		= bdev_nvme_get_io_channel,
	.dump_info_json		= bdev_nvme_dump_info_json,
	.write_config_json	= bdev_nvme_write_config_json,
	.get_spin_time		= bdev_nvme_get_spin_time,
	.get_module_ctx		= bdev_nvme_get_module_ctx,
};

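/* Descriptive note (added): helpers for walking the ANA log page. Descriptors in the log
 * page are packed and not guaranteed to be aligned, so bdev_nvme_parse_ana_log_page()
 * copies each descriptor into the pre-allocated copied_ana_desc buffer before handing it
 * to the callback.
 */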
typedef int (*bdev_nvme_parse_ana_log_page_cb)(
	const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg);

static int
bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
			     bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
{
	struct spdk_nvme_ana_group_descriptor *copied_desc;
	uint8_t *orig_desc;
	uint32_t i, desc_size, copy_len;
	int rc = 0;

	if (nvme_ctrlr->ana_log_page == NULL) {
		return -EINVAL;
	}

	copied_desc = nvme_ctrlr->copied_ana_desc;

	orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
	copy_len = nvme_ctrlr->ana_log_page_size - sizeof(struct spdk_nvme_ana_page);

	for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
		memcpy(copied_desc, orig_desc, copy_len);

		rc = cb_fn(copied_desc, cb_arg);
		if (rc != 0) {
			break;
		}

		desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
			    copied_desc->num_of_nsid * sizeof(uint32_t);
		orig_desc += desc_size;
		copy_len -= desc_size;
	}

	return rc;
}

static int
nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
{
	struct nvme_ns *nvme_ns = cb_arg;
	uint32_t i;

	for (i = 0; i < desc->num_of_nsid; i++) {
		if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
			continue;
		}
		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
		return 1;
	}

	return 0;
}

static int
nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
		 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
		 uint32_t prchk_flags, void *ctx)
{
	const struct spdk_uuid *uuid;
	const uint8_t *nguid;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_ns_data *nsdata;
	enum spdk_nvme_csi csi;
	uint32_t atomic_bs, phys_bs, bs;

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	csi = spdk_nvme_ns_get_csi(ns);

	switch (csi) {
	case SPDK_NVME_CSI_NVM:
		disk->product_name = "NVMe disk";
		break;
	case SPDK_NVME_CSI_ZNS:
		disk->product_name = "NVMe ZNS disk";
		disk->zoned = true;
		disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
		disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
					     spdk_nvme_ns_get_extended_sector_size(ns);
		disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
		disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
		break;
	default:
		SPDK_ERRLOG("unsupported CSI: %u\n", csi);
		return -ENOTSUP;
	}

	disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
	if (!disk->name) {
		return -ENOMEM;
	}

	disk->write_cache = 0;
	if (cdata->vwc.present) {
		/* Enable if the Volatile Write Cache exists */
		disk->write_cache = 1;
	}
	if (cdata->oncs.write_zeroes) {
		disk->max_write_zeroes = UINT16_MAX + 1;
	}
	disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
	disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
	disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);

	nguid = spdk_nvme_ns_get_nguid(ns);
	if (!nguid) {
		uuid = spdk_nvme_ns_get_uuid(ns);
		if (uuid) {
			disk->uuid = *uuid;
		}
	} else {
		memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
	}

	nsdata = spdk_nvme_ns_get_data(ns);
	bs = spdk_nvme_ns_get_sector_size(ns);
	atomic_bs = bs;
	phys_bs = bs;
	if (nsdata->nabo == 0) {
		if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
			atomic_bs = bs * (1 + nsdata->nawupf);
		} else {
			atomic_bs = bs * (1 + cdata->awupf);
		}
	}
	if (nsdata->nsfeat.optperf) {
		phys_bs = bs * (1 + nsdata->npwg);
	}
	disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);

	disk->md_len = spdk_nvme_ns_get_md_size(ns);
	if (disk->md_len != 0) {
		disk->md_interleave = nsdata->flbas.extended;
		disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
		if (disk->dif_type != SPDK_DIF_DISABLE) {
			disk->dif_is_head_of_md = nsdata->dps.md_start;
			disk->dif_check_flags = prchk_flags;
		}
	}

	if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
	      SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
		disk->acwu = 0;
	} else if (nsdata->nsfeat.ns_atomic_write_unit) {
		disk->acwu = nsdata->nacwu;
	} else {
		disk->acwu = cdata->acwu;
	}

	disk->ctxt = ctx;
	disk->fn_table = &nvmelib_fn_table;
	disk->module = &nvme_if;

	return 0;
}

static int
nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;
	int rc;

	bdev = calloc(1, sizeof(*bdev));
	if (!bdev) {
		SPDK_ERRLOG("bdev calloc() failed\n");
		return -ENOMEM;
	}

	bdev->nvme_ns = nvme_ns;
	bdev->opal = nvme_ctrlr->opal_dev != NULL;

	rc = nvme_disk_create(&bdev->disk, nvme_ctrlr->name, nvme_ctrlr->ctrlr,
			      nvme_ns->ns, nvme_ctrlr->prchk_flags, bdev);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create NVMe disk\n");
		free(bdev);
		return rc;
	}

	spdk_io_device_register(bdev,
				bdev_nvme_create_bdev_channel_cb,
				bdev_nvme_destroy_bdev_channel_cb,
				sizeof(struct nvme_bdev_channel),
				bdev->disk.name);

	rc = spdk_bdev_register(&bdev->disk);
	if (rc != 0) {
		SPDK_ERRLOG("spdk_bdev_register() failed\n");
		spdk_io_device_unregister(bdev, NULL);
		free(bdev->disk.name);
		free(bdev);
		return rc;
	}

	nvme_ns->bdev = bdev;

	return 0;
}

static bool
bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
{
	const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
	const struct spdk_uuid *uuid1, *uuid2;

	nsdata1 = spdk_nvme_ns_get_data(ns1);
	nsdata2 = spdk_nvme_ns_get_data(ns2);
	uuid1 = spdk_nvme_ns_get_uuid(ns1);
	uuid2 = spdk_nvme_ns_get_uuid(ns2);

	return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
	       nsdata1->eui64 == nsdata2->eui64 &&
	       uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0;
}

static void
nvme_ctrlr_populate_standard_namespace(struct nvme_ctrlr *nvme_ctrlr,
				       struct nvme_ns *nvme_ns, struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct spdk_nvme_ns *ns;
	int rc = 0;

	ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
	if (!ns) {
		SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id);
		rc = -EINVAL;
		goto done;
	}

	nvme_ns->ns = ns;
	nvme_ns->populated = true;
	nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;

	if (nvme_ctrlr->ana_log_page != NULL) {
		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
	}

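	/* Descriptive note (added): the namespace attributes and ANA state (if reported) are
	 * in place; create and register a bdev for this namespace.
	 */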
	rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
done:
	nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc);
}

static bool
hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
		 struct spdk_nvme_ctrlr_opts *opts)
{
	struct nvme_probe_skip_entry *entry;

	TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
		if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
			return false;
		}
	}

	opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
	opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
	opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
	opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
	opts->disable_read_ana_log_page = true;

	SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);

	return true;
}

static void
nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("Abort failed. Resetting controller. sc is %u, sct is %u.\n", cpl->status.sc,
			     cpl->status.sct);
		bdev_nvme_reset(nvme_ctrlr);
	}
}

static void
timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
	   struct spdk_nvme_qpair *qpair, uint16_t cid)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	union spdk_nvme_csts_register csts;
	int rc;

	assert(nvme_ctrlr->ctrlr == ctrlr);

	SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid);

	/* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
	 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
	 * would submit another fabrics cmd on the admin queue to read CSTS and check for its
	 * completion recursively.
	 */
	if (nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
		csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
		if (csts.bits.cfs) {
			SPDK_ERRLOG("Controller Fatal Status, reset required\n");
			bdev_nvme_reset(nvme_ctrlr);
			return;
		}
	}

	switch (g_opts.action_on_timeout) {
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
		if (qpair) {
			/* Don't send abort to ctrlr when reset is running. */
			pthread_mutex_lock(&nvme_ctrlr->mutex);
			if (nvme_ctrlr->resetting) {
				pthread_mutex_unlock(&nvme_ctrlr->mutex);
				SPDK_NOTICELOG("Quit abort. Ctrlr is in the process of resetting.\n");
				return;
			}
			pthread_mutex_unlock(&nvme_ctrlr->mutex);

			rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
						       nvme_abort_cpl, nvme_ctrlr);
			if (rc == 0) {
				return;
			}

			SPDK_ERRLOG("Unable to send abort. Resetting, rc is %d.\n", rc);
		}

	/* FALLTHROUGH */
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
		bdev_nvme_reset(nvme_ctrlr);
		break;
	case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
		SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n");
		break;
	default:
		SPDK_ERRLOG("An invalid timeout action value is found.\n");
		break;
	}
}

static void
nvme_ctrlr_depopulate_standard_namespace(struct nvme_ns *nvme_ns)
{
	struct nvme_bdev *bdev;

	bdev = nvme_ns->bdev;
	if (bdev != NULL) {
		spdk_bdev_unregister(&bdev->disk, NULL, NULL);
	}

	nvme_ctrlr_depopulate_namespace_done(nvme_ns);
}

static void
nvme_ctrlr_populate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns,
			      struct nvme_async_probe_ctx *ctx)
{
	g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx);
}

static void
nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *ctrlr, struct nvme_ns *nvme_ns)
{
	g_depopulate_namespace_fn[nvme_ns->type](nvme_ns);
}

void
nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx,
				   struct nvme_ns *nvme_ns, int rc)
{
	struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;

	assert(nvme_ctrlr != NULL);

	if (rc == 0) {
		pthread_mutex_lock(&nvme_ctrlr->mutex);
		nvme_ctrlr->ref++;
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
	} else {
		memset(nvme_ns, 0, sizeof(*nvme_ns));
	}

	if (ctx) {
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
		}
	}
}

static void
nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
			       struct nvme_async_probe_ctx *ctx)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct nvme_bdev *bdev;
	uint32_t i;
	int rc;
	uint64_t num_sectors;
	bool ns_is_active;

	if (ctx) {
		/* Initialize this count to 1 to handle the populate functions
		 * calling nvme_ctrlr_populate_namespace_done() immediately.
		 */
		ctx->populates_in_progress = 1;
	}

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[i];
		ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid);

		if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_NS_STANDARD) {
			/* NS is still there but attributes may have changed */
			ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
			num_sectors = spdk_nvme_ns_get_num_sectors(ns);
			bdev = nvme_ns->bdev;
			assert(bdev != NULL);
			if (bdev->disk.blockcnt != num_sectors) {
				SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
					       nsid,
					       bdev->disk.name,
					       bdev->disk.blockcnt,
					       num_sectors);
				rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
				if (rc != 0) {
					SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
						    bdev->disk.name, rc);
				}
			}
		}

		if (!nvme_ns->populated && ns_is_active) {
			nvme_ns->id = nsid;
			nvme_ns->ctrlr = nvme_ctrlr;
			if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
				nvme_ns->type = NVME_NS_OCSSD;
			} else {
				nvme_ns->type = NVME_NS_STANDARD;
			}

			nvme_ns->bdev = NULL;

			if (ctx) {
				ctx->populates_in_progress++;
			}
			nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns, ctx);
		}

		if (nvme_ns->populated && !ns_is_active) {
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}
	}

	if (ctx) {
		/* Decrement this count now that the loop is over to account
		 * for the one we started with. If the count is then 0, we
		 * know any populate_namespace functions completed immediately,
		 * so we'll kick the callback here.
		 */
		ctx->populates_in_progress--;
		if (ctx->populates_in_progress == 0) {
			nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
		}
	}

}

static void
nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
{
	uint32_t i;
	struct nvme_ns *nvme_ns;

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		uint32_t nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		if (nvme_ns->populated) {
			assert(nvme_ns->id == nsid);
			nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
		}
	}
}

static bool
nvme_ctrlr_acquire(struct nvme_ctrlr *nvme_ctrlr)
{
	pthread_mutex_lock(&nvme_ctrlr->mutex);
	if (nvme_ctrlr->destruct || nvme_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_ctrlr->mutex);
		return false;
	}
	nvme_ctrlr->ref++;
	pthread_mutex_unlock(&nvme_ctrlr->mutex);
	return true;
}

static int
nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
			  void *cb_arg)
{
	struct nvme_ctrlr *nvme_ctrlr = cb_arg;
	struct nvme_ns *nvme_ns;
	uint32_t i, nsid;

	for (i = 0; i < desc->num_of_nsid; i++) {
		nsid = desc->nsid[i];
		if (nsid == 0 || nsid > nvme_ctrlr->num_ns) {
			continue;
		}

		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		assert(nvme_ns != NULL);

		if (!nvme_ns->populated) {
			continue;
		}

		nvme_ns->ana_group_id = desc->ana_group_id;
		nvme_ns->ana_state = desc->ana_state;
	}

	return 0;
}

static void
nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = ctx;

	if (spdk_nvme_cpl_is_success(cpl)) {
		bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
					     nvme_ctrlr);
	}

	nvme_ctrlr_release(nvme_ctrlr);
}

static void
nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
{
	int rc;

	if (nvme_ctrlr->ana_log_page == NULL) {
		return;
	}

	if (!nvme_ctrlr_acquire(nvme_ctrlr)) {
		return;
	}

	rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
					      SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
					      SPDK_NVME_GLOBAL_NS_TAG,
					      nvme_ctrlr->ana_log_page,
					      nvme_ctrlr->ana_log_page_size, 0,
					      nvme_ctrlr_read_ana_log_page_done,
					      nvme_ctrlr);
	if (rc != 0) {
		nvme_ctrlr_release(nvme_ctrlr);
	}
}

static void
aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_ctrlr *nvme_ctrlr = arg;
	union spdk_nvme_async_event_completion event;

	if (spdk_nvme_cpl_is_error(cpl)) {
		SPDK_WARNLOG("AER request execute failed");
		return;
	}

	event.raw = cpl->cdw0;
	if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
	    (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
		nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) &&
		   (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) &&
		   spdk_nvme_ctrlr_is_ocssd_supported(nvme_ctrlr->ctrlr)) {
		bdev_ocssd_handle_chunk_notification(nvme_ctrlr);
	} else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
		   (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
		nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
	}
}
nvme_ctrlr_read_ana_log_page(nvme_ctrlr); 1977 } 1978 } 1979 1980 static void 1981 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1982 { 1983 if (ctx->cb_fn) { 1984 ctx->cb_fn(ctx->cb_ctx, count, rc); 1985 } 1986 1987 ctx->namespaces_populated = true; 1988 if (ctx->probe_done) { 1989 /* The probe was already completed, so we need to free the context 1990 * here. This can happen for cases like OCSSD, where we need to 1991 * send additional commands to the SSD after attach. 1992 */ 1993 free(ctx); 1994 } 1995 } 1996 1997 static void 1998 nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr, 1999 struct nvme_async_probe_ctx *ctx) 2000 { 2001 spdk_io_device_register(nvme_ctrlr, 2002 bdev_nvme_create_ctrlr_channel_cb, 2003 bdev_nvme_destroy_ctrlr_channel_cb, 2004 sizeof(struct nvme_ctrlr_channel), 2005 nvme_ctrlr->name); 2006 2007 nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx); 2008 } 2009 2010 static void 2011 nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl) 2012 { 2013 struct nvme_ctrlr *nvme_ctrlr = _ctx; 2014 struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx; 2015 2016 nvme_ctrlr->probe_ctx = NULL; 2017 2018 if (spdk_nvme_cpl_is_error(cpl)) { 2019 nvme_ctrlr_delete(nvme_ctrlr); 2020 2021 if (ctx != NULL) { 2022 populate_namespaces_cb(ctx, 0, -1); 2023 } 2024 return; 2025 } 2026 2027 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 2028 } 2029 2030 static int 2031 nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr, 2032 struct nvme_async_probe_ctx *ctx) 2033 { 2034 struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; 2035 const struct spdk_nvme_ctrlr_data *cdata; 2036 uint32_t ana_log_page_size; 2037 2038 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2039 2040 ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid * 2041 sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->nn * 2042 sizeof(uint32_t); 2043 2044 nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL, 2045 SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); 2046 if (nvme_ctrlr->ana_log_page == NULL) { 2047 SPDK_ERRLOG("could not allocate ANA log page buffer\n"); 2048 return -ENXIO; 2049 } 2050 2051 /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned. 2052 * Hence copy each descriptor to a temporary area when parsing it. 2053 * 2054 * Allocate a buffer whose size is as large as ANA log page buffer because 2055 * we do not know the size of a descriptor until actually reading it. 
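	 * Unlike the log page itself, this scratch area is only touched by the CPU,
	 * so ordinary heap memory from calloc() is sufficient here.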
2056 */ 2057 nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size); 2058 if (nvme_ctrlr->copied_ana_desc == NULL) { 2059 SPDK_ERRLOG("could not allocate a buffer to parse ANA descriptor\n"); 2060 return -ENOMEM; 2061 } 2062 2063 nvme_ctrlr->ana_log_page_size = ana_log_page_size; 2064 2065 nvme_ctrlr->probe_ctx = ctx; 2066 2067 return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, 2068 SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS, 2069 SPDK_NVME_GLOBAL_NS_TAG, 2070 nvme_ctrlr->ana_log_page, 2071 nvme_ctrlr->ana_log_page_size, 0, 2072 nvme_ctrlr_init_ana_log_page_done, 2073 nvme_ctrlr); 2074 } 2075 2076 static int 2077 nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 2078 const char *name, 2079 const struct spdk_nvme_transport_id *trid, 2080 uint32_t prchk_flags, 2081 struct nvme_async_probe_ctx *ctx) 2082 { 2083 struct nvme_ctrlr *nvme_ctrlr; 2084 struct nvme_ctrlr_trid *trid_entry; 2085 uint32_t i, num_ns; 2086 const struct spdk_nvme_ctrlr_data *cdata; 2087 int rc; 2088 2089 nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); 2090 if (nvme_ctrlr == NULL) { 2091 SPDK_ERRLOG("Failed to allocate device struct\n"); 2092 return -ENOMEM; 2093 } 2094 2095 rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL); 2096 if (rc != 0) { 2097 free(nvme_ctrlr); 2098 return rc; 2099 } 2100 2101 TAILQ_INIT(&nvme_ctrlr->trids); 2102 2103 num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 2104 if (num_ns != 0) { 2105 nvme_ctrlr->namespaces = calloc(num_ns, sizeof(struct nvme_ns *)); 2106 if (!nvme_ctrlr->namespaces) { 2107 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 2108 rc = -ENOMEM; 2109 goto err; 2110 } 2111 2112 for (i = 0; i < num_ns; i++) { 2113 nvme_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_ns)); 2114 if (nvme_ctrlr->namespaces[i] == NULL) { 2115 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 2116 rc = -ENOMEM; 2117 goto err; 2118 } 2119 nvme_ctrlr->num_ns++; 2120 } 2121 2122 assert(num_ns == nvme_ctrlr->num_ns); 2123 } 2124 2125 trid_entry = calloc(1, sizeof(*trid_entry)); 2126 if (trid_entry == NULL) { 2127 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 2128 rc = -ENOMEM; 2129 goto err; 2130 } 2131 2132 trid_entry->trid = *trid; 2133 nvme_ctrlr->connected_trid = &trid_entry->trid; 2134 TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, trid_entry, link); 2135 2136 nvme_ctrlr->thread = spdk_get_thread(); 2137 nvme_ctrlr->ctrlr = ctrlr; 2138 nvme_ctrlr->ref = 1; 2139 nvme_ctrlr->name = strdup(name); 2140 if (nvme_ctrlr->name == NULL) { 2141 rc = -ENOMEM; 2142 goto err; 2143 } 2144 2145 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 2146 rc = bdev_ocssd_init_ctrlr(nvme_ctrlr); 2147 if (spdk_unlikely(rc != 0)) { 2148 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 2149 goto err; 2150 } 2151 } 2152 2153 nvme_ctrlr->prchk_flags = prchk_flags; 2154 2155 nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr, 2156 g_opts.nvme_adminq_poll_period_us); 2157 2158 pthread_mutex_lock(&g_bdev_nvme_mutex); 2159 TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq); 2160 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2161 2162 if (g_opts.timeout_us > 0) { 2163 /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */ 2164 /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */ 2165 uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ? 
2166 g_opts.timeout_us : g_opts.timeout_admin_us; 2167 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 2168 adm_timeout_us, timeout_cb, nvme_ctrlr); 2169 } 2170 2171 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); 2172 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr); 2173 2174 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 2175 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 2176 nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr); 2177 } 2178 2179 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 2180 2181 if (cdata->cmic.ana_reporting) { 2182 rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx); 2183 if (rc == 0) { 2184 return 0; 2185 } 2186 } else { 2187 nvme_ctrlr_create_done(nvme_ctrlr, ctx); 2188 return 0; 2189 } 2190 2191 err: 2192 nvme_ctrlr_delete(nvme_ctrlr); 2193 return rc; 2194 } 2195 2196 static void 2197 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2198 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2199 { 2200 struct nvme_probe_ctx *ctx = cb_ctx; 2201 char *name = NULL; 2202 uint32_t prchk_flags = 0; 2203 size_t i; 2204 2205 if (ctx) { 2206 for (i = 0; i < ctx->count; i++) { 2207 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 2208 prchk_flags = ctx->prchk_flags[i]; 2209 name = strdup(ctx->names[i]); 2210 break; 2211 } 2212 } 2213 } else { 2214 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 2215 } 2216 if (!name) { 2217 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 2218 return; 2219 } 2220 2221 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 2222 2223 nvme_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL); 2224 2225 free(name); 2226 } 2227 2228 static void 2229 _nvme_ctrlr_destruct(void *ctx) 2230 { 2231 struct nvme_ctrlr *nvme_ctrlr = ctx; 2232 2233 nvme_ctrlr_depopulate_namespaces(nvme_ctrlr); 2234 nvme_ctrlr_release(nvme_ctrlr); 2235 } 2236 2237 static int 2238 _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, bool hotplug) 2239 { 2240 struct nvme_probe_skip_entry *entry; 2241 2242 pthread_mutex_lock(&nvme_ctrlr->mutex); 2243 2244 /* The controller's destruction was already started */ 2245 if (nvme_ctrlr->destruct) { 2246 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2247 return 0; 2248 } 2249 2250 if (!hotplug && 2251 nvme_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2252 entry = calloc(1, sizeof(*entry)); 2253 if (!entry) { 2254 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2255 return -ENOMEM; 2256 } 2257 entry->trid = *nvme_ctrlr->connected_trid; 2258 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2259 } 2260 2261 nvme_ctrlr->destruct = true; 2262 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2263 2264 _nvme_ctrlr_destruct(nvme_ctrlr); 2265 2266 return 0; 2267 } 2268 2269 static void 2270 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 2271 { 2272 struct nvme_ctrlr *nvme_ctrlr = cb_ctx; 2273 2274 _bdev_nvme_delete(nvme_ctrlr, true); 2275 } 2276 2277 static int 2278 bdev_nvme_hotplug_probe(void *arg) 2279 { 2280 if (g_hotplug_probe_ctx == NULL) { 2281 spdk_poller_unregister(&g_hotplug_probe_poller); 2282 return SPDK_POLLER_IDLE; 2283 } 2284 2285 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 2286 g_hotplug_probe_ctx = NULL; 2287 spdk_poller_unregister(&g_hotplug_probe_poller); 2288 } 2289 2290 return SPDK_POLLER_BUSY; 2291 } 2292 2293 static int 2294 bdev_nvme_hotplug(void *arg) 2295 { 2296 struct spdk_nvme_transport_id trid_pcie; 2297 2298 if (g_hotplug_probe_ctx) { 
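		/* A previous hotplug probe is still being polled to completion by
		 * bdev_nvme_hotplug_probe(); don't start another one until it is done. */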
2299 return SPDK_POLLER_BUSY; 2300 } 2301 2302 memset(&trid_pcie, 0, sizeof(trid_pcie)); 2303 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 2304 2305 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 2306 hotplug_probe_cb, attach_cb, NULL); 2307 2308 if (g_hotplug_probe_ctx) { 2309 assert(g_hotplug_probe_poller == NULL); 2310 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 2311 } 2312 2313 return SPDK_POLLER_BUSY; 2314 } 2315 2316 void 2317 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 2318 { 2319 *opts = g_opts; 2320 } 2321 2322 static int 2323 bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts) 2324 { 2325 if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) { 2326 /* Can't set timeout_admin_us without also setting timeout_us */ 2327 SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n"); 2328 return -EINVAL; 2329 } 2330 2331 return 0; 2332 } 2333 2334 int 2335 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 2336 { 2337 int ret = bdev_nvme_validate_opts(opts); 2338 if (ret) { 2339 SPDK_WARNLOG("Failed to set nvme opts.\n"); 2340 return ret; 2341 } 2342 2343 if (g_bdev_nvme_init_thread != NULL) { 2344 if (!TAILQ_EMPTY(&g_nvme_ctrlrs)) { 2345 return -EPERM; 2346 } 2347 } 2348 2349 g_opts = *opts; 2350 2351 return 0; 2352 } 2353 2354 struct set_nvme_hotplug_ctx { 2355 uint64_t period_us; 2356 bool enabled; 2357 spdk_msg_fn fn; 2358 void *fn_ctx; 2359 }; 2360 2361 static void 2362 set_nvme_hotplug_period_cb(void *_ctx) 2363 { 2364 struct set_nvme_hotplug_ctx *ctx = _ctx; 2365 2366 spdk_poller_unregister(&g_hotplug_poller); 2367 if (ctx->enabled) { 2368 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 2369 } 2370 2371 g_nvme_hotplug_poll_period_us = ctx->period_us; 2372 g_nvme_hotplug_enabled = ctx->enabled; 2373 if (ctx->fn) { 2374 ctx->fn(ctx->fn_ctx); 2375 } 2376 2377 free(ctx); 2378 } 2379 2380 int 2381 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 2382 { 2383 struct set_nvme_hotplug_ctx *ctx; 2384 2385 if (enabled == true && !spdk_process_is_primary()) { 2386 return -EPERM; 2387 } 2388 2389 ctx = calloc(1, sizeof(*ctx)); 2390 if (ctx == NULL) { 2391 return -ENOMEM; 2392 } 2393 2394 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 2395 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 2396 ctx->enabled = enabled; 2397 ctx->fn = cb; 2398 ctx->fn_ctx = cb_ctx; 2399 2400 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 2401 return 0; 2402 } 2403 2404 static void 2405 nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr, 2406 struct nvme_async_probe_ctx *ctx) 2407 { 2408 struct nvme_ns *nvme_ns; 2409 struct nvme_bdev *nvme_bdev; 2410 uint32_t i, nsid; 2411 size_t j; 2412 2413 assert(nvme_ctrlr != NULL); 2414 2415 /* 2416 * Report the new bdevs that were created in this call. 2417 * There can be more than one bdev per NVMe controller. 
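	 * The caller's names array holds at most ctx->count entries; if more bdevs
	 * were created than fit, the callback is invoked with -ERANGE below.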
 */
	j = 0;
	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		nsid = i + 1;
		nvme_ns = nvme_ctrlr->namespaces[nsid - 1];
		if (!nvme_ns->populated) {
			continue;
		}
		assert(nvme_ns->id == nsid);
		nvme_bdev = nvme_ns->bdev;
		if (nvme_bdev == NULL) {
			assert(nvme_ns->type == NVME_NS_OCSSD);
			continue;
		}
		if (j < ctx->count) {
			ctx->names[j] = nvme_bdev->disk.name;
			j++;
		} else {
			SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %u. Unable to return all names of created bdevs\n",
				    ctx->count);
			populate_namespaces_cb(ctx, 0, -ERANGE);
			return;
		}
	}

	populate_namespaces_cb(ctx, j, 0);
}

static int
bdev_nvme_compare_trids(struct nvme_ctrlr *nvme_ctrlr,
			struct spdk_nvme_ctrlr *new_ctrlr,
			struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr_trid *tmp_trid;

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		SPDK_ERRLOG("PCIe failover is not supported.\n");
		return -ENOTSUP;
	}

	/* Currently we only support failover to the same transport type. */
	if (nvme_ctrlr->connected_trid->trtype != trid->trtype) {
		return -EINVAL;
	}

	/* Currently we only support failover to the same NQN. */
	if (strncmp(trid->subnqn, nvme_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
		return -EINVAL;
	}

	/* Skip all the other checks if we've already registered this path. */
	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
			return -EEXIST;
		}
	}

	return 0;
}

static int
bdev_nvme_compare_namespaces(struct nvme_ctrlr *nvme_ctrlr,
			     struct spdk_nvme_ctrlr *new_ctrlr)
{
	uint32_t i, nsid;
	struct nvme_ns *nvme_ns;
	struct spdk_nvme_ns *new_ns;

	if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_ctrlr->num_ns) {
		return -EINVAL;
	}

	for (i = 0; i < nvme_ctrlr->num_ns; i++) {
		nsid = i + 1;

		nvme_ns = nvme_ctrlr->namespaces[i];
		if (!nvme_ns->populated) {
			continue;
		}

		new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid);
		assert(new_ns != NULL);

		if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
			return -EINVAL;
		}
	}

	return 0;
}

static int
_bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
			      struct spdk_nvme_transport_id *trid)
{
	struct nvme_ctrlr_trid *new_trid, *tmp_trid;

	new_trid = calloc(1, sizeof(*new_trid));
	if (new_trid == NULL) {
		return -ENOMEM;
	}
	new_trid->trid = *trid;
	new_trid->is_failed = false;

	TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
		if (tmp_trid->is_failed) {
			TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
			return 0;
		}
	}

	TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
	return 0;
}

/* A secondary path is being added to an existing nvme_ctrlr for failover.
 * After verifying that it can access the same namespaces as the primary
 * path, the new path is left disconnected until a failover actually occurs.
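 * Only the transport ID is recorded; the temporary controller handle that was
 * used for the comparison is detached again before returning.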
2536 */ 2537 static int 2538 bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2539 struct spdk_nvme_ctrlr *new_ctrlr, 2540 struct spdk_nvme_transport_id *trid) 2541 { 2542 int rc; 2543 2544 assert(nvme_ctrlr != NULL); 2545 2546 pthread_mutex_lock(&nvme_ctrlr->mutex); 2547 2548 rc = bdev_nvme_compare_trids(nvme_ctrlr, new_ctrlr, trid); 2549 if (rc != 0) { 2550 goto exit; 2551 } 2552 2553 rc = bdev_nvme_compare_namespaces(nvme_ctrlr, new_ctrlr); 2554 if (rc != 0) { 2555 goto exit; 2556 } 2557 2558 rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid); 2559 2560 exit: 2561 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2562 2563 spdk_nvme_detach(new_ctrlr); 2564 2565 return rc; 2566 } 2567 2568 static void 2569 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2570 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2571 { 2572 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2573 struct nvme_ctrlr *nvme_ctrlr; 2574 struct nvme_async_probe_ctx *ctx; 2575 int rc; 2576 2577 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2578 ctx->ctrlr_attached = true; 2579 2580 nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name); 2581 if (nvme_ctrlr) { 2582 rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid); 2583 } else { 2584 rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx); 2585 if (rc == 0) { 2586 return; 2587 } 2588 } 2589 2590 populate_namespaces_cb(ctx, 0, rc); 2591 } 2592 2593 static int 2594 bdev_nvme_async_poll(void *arg) 2595 { 2596 struct nvme_async_probe_ctx *ctx = arg; 2597 int rc; 2598 2599 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2600 if (spdk_unlikely(rc != -EAGAIN)) { 2601 ctx->probe_done = true; 2602 spdk_poller_unregister(&ctx->poller); 2603 if (!ctx->ctrlr_attached) { 2604 /* The probe is done, but no controller was attached. 2605 * That means we had a failure, so report -EIO back to 2606 * the caller (usually the RPC). populate_namespaces_cb() 2607 * will take care of freeing the nvme_async_probe_ctx. 2608 */ 2609 populate_namespaces_cb(ctx, 0, -EIO); 2610 } else if (ctx->namespaces_populated) { 2611 /* The namespaces for the attached controller were all 2612 * populated and the response was already sent to the 2613 * caller (usually the RPC). So free the context here. 2614 */ 2615 free(ctx); 2616 } 2617 } 2618 2619 return SPDK_POLLER_BUSY; 2620 } 2621 2622 int 2623 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2624 struct spdk_nvme_host_id *hostid, 2625 const char *base_name, 2626 const char **names, 2627 uint32_t count, 2628 const char *hostnqn, 2629 uint32_t prchk_flags, 2630 spdk_bdev_create_nvme_fn cb_fn, 2631 void *cb_ctx, 2632 struct spdk_nvme_ctrlr_opts *opts) 2633 { 2634 struct nvme_probe_skip_entry *entry, *tmp; 2635 struct nvme_async_probe_ctx *ctx; 2636 2637 /* TODO expand this check to include both the host and target TRIDs. 2638 * Only if both are the same should we fail. 
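	 * For now the lookup below matches on the target TRID alone.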
2639 */ 2640 if (nvme_ctrlr_get(trid) != NULL) { 2641 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2642 return -EEXIST; 2643 } 2644 2645 ctx = calloc(1, sizeof(*ctx)); 2646 if (!ctx) { 2647 return -ENOMEM; 2648 } 2649 ctx->base_name = base_name; 2650 ctx->names = names; 2651 ctx->count = count; 2652 ctx->cb_fn = cb_fn; 2653 ctx->cb_ctx = cb_ctx; 2654 ctx->prchk_flags = prchk_flags; 2655 ctx->trid = *trid; 2656 2657 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2658 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2659 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2660 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2661 free(entry); 2662 break; 2663 } 2664 } 2665 } 2666 2667 if (opts) { 2668 memcpy(&ctx->opts, opts, sizeof(*opts)); 2669 } else { 2670 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2671 } 2672 2673 ctx->opts.transport_retry_count = g_opts.retry_count; 2674 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2675 ctx->opts.disable_read_ana_log_page = true; 2676 2677 if (hostnqn) { 2678 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2679 } 2680 2681 if (hostid->hostaddr[0] != '\0') { 2682 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2683 } 2684 2685 if (hostid->hostsvcid[0] != '\0') { 2686 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2687 } 2688 2689 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2690 if (ctx->probe_ctx == NULL) { 2691 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2692 free(ctx); 2693 return -ENODEV; 2694 } 2695 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2696 2697 return 0; 2698 } 2699 2700 static int 2701 bdev_nvme_delete_secondary_trid(struct nvme_ctrlr *nvme_ctrlr, 2702 const struct spdk_nvme_transport_id *trid) 2703 { 2704 struct nvme_ctrlr_trid *ctrlr_trid, *tmp_trid; 2705 2706 if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) { 2707 return -EBUSY; 2708 } 2709 2710 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_ctrlr->trids, link, tmp_trid) { 2711 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2712 TAILQ_REMOVE(&nvme_ctrlr->trids, ctrlr_trid, link); 2713 free(ctrlr_trid); 2714 return 0; 2715 } 2716 } 2717 2718 return -ENXIO; 2719 } 2720 2721 int 2722 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2723 { 2724 struct nvme_ctrlr *nvme_ctrlr; 2725 struct nvme_ctrlr_trid *ctrlr_trid; 2726 2727 if (name == NULL) { 2728 return -EINVAL; 2729 } 2730 2731 nvme_ctrlr = nvme_ctrlr_get_by_name(name); 2732 if (nvme_ctrlr == NULL) { 2733 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2734 return -ENODEV; 2735 } 2736 2737 /* case 1: remove the controller itself. */ 2738 if (trid == NULL) { 2739 return _bdev_nvme_delete(nvme_ctrlr, false); 2740 } 2741 2742 /* case 2: we are currently using the path to be removed. */ 2743 if (!spdk_nvme_transport_id_compare(trid, nvme_ctrlr->connected_trid)) { 2744 ctrlr_trid = TAILQ_FIRST(&nvme_ctrlr->trids); 2745 assert(nvme_ctrlr->connected_trid == &ctrlr_trid->trid); 2746 /* case 2A: the current path is the only path. */ 2747 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2748 return _bdev_nvme_delete(nvme_ctrlr, false); 2749 } 2750 2751 /* case 2B: there is an alternative path. 
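		 * Fail over to it and drop the current path instead of deleting
		 * the whole controller.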
*/ 2752 return bdev_nvme_failover(nvme_ctrlr, true); 2753 } 2754 2755 /* case 3: We are not using the specified path. */ 2756 return bdev_nvme_delete_secondary_trid(nvme_ctrlr, trid); 2757 } 2758 2759 static int 2760 bdev_nvme_library_init(void) 2761 { 2762 g_bdev_nvme_init_thread = spdk_get_thread(); 2763 2764 spdk_io_device_register(&g_nvme_ctrlrs, bdev_nvme_create_poll_group_cb, 2765 bdev_nvme_destroy_poll_group_cb, 2766 sizeof(struct nvme_poll_group), "nvme_poll_groups"); 2767 2768 return 0; 2769 } 2770 2771 static void 2772 bdev_nvme_library_fini(void) 2773 { 2774 struct nvme_ctrlr *nvme_ctrlr, *tmp; 2775 struct nvme_probe_skip_entry *entry, *entry_tmp; 2776 2777 spdk_poller_unregister(&g_hotplug_poller); 2778 free(g_hotplug_probe_ctx); 2779 g_hotplug_probe_ctx = NULL; 2780 2781 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2782 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2783 free(entry); 2784 } 2785 2786 pthread_mutex_lock(&g_bdev_nvme_mutex); 2787 TAILQ_FOREACH_SAFE(nvme_ctrlr, &g_nvme_ctrlrs, tailq, tmp) { 2788 pthread_mutex_lock(&nvme_ctrlr->mutex); 2789 if (nvme_ctrlr->destruct) { 2790 /* This controller's destruction was already started 2791 * before the application started shutting down 2792 */ 2793 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2794 continue; 2795 } 2796 nvme_ctrlr->destruct = true; 2797 pthread_mutex_unlock(&nvme_ctrlr->mutex); 2798 2799 spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct, 2800 nvme_ctrlr); 2801 } 2802 2803 g_bdev_nvme_module_finish = true; 2804 if (TAILQ_EMPTY(&g_nvme_ctrlrs)) { 2805 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2806 spdk_io_device_unregister(&g_nvme_ctrlrs, NULL); 2807 spdk_bdev_module_finish_done(); 2808 return; 2809 } 2810 2811 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2812 } 2813 2814 static void 2815 bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio) 2816 { 2817 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2818 struct spdk_bdev *bdev = bdev_io->bdev; 2819 struct spdk_dif_ctx dif_ctx; 2820 struct spdk_dif_error err_blk = {}; 2821 int rc; 2822 2823 rc = spdk_dif_ctx_init(&dif_ctx, 2824 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2825 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2826 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2827 if (rc != 0) { 2828 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2829 return; 2830 } 2831 2832 if (bdev->md_interleave) { 2833 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2834 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2835 } else { 2836 struct iovec md_iov = { 2837 .iov_base = bdev_io->u.bdev.md_buf, 2838 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2839 }; 2840 2841 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2842 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2843 } 2844 2845 if (rc != 0) { 2846 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2847 err_blk.err_type, err_blk.err_offset); 2848 } else { 2849 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2850 } 2851 } 2852 2853 static void 2854 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2855 { 2856 struct nvme_bdev_io *bio = ref; 2857 2858 if (spdk_nvme_cpl_is_success(cpl)) { 2859 /* Run PI verification for read data buffer. 
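		 * The retried read without PI checking succeeded, so software DIF/DIX
		 * verification can locate where the original PI error came from.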
*/ 2860 bdev_nvme_verify_pi_error(bio); 2861 } 2862 2863 /* Return original completion status */ 2864 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2865 } 2866 2867 static void 2868 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2869 { 2870 struct nvme_bdev_io *bio = ref; 2871 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2872 struct nvme_bdev_channel *nbdev_ch; 2873 struct spdk_nvme_ns *ns; 2874 struct spdk_nvme_qpair *qpair; 2875 int ret; 2876 2877 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2878 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2879 cpl->status.sct, cpl->status.sc); 2880 2881 /* Save completion status to use after verifying PI error. */ 2882 bio->cpl = *cpl; 2883 2884 nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2885 2886 if (spdk_likely(bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair))) { 2887 /* Read without PI checking to verify PI error. */ 2888 ret = bdev_nvme_no_pi_readv(ns, 2889 qpair, 2890 bio, 2891 bdev_io->u.bdev.iovs, 2892 bdev_io->u.bdev.iovcnt, 2893 bdev_io->u.bdev.md_buf, 2894 bdev_io->u.bdev.num_blocks, 2895 bdev_io->u.bdev.offset_blocks); 2896 if (ret == 0) { 2897 return; 2898 } 2899 } 2900 } 2901 2902 bdev_nvme_io_complete_nvme_status(bio, cpl); 2903 } 2904 2905 static void 2906 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2907 { 2908 struct nvme_bdev_io *bio = ref; 2909 2910 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2911 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2912 cpl->status.sct, cpl->status.sc); 2913 /* Run PI verification for write data buffer if PI error is detected. */ 2914 bdev_nvme_verify_pi_error(bio); 2915 } 2916 2917 bdev_nvme_io_complete_nvme_status(bio, cpl); 2918 } 2919 2920 static void 2921 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2922 { 2923 struct nvme_bdev_io *bio = ref; 2924 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2925 2926 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 2927 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 2928 */ 2929 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 2930 2931 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2932 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 2933 cpl->status.sct, cpl->status.sc); 2934 /* Run PI verification for zone append data buffer if PI error is detected. */ 2935 bdev_nvme_verify_pi_error(bio); 2936 } 2937 2938 bdev_nvme_io_complete_nvme_status(bio, cpl); 2939 } 2940 2941 static void 2942 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2943 { 2944 struct nvme_bdev_io *bio = ref; 2945 2946 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2947 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2948 cpl->status.sct, cpl->status.sc); 2949 /* Run PI verification for compare data buffer if PI error is detected. 
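		 * The NVMe completion status, including the PI error, is still
		 * reported back to the bdev layer unchanged.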
*/ 2950 bdev_nvme_verify_pi_error(bio); 2951 } 2952 2953 bdev_nvme_io_complete_nvme_status(bio, cpl); 2954 } 2955 2956 static void 2957 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2958 { 2959 struct nvme_bdev_io *bio = ref; 2960 2961 /* Compare operation completion */ 2962 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2963 /* Save compare result for write callback */ 2964 bio->cpl = *cpl; 2965 return; 2966 } 2967 2968 /* Write operation completion */ 2969 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2970 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2971 * complete the IO with the compare operation's status. 2972 */ 2973 if (!spdk_nvme_cpl_is_error(cpl)) { 2974 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2975 } 2976 2977 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 2978 } else { 2979 bdev_nvme_io_complete_nvme_status(bio, cpl); 2980 } 2981 } 2982 2983 static void 2984 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2985 { 2986 struct nvme_bdev_io *bio = ref; 2987 2988 bdev_nvme_io_complete_nvme_status(bio, cpl); 2989 } 2990 2991 static int 2992 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 2993 { 2994 switch (desc->zs) { 2995 case SPDK_NVME_ZONE_STATE_EMPTY: 2996 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 2997 break; 2998 case SPDK_NVME_ZONE_STATE_IOPEN: 2999 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 3000 break; 3001 case SPDK_NVME_ZONE_STATE_EOPEN: 3002 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 3003 break; 3004 case SPDK_NVME_ZONE_STATE_CLOSED: 3005 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 3006 break; 3007 case SPDK_NVME_ZONE_STATE_RONLY: 3008 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 3009 break; 3010 case SPDK_NVME_ZONE_STATE_FULL: 3011 info->state = SPDK_BDEV_ZONE_STATE_FULL; 3012 break; 3013 case SPDK_NVME_ZONE_STATE_OFFLINE: 3014 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 3015 break; 3016 default: 3017 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 3018 return -EIO; 3019 } 3020 3021 info->zone_id = desc->zslba; 3022 info->write_pointer = desc->wp; 3023 info->capacity = desc->zcap; 3024 3025 return 0; 3026 } 3027 3028 static void 3029 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 3030 { 3031 struct nvme_bdev_io *bio = ref; 3032 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3033 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 3034 struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch); 3035 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 3036 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 3037 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 3038 uint64_t max_zones_per_buf, i; 3039 uint32_t zone_report_bufsize; 3040 struct spdk_nvme_ns *ns; 3041 struct spdk_nvme_qpair *qpair; 3042 int ret; 3043 3044 if (spdk_nvme_cpl_is_error(cpl)) { 3045 goto out_complete_io_nvme_cpl; 3046 } 3047 3048 if (!bdev_nvme_find_io_path(nbdev_ch, &ns, &qpair)) { 3049 ret = -ENXIO; 3050 goto out_complete_io_ret; 3051 } 3052 3053 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3054 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 3055 sizeof(bio->zone_report_buf->descs[0]); 3056 3057 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 3058 ret = -EINVAL; 3059 goto out_complete_io_ret; 3060 } 3061 3062 if (!bio->zone_report_buf->nr_zones) { 3063 ret = -EINVAL; 3064 goto 
out_complete_io_ret; 3065 } 3066 3067 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 3068 ret = fill_zone_from_report(&info[bio->handled_zones], 3069 &bio->zone_report_buf->descs[i]); 3070 if (ret) { 3071 goto out_complete_io_ret; 3072 } 3073 bio->handled_zones++; 3074 } 3075 3076 if (bio->handled_zones < zones_to_copy) { 3077 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3078 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 3079 3080 memset(bio->zone_report_buf, 0, zone_report_bufsize); 3081 ret = spdk_nvme_zns_report_zones(ns, qpair, 3082 bio->zone_report_buf, zone_report_bufsize, 3083 slba, SPDK_NVME_ZRA_LIST_ALL, true, 3084 bdev_nvme_get_zone_info_done, bio); 3085 if (!ret) { 3086 return; 3087 } else { 3088 goto out_complete_io_ret; 3089 } 3090 } 3091 3092 out_complete_io_nvme_cpl: 3093 free(bio->zone_report_buf); 3094 bio->zone_report_buf = NULL; 3095 bdev_nvme_io_complete_nvme_status(bio, cpl); 3096 return; 3097 3098 out_complete_io_ret: 3099 free(bio->zone_report_buf); 3100 bio->zone_report_buf = NULL; 3101 bdev_nvme_io_complete(bio, ret); 3102 } 3103 3104 static void 3105 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 3106 { 3107 struct nvme_bdev_io *bio = ref; 3108 3109 bdev_nvme_io_complete_nvme_status(bio, cpl); 3110 } 3111 3112 static void 3113 bdev_nvme_admin_passthru_completion(void *ctx) 3114 { 3115 struct nvme_bdev_io *bio = ctx; 3116 3117 bdev_nvme_io_complete_nvme_status(bio, &bio->cpl); 3118 } 3119 3120 static void 3121 bdev_nvme_abort_completion(void *ctx) 3122 { 3123 struct nvme_bdev_io *bio = ctx; 3124 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3125 3126 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 3127 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3128 } else { 3129 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3130 } 3131 } 3132 3133 static void 3134 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 3135 { 3136 struct nvme_bdev_io *bio = ref; 3137 3138 bio->cpl = *cpl; 3139 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 3140 } 3141 3142 static void 3143 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 3144 { 3145 struct nvme_bdev_io *bio = ref; 3146 3147 bio->cpl = *cpl; 3148 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 3149 } 3150 3151 static void 3152 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 3153 { 3154 struct nvme_bdev_io *bio = ref; 3155 struct iovec *iov; 3156 3157 bio->iov_offset = sgl_offset; 3158 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 3159 iov = &bio->iovs[bio->iovpos]; 3160 if (bio->iov_offset < iov->iov_len) { 3161 break; 3162 } 3163 3164 bio->iov_offset -= iov->iov_len; 3165 } 3166 } 3167 3168 static int 3169 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 3170 { 3171 struct nvme_bdev_io *bio = ref; 3172 struct iovec *iov; 3173 3174 assert(bio->iovpos < bio->iovcnt); 3175 3176 iov = &bio->iovs[bio->iovpos]; 3177 3178 *address = iov->iov_base; 3179 *length = iov->iov_len; 3180 3181 if (bio->iov_offset) { 3182 assert(bio->iov_offset <= iov->iov_len); 3183 *address += bio->iov_offset; 3184 *length -= bio->iov_offset; 3185 } 3186 3187 bio->iov_offset += *length; 3188 if (bio->iov_offset == iov->iov_len) { 3189 bio->iovpos++; 3190 bio->iov_offset = 0; 3191 } 3192 3193 return 0; 3194 } 3195 3196 static void 3197 
bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 3198 { 3199 struct nvme_bdev_io *bio = ref; 3200 struct iovec *iov; 3201 3202 bio->fused_iov_offset = sgl_offset; 3203 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 3204 iov = &bio->fused_iovs[bio->fused_iovpos]; 3205 if (bio->fused_iov_offset < iov->iov_len) { 3206 break; 3207 } 3208 3209 bio->fused_iov_offset -= iov->iov_len; 3210 } 3211 } 3212 3213 static int 3214 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 3215 { 3216 struct nvme_bdev_io *bio = ref; 3217 struct iovec *iov; 3218 3219 assert(bio->fused_iovpos < bio->fused_iovcnt); 3220 3221 iov = &bio->fused_iovs[bio->fused_iovpos]; 3222 3223 *address = iov->iov_base; 3224 *length = iov->iov_len; 3225 3226 if (bio->fused_iov_offset) { 3227 assert(bio->fused_iov_offset <= iov->iov_len); 3228 *address += bio->fused_iov_offset; 3229 *length -= bio->fused_iov_offset; 3230 } 3231 3232 bio->fused_iov_offset += *length; 3233 if (bio->fused_iov_offset == iov->iov_len) { 3234 bio->fused_iovpos++; 3235 bio->fused_iov_offset = 0; 3236 } 3237 3238 return 0; 3239 } 3240 3241 static int 3242 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3243 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3244 void *md, uint64_t lba_count, uint64_t lba) 3245 { 3246 int rc; 3247 3248 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 3249 lba_count, lba); 3250 3251 bio->iovs = iov; 3252 bio->iovcnt = iovcnt; 3253 bio->iovpos = 0; 3254 bio->iov_offset = 0; 3255 3256 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3257 bdev_nvme_no_pi_readv_done, bio, 0, 3258 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3259 md, 0, 0); 3260 3261 if (rc != 0 && rc != -ENOMEM) { 3262 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 3263 } 3264 return rc; 3265 } 3266 3267 static int 3268 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3269 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 3270 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 3271 { 3272 int rc; 3273 3274 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3275 lba_count, lba); 3276 3277 bio->iovs = iov; 3278 bio->iovcnt = iovcnt; 3279 bio->iovpos = 0; 3280 bio->iov_offset = 0; 3281 3282 if (iovcnt == 1) { 3283 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 3284 lba_count, 3285 bdev_nvme_readv_done, bio, 3286 flags, 3287 0, 0); 3288 } else { 3289 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 3290 bdev_nvme_readv_done, bio, flags, 3291 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3292 md, 0, 0); 3293 } 3294 3295 if (rc != 0 && rc != -ENOMEM) { 3296 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 3297 } 3298 return rc; 3299 } 3300 3301 static int 3302 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3303 struct nvme_bdev_io *bio, 3304 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 3305 uint32_t flags) 3306 { 3307 int rc; 3308 3309 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3310 lba_count, lba); 3311 3312 bio->iovs = iov; 3313 bio->iovcnt = iovcnt; 3314 bio->iovpos = 0; 3315 bio->iov_offset = 0; 3316 3317 if (iovcnt == 1) { 3318 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 3319 lba_count, 3320 bdev_nvme_writev_done, bio, 3321 flags, 3322 0, 0); 3323 
} else { 3324 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3325 bdev_nvme_writev_done, bio, flags, 3326 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3327 md, 0, 0); 3328 } 3329 3330 if (rc != 0 && rc != -ENOMEM) { 3331 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 3332 } 3333 return rc; 3334 } 3335 3336 static int 3337 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3338 struct nvme_bdev_io *bio, 3339 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba, 3340 uint32_t flags) 3341 { 3342 int rc; 3343 3344 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 3345 lba_count, zslba); 3346 3347 bio->iovs = iov; 3348 bio->iovcnt = iovcnt; 3349 bio->iovpos = 0; 3350 bio->iov_offset = 0; 3351 3352 if (iovcnt == 1) { 3353 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 3354 lba_count, 3355 bdev_nvme_zone_appendv_done, bio, 3356 flags, 3357 0, 0); 3358 } else { 3359 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 3360 bdev_nvme_zone_appendv_done, bio, flags, 3361 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3362 md, 0, 0); 3363 } 3364 3365 if (rc != 0 && rc != -ENOMEM) { 3366 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 3367 } 3368 return rc; 3369 } 3370 3371 static int 3372 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3373 struct nvme_bdev_io *bio, 3374 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 3375 uint32_t flags) 3376 { 3377 int rc; 3378 3379 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3380 lba_count, lba); 3381 3382 bio->iovs = iov; 3383 bio->iovcnt = iovcnt; 3384 bio->iovpos = 0; 3385 bio->iov_offset = 0; 3386 3387 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3388 bdev_nvme_comparev_done, bio, flags, 3389 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3390 md, 0, 0); 3391 3392 if (rc != 0 && rc != -ENOMEM) { 3393 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 3394 } 3395 return rc; 3396 } 3397 3398 static int 3399 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3400 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 3401 struct iovec *write_iov, int write_iovcnt, 3402 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 3403 { 3404 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3405 int rc; 3406 3407 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3408 lba_count, lba); 3409 3410 bio->iovs = cmp_iov; 3411 bio->iovcnt = cmp_iovcnt; 3412 bio->iovpos = 0; 3413 bio->iov_offset = 0; 3414 bio->fused_iovs = write_iov; 3415 bio->fused_iovcnt = write_iovcnt; 3416 bio->fused_iovpos = 0; 3417 bio->fused_iov_offset = 0; 3418 3419 if (bdev_io->num_retries == 0) { 3420 bio->first_fused_submitted = false; 3421 } 3422 3423 if (!bio->first_fused_submitted) { 3424 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3425 memset(&bio->cpl, 0, sizeof(bio->cpl)); 3426 3427 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3428 bdev_nvme_comparev_and_writev_done, bio, flags, 3429 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 3430 if (rc == 0) { 3431 bio->first_fused_submitted = true; 3432 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3433 } else { 3434 if (rc != -ENOMEM) { 3435 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 3436 } 3437 return rc; 3438 } 3439 } 3440 3441 flags 
|= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 3442 3443 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3444 bdev_nvme_comparev_and_writev_done, bio, flags, 3445 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 3446 if (rc != 0 && rc != -ENOMEM) { 3447 SPDK_ERRLOG("write failed: rc = %d\n", rc); 3448 rc = 0; 3449 } 3450 3451 return rc; 3452 } 3453 3454 static int 3455 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3456 struct nvme_bdev_io *bio, 3457 uint64_t offset_blocks, 3458 uint64_t num_blocks) 3459 { 3460 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 3461 struct spdk_nvme_dsm_range *range; 3462 uint64_t offset, remaining; 3463 uint64_t num_ranges_u64; 3464 uint16_t num_ranges; 3465 int rc; 3466 3467 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 3468 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3469 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 3470 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 3471 return -EINVAL; 3472 } 3473 num_ranges = (uint16_t)num_ranges_u64; 3474 3475 offset = offset_blocks; 3476 remaining = num_blocks; 3477 range = &dsm_ranges[0]; 3478 3479 /* Fill max-size ranges until the remaining blocks fit into one range */ 3480 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 3481 range->attributes.raw = 0; 3482 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3483 range->starting_lba = offset; 3484 3485 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3486 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3487 range++; 3488 } 3489 3490 /* Final range describes the remaining blocks */ 3491 range->attributes.raw = 0; 3492 range->length = remaining; 3493 range->starting_lba = offset; 3494 3495 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 3496 SPDK_NVME_DSM_ATTR_DEALLOCATE, 3497 dsm_ranges, num_ranges, 3498 bdev_nvme_queued_done, bio); 3499 3500 return rc; 3501 } 3502 3503 static int 3504 bdev_nvme_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3505 struct nvme_bdev_io *bio, 3506 uint64_t offset_blocks, 3507 uint64_t num_blocks) 3508 { 3509 if (num_blocks > UINT16_MAX + 1) { 3510 SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n"); 3511 return -EINVAL; 3512 } 3513 3514 return spdk_nvme_ns_cmd_write_zeroes(ns, qpair, 3515 offset_blocks, num_blocks, 3516 bdev_nvme_queued_done, bio, 3517 0); 3518 } 3519 3520 static int 3521 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3522 struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 3523 struct spdk_bdev_zone_info *info) 3524 { 3525 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3526 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3527 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 3528 3529 if (zone_id % zone_size != 0) { 3530 return -EINVAL; 3531 } 3532 3533 if (num_zones > total_zones || !num_zones) { 3534 return -EINVAL; 3535 } 3536 3537 assert(!bio->zone_report_buf); 3538 bio->zone_report_buf = calloc(1, zone_report_bufsize); 3539 if (!bio->zone_report_buf) { 3540 return -ENOMEM; 3541 } 3542 3543 bio->handled_zones = 0; 3544 3545 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 3546 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 3547 bdev_nvme_get_zone_info_done, bio); 3548 } 3549 3550 static int 3551 bdev_nvme_zone_management(struct 
spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3552 struct nvme_bdev_io *bio, uint64_t zone_id, 3553 enum spdk_bdev_zone_action action) 3554 { 3555 switch (action) { 3556 case SPDK_BDEV_ZONE_CLOSE: 3557 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 3558 bdev_nvme_zone_management_done, bio); 3559 case SPDK_BDEV_ZONE_FINISH: 3560 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 3561 bdev_nvme_zone_management_done, bio); 3562 case SPDK_BDEV_ZONE_OPEN: 3563 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 3564 bdev_nvme_zone_management_done, bio); 3565 case SPDK_BDEV_ZONE_RESET: 3566 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 3567 bdev_nvme_zone_management_done, bio); 3568 case SPDK_BDEV_ZONE_OFFLINE: 3569 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 3570 bdev_nvme_zone_management_done, bio); 3571 default: 3572 return -EINVAL; 3573 } 3574 } 3575 3576 static int 3577 bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 3578 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3579 { 3580 struct nvme_ctrlr *nvme_ctrlr; 3581 uint32_t max_xfer_size; 3582 3583 if (!bdev_nvme_find_admin_path(nbdev_ch, &nvme_ctrlr)) { 3584 return -EINVAL; 3585 } 3586 3587 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr); 3588 3589 if (nbytes > max_xfer_size) { 3590 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3591 return -EINVAL; 3592 } 3593 3594 bio->orig_thread = spdk_get_thread(); 3595 3596 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, 3597 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 3598 } 3599 3600 static int 3601 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3602 struct nvme_bdev_io *bio, 3603 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3604 { 3605 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3606 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3607 3608 if (nbytes > max_xfer_size) { 3609 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3610 return -EINVAL; 3611 } 3612 3613 /* 3614 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3615 * so fill it out automatically. 3616 */ 3617 cmd->nsid = spdk_nvme_ns_get_id(ns); 3618 3619 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 3620 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 3621 } 3622 3623 static int 3624 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3625 struct nvme_bdev_io *bio, 3626 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 3627 { 3628 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 3629 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3630 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3631 3632 if (nbytes > max_xfer_size) { 3633 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3634 return -EINVAL; 3635 } 3636 3637 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 3638 SPDK_ERRLOG("invalid meta data buffer size\n"); 3639 return -EINVAL; 3640 } 3641 3642 /* 3643 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3644 * so fill it out automatically. 
3645 */ 3646 cmd->nsid = spdk_nvme_ns_get_id(ns); 3647 3648 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 3649 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 3650 } 3651 3652 static int 3653 bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio, 3654 struct nvme_bdev_io *bio_to_abort) 3655 { 3656 struct nvme_ctrlr_channel *ctrlr_ch = nbdev_ch->ctrlr_ch; 3657 int rc; 3658 3659 bio->orig_thread = spdk_get_thread(); 3660 3661 rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr, 3662 ctrlr_ch->qpair, 3663 bio_to_abort, 3664 bdev_nvme_abort_done, bio); 3665 if (rc == -ENOENT) { 3666 /* If no command was found in I/O qpair, the target command may be 3667 * admin command. 3668 */ 3669 rc = spdk_nvme_ctrlr_cmd_abort_ext(ctrlr_ch->ctrlr->ctrlr, 3670 NULL, 3671 bio_to_abort, 3672 bdev_nvme_abort_done, bio); 3673 } 3674 3675 if (rc == -ENOENT) { 3676 /* If no command was found, complete the abort request with failure. */ 3677 bio->cpl.cdw0 |= 1U; 3678 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3679 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3680 3681 bdev_nvme_abort_completion(bio); 3682 3683 rc = 0; 3684 } 3685 3686 return rc; 3687 } 3688 3689 static void 3690 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 3691 struct nvme_ns *nvme_ns) 3692 { 3693 /* nop */ 3694 } 3695 3696 static void 3697 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_ns *nvme_ns) 3698 { 3699 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 3700 } 3701 3702 static void 3703 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 3704 { 3705 const char *action; 3706 3707 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 3708 action = "reset"; 3709 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 3710 action = "abort"; 3711 } else { 3712 action = "none"; 3713 } 3714 3715 spdk_json_write_object_begin(w); 3716 3717 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 3718 3719 spdk_json_write_named_object_begin(w, "params"); 3720 spdk_json_write_named_string(w, "action_on_timeout", action); 3721 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 3722 spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us); 3723 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 3724 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 3725 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 3726 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 3727 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 3728 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 3729 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 3730 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 3731 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3732 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3733 spdk_json_write_object_end(w); 3734 3735 spdk_json_write_object_end(w); 3736 } 3737 3738 static void 3739 nvme_ctrlr_config_json(struct spdk_json_write_ctx *w, 3740 struct nvme_ctrlr *nvme_ctrlr) 3741 { 3742 struct spdk_nvme_transport_id *trid; 3743 3744 trid = nvme_ctrlr->connected_trid; 3745 3746 spdk_json_write_object_begin(w); 3747 3748 
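	/* Emit a bdev_nvme_attach_controller RPC call that recreates this controller,
	 * using its currently connected transport ID, when the saved config is replayed. */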
spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3749 3750 spdk_json_write_named_object_begin(w, "params"); 3751 spdk_json_write_named_string(w, "name", nvme_ctrlr->name); 3752 nvme_bdev_dump_trid_json(trid, w); 3753 spdk_json_write_named_bool(w, "prchk_reftag", 3754 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3755 spdk_json_write_named_bool(w, "prchk_guard", 3756 (nvme_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3757 3758 spdk_json_write_object_end(w); 3759 3760 spdk_json_write_object_end(w); 3761 } 3762 3763 static void 3764 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 3765 { 3766 spdk_json_write_object_begin(w); 3767 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3768 3769 spdk_json_write_named_object_begin(w, "params"); 3770 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3771 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 3772 spdk_json_write_object_end(w); 3773 3774 spdk_json_write_object_end(w); 3775 } 3776 3777 static int 3778 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3779 { 3780 struct nvme_ctrlr *nvme_ctrlr; 3781 uint32_t nsid; 3782 3783 bdev_nvme_opts_config_json(w); 3784 3785 pthread_mutex_lock(&g_bdev_nvme_mutex); 3786 3787 TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { 3788 nvme_ctrlr_config_json(w, nvme_ctrlr); 3789 3790 for (nsid = 0; nsid < nvme_ctrlr->num_ns; ++nsid) { 3791 if (!nvme_ctrlr->namespaces[nsid]->populated) { 3792 continue; 3793 } 3794 3795 nvme_namespace_config_json(w, nvme_ctrlr->namespaces[nsid]); 3796 } 3797 } 3798 3799 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3800 * before enabling hotplug poller. 3801 */ 3802 bdev_nvme_hotplug_config_json(w); 3803 3804 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3805 return 0; 3806 } 3807 3808 struct spdk_nvme_ctrlr * 3809 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3810 { 3811 if (!bdev || bdev->module != &nvme_if) { 3812 return NULL; 3813 } 3814 3815 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3816 } 3817 3818 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3819