/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"

#include "bdev_nvme.h"
#include "bdev_ocssd.h"

#include "spdk/accel_engine.h"
#include "spdk/config.h"
#include "spdk/endian.h"
#include "spdk/bdev.h"
#include "spdk/json.h"
#include "spdk/nvme.h"
#include "spdk/nvme_ocssd.h"
#include "spdk/nvme_zns.h"
#include "spdk/thread.h"
#include "spdk/string.h"
#include "spdk/util.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"

#define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
#define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS	(10000)

static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);

struct nvme_bdev_io {
	/** array of iovecs to transfer. */
	struct iovec *iovs;

	/** Number of iovecs in iovs array. */
	int iovcnt;

	/** Current iovec position. */
	int iovpos;

	/** Offset in current iovec. */
	uint32_t iov_offset;

	/** Array of iovecs for the second command of a fused operation. */
	struct iovec *fused_iovs;

	/** Number of iovecs in fused_iovs array. */
	int fused_iovcnt;

	/** Current iovec position. */
	int fused_iovpos;

	/** Offset in current iovec. */
	uint32_t fused_iov_offset;

	/** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
	struct spdk_nvme_cpl cpl;

	/** Originating thread */
	struct spdk_thread *orig_thread;

	/** Keeps track if first of fused commands was submitted */
	bool first_fused_submitted;

	/** Temporary pointer to zone report buffer */
	struct spdk_nvme_zns_zone_report *zone_report_buf;

	/** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
	uint64_t handled_zones;
};

struct nvme_probe_ctx {
	size_t count;
	struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS];
	struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS];
	const char *names[NVME_MAX_CONTROLLERS];
	uint32_t prchk_flags[NVME_MAX_CONTROLLERS];
	const char *hostnqn;
};

struct nvme_probe_skip_entry {
	struct spdk_nvme_transport_id trid;
	TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
};
/* All the controllers deleted by users via RPC are skipped by hotplug monitor */
static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
			g_skipped_nvme_ctrlrs);

static struct spdk_bdev_nvme_opts g_opts = {
	.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
	.timeout_us = 0,
	.keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
	.retry_count = 4,
	.arbitration_burst = 0,
	.low_priority_weight = 0,
	.medium_priority_weight = 0,
	.high_priority_weight = 0,
	.nvme_adminq_poll_period_us = 10000ULL,
	.nvme_ioq_poll_period_us = 0,
	.io_queue_requests = 0,
	.delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
};

#define NVME_HOTPLUG_POLL_PERIOD_MAX		10000000ULL
#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT	100000ULL

static int g_hot_insert_nvme_controller_index = 0;
static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
static bool g_nvme_hotplug_enabled = false;
static struct spdk_thread *g_bdev_nvme_init_thread;
static struct spdk_poller *g_hotplug_poller;
static struct spdk_poller *g_hotplug_probe_poller;
static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;

static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_async_probe_ctx *ctx);
static int bdev_nvme_library_init(void);
static void bdev_nvme_library_fini(void);
static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba);
static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count,
		uint64_t zslba, uint32_t flags);
static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns,
		struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
		int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		uint32_t flags);
static int bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
		struct spdk_bdev_zone_info *info);
static int bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t zone_id,
		enum spdk_bdev_zone_action action);
static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len);
static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch,
		struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio);
static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove);
static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);

typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);
static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr,
		struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx);

static populate_namespace_fn g_populate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_populate_standard_namespace,
	bdev_ocssd_populate_namespace,
};

typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns);
static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns);

static depopulate_namespace_fn g_depopulate_namespace_fn[] = {
	NULL,
	nvme_ctrlr_depopulate_standard_namespace,
	bdev_ocssd_depopulate_namespace,
};

typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns);
static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w,
		struct nvme_bdev_ns *nvme_ns);

static config_json_namespace_fn g_config_json_namespace_fn[] = {
	NULL,
	nvme_ctrlr_config_json_standard_namespace,
	bdev_ocssd_namespace_config_json,
};

struct spdk_nvme_qpair *
bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
{
	struct nvme_io_channel *nvme_ch;

	assert(ctrlr_io_ch != NULL);

	nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);

	return nvme_ch->qpair;
}

static int
bdev_nvme_get_ctx_size(void)
{
	return sizeof(struct nvme_bdev_io);
}

static struct spdk_bdev_module nvme_if = {
	.name = "nvme",
	.async_fini = true,
	.module_init = bdev_nvme_library_init,
	.module_fini = bdev_nvme_library_fini,
	.config_json = bdev_nvme_config_json,
	.get_ctx_size = bdev_nvme_get_ctx_size,

};
SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)

static void
bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair);
	/*
	 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will
	 * reconnect a qpair and we will stop getting a callback for this one.
	 */
	rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair);
	if (rc != 0) {
		SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc);
	}
}

static int
bdev_nvme_poll(void *arg)
{
	struct nvme_bdev_poll_group *group = arg;
	int64_t num_completions;

	if (group->collect_spin_stat && group->start_ticks == 0) {
		group->start_ticks = spdk_get_ticks();
	}

	num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
			  bdev_nvme_disconnected_qpair_cb);
	if (group->collect_spin_stat) {
		if (num_completions > 0) {
			if (group->end_ticks != 0) {
				group->spin_ticks += (group->end_ticks - group->start_ticks);
				group->end_ticks = 0;
			}
			group->start_ticks = 0;
		} else {
			group->end_ticks = spdk_get_ticks();
		}
	}

	return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}

static int
bdev_nvme_poll_adminq(void *arg)
{
	int32_t rc;
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg;

	assert(nvme_bdev_ctrlr != NULL);

	rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr);
	if (rc < 0) {
		bdev_nvme_failover(nvme_bdev_ctrlr, false);
	}

	return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
}

static int
bdev_nvme_destruct(void *ctx)
{
	struct nvme_bdev *nvme_disk = ctx;
	struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns;

	pthread_mutex_lock(&nvme_ns->ctrlr->mutex);

	nvme_ns->bdev = NULL;

	if (!nvme_ns->populated) {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);

		nvme_bdev_ctrlr_destruct(nvme_ns->ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
	}

	free(nvme_disk->disk.name);
	free(nvme_disk);

	return 0;
}

static int
bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes)
{
	spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS);

	return 0;
}

static int
bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch)
{
	struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr;
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_qpair *qpair;
	int rc;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	opts.delay_cmd_submit = g_opts.delay_cmd_submit;
	opts.create_only = true;
	opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
	g_opts.io_queue_requests = opts.io_queue_requests;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
	if (qpair == NULL) {
		return -1;
	}

	assert(nvme_ch->group != NULL);

	rc = spdk_nvme_poll_group_add(nvme_ch->group->group, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n");
		goto err;
	}

	rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair);
	if (rc != 0) {
		SPDK_ERRLOG("Unable to connect I/O qpair.\n");
		goto err;
	}

	nvme_ch->qpair = qpair;

	return 0;

err:
	spdk_nvme_ctrlr_free_io_qpair(qpair);

	return rc;
}

static int
bdev_nvme_destroy_qpair(struct nvme_io_channel *nvme_ch)
{
	int rc;

	if (nvme_ch->qpair == NULL) {
		return 0;
	}

	rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair);
	if (!rc) {
		nvme_ch->qpair = NULL;
	}
	return rc;
}

static void
_bdev_nvme_check_pending_destruct(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct_after_reset) {
		assert(nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct);
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

		spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_unregister,
				     nvme_bdev_ctrlr);
	} else {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
	}
}

static void
_bdev_nvme_complete_pending_resets(struct nvme_io_channel *nvme_ch,
				   enum spdk_bdev_io_status status)
{
	struct spdk_bdev_io *bdev_io;

	while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) {
		bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets);
		TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link);
		spdk_bdev_io_complete(bdev_io, status);
	}
}

static void
bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(nvme_ch, SPDK_BDEV_IO_STATUS_SUCCESS);

	spdk_for_each_channel_continue(i, 0);
}

static void
bdev_nvme_abort_pending_resets(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);

	_bdev_nvme_complete_pending_resets(nvme_ch, SPDK_BDEV_IO_STATUS_FAILED);

	spdk_for_each_channel_continue(i, 0);
}

static void
_bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc)
{
	struct nvme_bdev_ctrlr_trid *curr_trid;
	struct nvme_bdev_io *bio = nvme_bdev_ctrlr->reset_bio;
	enum spdk_bdev_io_status io_status = SPDK_BDEV_IO_STATUS_SUCCESS;

	nvme_bdev_ctrlr->reset_bio = NULL;

	if (rc) {
		SPDK_ERRLOG("Resetting controller failed.\n");
		io_status = SPDK_BDEV_IO_STATUS_FAILED;
	} else {
		SPDK_NOTICELOG("Resetting controller successful.\n");
	}

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	nvme_bdev_ctrlr->resetting = false;
	nvme_bdev_ctrlr->failover_in_progress = false;

	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
	assert(curr_trid != NULL);
	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);

	curr_trid->is_failed = rc != 0 ? true : false;

	if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) {
		/* Destruct ctrlr after clearing pending resets. */
		nvme_bdev_ctrlr->destruct_after_reset = true;
	}

	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

	if (bio) {
		spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), io_status);
	}

	/* Make sure we clear any pending resets before returning. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      rc == 0 ? bdev_nvme_complete_pending_resets :
			      bdev_nvme_abort_pending_resets,
			      nvme_bdev_ctrlr,
			      _bdev_nvme_check_pending_destruct);
}

static void
_bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);

	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, status);
}

static void
_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch);
	int rc;

	rc = bdev_nvme_create_qpair(nvme_ch);

	spdk_for_each_channel_continue(i, rc);
}

static void
_bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_ctx(i);
	int rc;

	if (status) {
		rc = status;
		goto err;
	}

	rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr);
	if (rc != 0) {
		goto err;
	}

	/* Recreate all of the I/O queue pairs */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_create_qpair,
			      nvme_bdev_ctrlr,
			      _bdev_nvme_reset_create_qpairs_done);
	return;

err:
	_bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc);
}

static void
_bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	int rc;

	rc = bdev_nvme_destroy_qpair(nvme_ch);

	spdk_for_each_channel_continue(i, rc);
}

static int
_bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr)
{
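	/* Reset sequence: mark the controller as resetting, destroy the I/O qpair on
	 * every channel, reset the controller itself, recreate the qpairs, and finally
	 * complete or abort any resets that were queued while this one was in progress.
	 */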
	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		return -EBUSY;
	}

	if (nvme_bdev_ctrlr->resetting) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return -EAGAIN;
	}

	nvme_bdev_ctrlr->resetting = true;
	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);

	/* First, delete all NVMe I/O queue pairs. */
	spdk_for_each_channel(nvme_bdev_ctrlr,
			      _bdev_nvme_reset_destroy_qpair,
			      nvme_bdev_ctrlr,
			      _bdev_nvme_reset_ctrlr);

	return 0;
}

static int
bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int rc;

	rc = _bdev_nvme_reset(nvme_ch->ctrlr);
	if (rc == 0) {
		assert(nvme_ch->ctrlr->reset_bio == NULL);
		nvme_ch->ctrlr->reset_bio = bio;
	} else if (rc == -EBUSY) {
		/* Don't bother resetting if the controller is in the process of being destructed. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	} else if (rc == -EAGAIN) {
		/*
		 * Reset call is queued only if it is from the app framework. This is on purpose so that
		 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
		 * upper level. If they are in the middle of a reset, we won't try to schedule another one.
		 */
		TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link);
	} else {
		return rc;
	}

	return 0;
}

static int
_bdev_nvme_failover_start(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
{
	struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL;
	int rc;

	pthread_mutex_lock(&nvme_bdev_ctrlr->mutex);
	if (nvme_bdev_ctrlr->destruct) {
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		/* Don't bother resetting if the controller is in the process of being destructed. */
		return -EBUSY;
	}

	curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids);
	assert(curr_trid);
	assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid);
	next_trid = TAILQ_NEXT(curr_trid, link);

	if (nvme_bdev_ctrlr->resetting) {
		if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) {
			rc = -EAGAIN;
		} else {
			rc = -EBUSY;
		}
		pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
		SPDK_NOTICELOG("Unable to perform reset, already in progress.\n");
		return rc;
	}

	nvme_bdev_ctrlr->resetting = true;
	curr_trid->is_failed = true;

	if (next_trid) {
		assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);

		SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr,
			       curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid);

		nvme_bdev_ctrlr->failover_in_progress = true;
		spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr);
		nvme_bdev_ctrlr->connected_trid = &next_trid->trid;
		rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid);
		assert(rc == 0);
		TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link);
		if (!remove) {
			/** Shuffle the old trid to the end of the list and use the new one.
			 * Allows for round robin through multiple connections.
			 */
			TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link);
		} else {
			free(curr_trid);
		}
	}

	pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex);
	return 0;
}

static int
bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove)
{
	int rc;

	rc = _bdev_nvme_failover_start(nvme_bdev_ctrlr, remove);
	if (rc == 0) {
		/* First, delete all NVMe I/O queue pairs. */
		spdk_for_each_channel(nvme_bdev_ctrlr,
				      _bdev_nvme_reset_destroy_qpair,
				      nvme_bdev_ctrlr,
				      _bdev_nvme_reset_ctrlr);
	} else if (rc != -EBUSY) {
		return rc;
	}

	return 0;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks);

static void
bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
		     bool success)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	struct nvme_bdev_ns *nvme_ns;
	struct spdk_nvme_qpair *qpair;
	int ret;

	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		return;
	}

	ret = bdev_nvme_readv(nvme_ns->ns,
			      qpair,
			      (struct nvme_bdev_io *)bdev_io->driver_ctx,
			      bdev_io->u.bdev.iovs,
			      bdev_io->u.bdev.iovcnt,
			      bdev_io->u.bdev.md_buf,
			      bdev_io->u.bdev.num_blocks,
			      bdev_io->u.bdev.offset_blocks,
			      bdev->dif_check_flags);

	if (spdk_likely(ret == 0)) {
		return;
	} else if (ret == -ENOMEM) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
	} else {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	}
}

static int
_bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev *bdev = bdev_io->bdev;
	struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt;
	struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
	struct nvme_bdev_io *nbdev_io_to_abort;
	struct nvme_bdev_ns *nvme_ns;
	struct spdk_nvme_qpair *qpair;

	if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) {
		return -1;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
			return bdev_nvme_readv(nvme_ns->ns,
					       qpair,
					       nbdev_io,
					       bdev_io->u.bdev.iovs,
					       bdev_io->u.bdev.iovcnt,
					       bdev_io->u.bdev.md_buf,
					       bdev_io->u.bdev.num_blocks,
					       bdev_io->u.bdev.offset_blocks,
					       bdev->dif_check_flags);
		} else {
			spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
					     bdev_io->u.bdev.num_blocks * bdev->blocklen);
			return 0;
		}

	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_nvme_writev(nvme_ns->ns,
					qpair,
					nbdev_io,
					bdev_io->u.bdev.iovs,
					bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.md_buf,
					bdev_io->u.bdev.num_blocks,
					bdev_io->u.bdev.offset_blocks,
					bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return bdev_nvme_comparev(nvme_ns->ns,
					  qpair,
					  nbdev_io,
					  bdev_io->u.bdev.iovs,
					  bdev_io->u.bdev.iovcnt,
					  bdev_io->u.bdev.md_buf,
					  bdev_io->u.bdev.num_blocks,
					  bdev_io->u.bdev.offset_blocks,
					  bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		return bdev_nvme_comparev_and_writev(nvme_ns->ns,
						     qpair,
						     nbdev_io,
						     bdev_io->u.bdev.iovs,
						     bdev_io->u.bdev.iovcnt,
						     bdev_io->u.bdev.fused_iovs,
						     bdev_io->u.bdev.fused_iovcnt,
						     bdev_io->u.bdev.md_buf,
						     bdev_io->u.bdev.num_blocks,
						     bdev_io->u.bdev.offset_blocks,
						     bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		return bdev_nvme_unmap(nvme_ns->ns,
				       qpair,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_UNMAP:
		return bdev_nvme_unmap(nvme_ns->ns,
				       qpair,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_RESET:
		return bdev_nvme_reset(nvme_ch, nbdev_io);

	case SPDK_BDEV_IO_TYPE_FLUSH:
		return bdev_nvme_flush(nvme_ns->ns,
				       qpair,
				       nbdev_io,
				       bdev_io->u.bdev.offset_blocks,
				       bdev_io->u.bdev.num_blocks);

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return bdev_nvme_zone_appendv(nvme_ns->ns,
					      qpair,
					      nbdev_io,
					      bdev_io->u.bdev.iovs,
					      bdev_io->u.bdev.iovcnt,
					      bdev_io->u.bdev.md_buf,
					      bdev_io->u.bdev.num_blocks,
					      bdev_io->u.bdev.offset_blocks,
					      bdev->dif_check_flags);

	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
		return bdev_nvme_get_zone_info(nvme_ns->ns,
					       qpair,
					       nbdev_io,
					       bdev_io->u.zone_mgmt.zone_id,
					       bdev_io->u.zone_mgmt.num_zones,
					       bdev_io->u.zone_mgmt.buf);

	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		return bdev_nvme_zone_management(nvme_ns->ns,
						 qpair,
						 nbdev_io,
						 bdev_io->u.zone_mgmt.zone_id,
						 bdev_io->u.zone_mgmt.zone_action);

	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
		return bdev_nvme_admin_passthru(nvme_ch,
						nbdev_io,
						&bdev_io->u.nvme_passthru.cmd,
						bdev_io->u.nvme_passthru.buf,
						bdev_io->u.nvme_passthru.nbytes);

	case SPDK_BDEV_IO_TYPE_NVME_IO:
		return bdev_nvme_io_passthru(nvme_ns->ns,
					     qpair,
					     nbdev_io,
					     &bdev_io->u.nvme_passthru.cmd,
					     bdev_io->u.nvme_passthru.buf,
					     bdev_io->u.nvme_passthru.nbytes);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_nvme_io_passthru_md(nvme_ns->ns,
						qpair,
						nbdev_io,
						&bdev_io->u.nvme_passthru.cmd,
						bdev_io->u.nvme_passthru.buf,
						bdev_io->u.nvme_passthru.nbytes,
						bdev_io->u.nvme_passthru.md_buf,
						bdev_io->u.nvme_passthru.md_len);

	case SPDK_BDEV_IO_TYPE_ABORT:
		nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
		return bdev_nvme_abort(nvme_ch,
				       nbdev_io,
				       nbdev_io_to_abort);

	default:
		return -EINVAL;
	}
	return 0;
}

static void
bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	int rc = _bdev_nvme_submit_request(ch, bdev_io);

	if (spdk_unlikely(rc != 0)) {
		if (rc == -ENOMEM) {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		} else {
			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		}
	}
}

static bool
bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
	struct nvme_bdev *nbdev = ctx;
	struct nvme_bdev_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;

	nvme_ns = nvme_bdev_to_bdev_ns(nbdev);
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	switch (io_type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_RESET:
	case SPDK_BDEV_IO_TYPE_FLUSH:
	case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_ABORT:
		return true;

	case SPDK_BDEV_IO_TYPE_COMPARE:
		return spdk_nvme_ns_supports_compare(ns);

	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return spdk_nvme_ns_get_md_size(ns) ? true : false;

	case SPDK_BDEV_IO_TYPE_UNMAP:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		return cdata->oncs.dsm;

	case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
		cdata = spdk_nvme_ctrlr_get_data(ctrlr);
		/*
		 * If an NVMe controller guarantees reading unallocated blocks returns zero,
		 * we can implement WRITE_ZEROES as an NVMe deallocate command.
		 */
		if (cdata->oncs.dsm &&
		    spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) ==
		    SPDK_NVME_DEALLOC_READ_00) {
			return true;
		}
		/*
		 * The NVMe controller write_zeroes function is currently not used by our driver.
		 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail.
		 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read.
		 */
		return false;

	case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
		if (spdk_nvme_ctrlr_get_flags(ctrlr) &
		    SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
			return true;
		}
		return false;

	case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
	case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;

	case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
		return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
		       spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;

	default:
		return false;
	}
}

static int
bdev_nvme_create_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device;
	struct nvme_io_channel *nvme_ch = ctx_buf;
	struct spdk_io_channel *pg_ch = NULL;
	int rc;

	if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) {
		rc = bdev_ocssd_create_io_channel(nvme_ch);
		if (rc != 0) {
			return rc;
		}
	}

	pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
	if (!pg_ch) {
		rc = -1;
		goto err_pg_ch;
	}

	nvme_ch->group = spdk_io_channel_get_ctx(pg_ch);

#ifdef SPDK_CONFIG_VTUNE
	nvme_ch->group->collect_spin_stat = true;
#else
	nvme_ch->group->collect_spin_stat = false;
#endif

	TAILQ_INIT(&nvme_ch->pending_resets);

	nvme_ch->ctrlr = nvme_bdev_ctrlr;

	rc = bdev_nvme_create_qpair(nvme_ch);
	if (rc != 0) {
		goto err_qpair;
	}

	return 0;

err_qpair:
	spdk_put_io_channel(pg_ch);
err_pg_ch:
	if (nvme_ch->ocssd_ch) {
		bdev_ocssd_destroy_io_channel(nvme_ch);
	}

	return rc;
}

static void
bdev_nvme_destroy_cb(void *io_device, void *ctx_buf)
{
	struct nvme_io_channel *nvme_ch = ctx_buf;

	assert(nvme_ch->group != NULL);

	if (nvme_ch->ocssd_ch != NULL) {
		bdev_ocssd_destroy_io_channel(nvme_ch);
	}

	bdev_nvme_destroy_qpair(nvme_ch);

	spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group));
}

static void
bdev_nvme_poll_group_submit_accel_crc32c(void *ctx, uint32_t *dst, struct iovec *iov,
		uint32_t iov_cnt, uint32_t seed,
		spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
{
	struct nvme_bdev_poll_group *group = ctx;
	int rc;

	assert(group->accel_channel != NULL);
	assert(cb_fn != NULL);

	rc = spdk_accel_submit_crc32cv(group->accel_channel, dst, iov, iov_cnt, seed, cb_fn, cb_arg);
	if (rc) {
		/* For the two cases, spdk_accel_submit_crc32cv does not call the user's cb_fn */
		if (rc == -ENOMEM || rc == -EINVAL) {
			cb_fn(cb_arg, rc);
		}
		SPDK_ERRLOG("Cannot complete the accelerated crc32c operation with iov=%p\n", iov);
	}
}

static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
	.table_size		= sizeof(struct spdk_nvme_accel_fn_table),
	.submit_accel_crc32c	= bdev_nvme_poll_group_submit_accel_crc32c,
};

static int
bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_poll_group *group = ctx_buf;

	group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
	if (group->group == NULL) {
		return -1;
	}

	group->accel_channel = spdk_accel_engine_get_io_channel();
	if (!group->accel_channel) {
		spdk_nvme_poll_group_destroy(group->group);
		SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
			    group);
		return -1;
	}

	group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us);

	if (group->poller == NULL) {
		spdk_put_io_channel(group->accel_channel);
		spdk_nvme_poll_group_destroy(group->group);
		return -1;
	}

	return 0;
}

static void
bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_poll_group *group = ctx_buf;

	if (group->accel_channel) {
		spdk_put_io_channel(group->accel_channel);
	}

	spdk_poller_unregister(&group->poller);
	if (spdk_nvme_poll_group_destroy(group->group)) {
		SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.");
		assert(false);
	}
}

static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr);
}

static void *
bdev_nvme_get_module_ctx(void *ctx)
{
	struct nvme_bdev *nvme_bdev = ctx;

	return bdev_nvme_get_ctrlr(&nvme_bdev->disk);
}

static int
bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct nvme_bdev *nvme_bdev = ctx;
	struct nvme_bdev_ns *nvme_ns;
	struct spdk_nvme_ns *ns;
	struct spdk_nvme_ctrlr *ctrlr;
	const struct spdk_nvme_ctrlr_data *cdata;
	const struct spdk_nvme_transport_id *trid;
	union spdk_nvme_vs_register vs;
	union spdk_nvme_csts_register csts;
	char buf[128];

	nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev);
	assert(nvme_ns != NULL);
	ns = nvme_ns->ns;
	ctrlr = spdk_nvme_ns_get_ctrlr(ns);

	cdata = spdk_nvme_ctrlr_get_data(ctrlr);
	trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
	vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
	csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);

	spdk_json_write_named_object_begin(w, "nvme");

	if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
		spdk_json_write_named_string(w, "pci_address", trid->traddr);
	}

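	/* Dump the transport ID of the currently connected path as a nested object. */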
spdk_json_write_named_object_begin(w, "trid"); 1162 1163 nvme_bdev_dump_trid_json(trid, w); 1164 1165 spdk_json_write_object_end(w); 1166 1167 #ifdef SPDK_CONFIG_NVME_CUSE 1168 size_t cuse_name_size = 128; 1169 char cuse_name[cuse_name_size]; 1170 1171 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1172 cuse_name, &cuse_name_size); 1173 if (rc == 0) { 1174 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1175 } 1176 #endif 1177 1178 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1179 1180 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1181 1182 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1183 spdk_str_trim(buf); 1184 spdk_json_write_named_string(w, "model_number", buf); 1185 1186 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1187 spdk_str_trim(buf); 1188 spdk_json_write_named_string(w, "serial_number", buf); 1189 1190 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1191 spdk_str_trim(buf); 1192 spdk_json_write_named_string(w, "firmware_revision", buf); 1193 1194 if (cdata->subnqn[0] != '\0') { 1195 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1196 } 1197 1198 spdk_json_write_named_object_begin(w, "oacs"); 1199 1200 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1201 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1202 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1203 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 1204 1205 spdk_json_write_object_end(w); 1206 1207 spdk_json_write_object_end(w); 1208 1209 spdk_json_write_named_object_begin(w, "vs"); 1210 1211 spdk_json_write_name(w, "nvme_version"); 1212 if (vs.bits.ter) { 1213 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1214 } else { 1215 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1216 } 1217 1218 spdk_json_write_object_end(w); 1219 1220 spdk_json_write_named_object_begin(w, "csts"); 1221 1222 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1223 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1224 1225 spdk_json_write_object_end(w); 1226 1227 spdk_json_write_named_object_begin(w, "ns_data"); 1228 1229 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1230 1231 spdk_json_write_object_end(w); 1232 1233 if (cdata->oacs.security) { 1234 spdk_json_write_named_object_begin(w, "security"); 1235 1236 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1237 1238 spdk_json_write_object_end(w); 1239 } 1240 1241 spdk_json_write_object_end(w); 1242 1243 return 0; 1244 } 1245 1246 static void 1247 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1248 { 1249 /* No config per bdev needed */ 1250 } 1251 1252 static uint64_t 1253 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1254 { 1255 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 1256 struct nvme_bdev_poll_group *group = nvme_ch->group; 1257 uint64_t spin_time; 1258 1259 if (!group || !group->collect_spin_stat) { 1260 return 0; 1261 } 1262 1263 if (group->end_ticks != 0) { 1264 group->spin_ticks += (group->end_ticks - group->start_ticks); 1265 group->end_ticks = 0; 1266 } 1267 1268 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1269 group->start_ticks = 0; 1270 group->spin_ticks = 0; 1271 1272 return spin_time; 1273 } 1274 1275 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1276 .destruct = bdev_nvme_destruct, 1277 .submit_request = 
bdev_nvme_submit_request, 1278 .io_type_supported = bdev_nvme_io_type_supported, 1279 .get_io_channel = bdev_nvme_get_io_channel, 1280 .dump_info_json = bdev_nvme_dump_info_json, 1281 .write_config_json = bdev_nvme_write_config_json, 1282 .get_spin_time = bdev_nvme_get_spin_time, 1283 .get_module_ctx = bdev_nvme_get_module_ctx, 1284 }; 1285 1286 static int 1287 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1288 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1289 uint32_t prchk_flags, void *ctx) 1290 { 1291 const struct spdk_uuid *uuid; 1292 const struct spdk_nvme_ctrlr_data *cdata; 1293 const struct spdk_nvme_ns_data *nsdata; 1294 int rc; 1295 enum spdk_nvme_csi csi; 1296 1297 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1298 csi = spdk_nvme_ns_get_csi(ns); 1299 1300 switch (csi) { 1301 case SPDK_NVME_CSI_NVM: 1302 disk->product_name = "NVMe disk"; 1303 break; 1304 case SPDK_NVME_CSI_ZNS: 1305 disk->product_name = "NVMe ZNS disk"; 1306 disk->zoned = true; 1307 disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 1308 disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) / 1309 spdk_nvme_ns_get_extended_sector_size(ns); 1310 disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns); 1311 disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns); 1312 break; 1313 default: 1314 SPDK_ERRLOG("unsupported CSI: %u\n", csi); 1315 return -ENOTSUP; 1316 } 1317 1318 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1319 if (!disk->name) { 1320 return -ENOMEM; 1321 } 1322 1323 disk->write_cache = 0; 1324 if (cdata->vwc.present) { 1325 /* Enable if the Volatile Write Cache exists */ 1326 disk->write_cache = 1; 1327 } 1328 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1329 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1330 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1331 1332 uuid = spdk_nvme_ns_get_uuid(ns); 1333 if (uuid != NULL) { 1334 disk->uuid = *uuid; 1335 } 1336 1337 nsdata = spdk_nvme_ns_get_data(ns); 1338 1339 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1340 if (disk->md_len != 0) { 1341 disk->md_interleave = nsdata->flbas.extended; 1342 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1343 if (disk->dif_type != SPDK_DIF_DISABLE) { 1344 disk->dif_is_head_of_md = nsdata->dps.md_start; 1345 disk->dif_check_flags = prchk_flags; 1346 } 1347 } 1348 1349 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1350 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1351 disk->acwu = 0; 1352 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1353 disk->acwu = nsdata->nacwu; 1354 } else { 1355 disk->acwu = cdata->acwu; 1356 } 1357 1358 disk->ctxt = ctx; 1359 disk->fn_table = &nvmelib_fn_table; 1360 disk->module = &nvme_if; 1361 rc = spdk_bdev_register(disk); 1362 if (rc) { 1363 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1364 free(disk->name); 1365 return rc; 1366 } 1367 1368 return 0; 1369 } 1370 1371 static int 1372 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1373 { 1374 struct nvme_bdev *bdev; 1375 int rc; 1376 1377 bdev = calloc(1, sizeof(*bdev)); 1378 if (!bdev) { 1379 SPDK_ERRLOG("bdev calloc() failed\n"); 1380 return -ENOMEM; 1381 } 1382 1383 bdev->nvme_ns = nvme_ns; 1384 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1385 1386 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1387 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1388 if (rc != 0) { 1389 SPDK_ERRLOG("Failed 
to create NVMe disk\n"); 1390 free(bdev); 1391 return rc; 1392 } 1393 1394 nvme_ns->bdev = bdev; 1395 1396 return 0; 1397 } 1398 1399 static void 1400 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1401 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1402 { 1403 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1404 struct spdk_nvme_ns *ns; 1405 int rc = 0; 1406 1407 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1408 if (!ns) { 1409 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1410 rc = -EINVAL; 1411 goto done; 1412 } 1413 1414 nvme_ns->ns = ns; 1415 nvme_ns->populated = true; 1416 1417 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1418 done: 1419 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1420 } 1421 1422 static bool 1423 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1424 struct spdk_nvme_ctrlr_opts *opts) 1425 { 1426 struct nvme_probe_skip_entry *entry; 1427 1428 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1429 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1430 return false; 1431 } 1432 } 1433 1434 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1435 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1436 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1437 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1438 1439 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1440 1441 return true; 1442 } 1443 1444 static void 1445 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1446 { 1447 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1448 1449 if (spdk_nvme_cpl_is_error(cpl)) { 1450 SPDK_WARNLOG("Abort failed. Resetting controller.\n"); 1451 _bdev_nvme_reset(nvme_bdev_ctrlr); 1452 } 1453 } 1454 1455 static void 1456 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1457 struct spdk_nvme_qpair *qpair, uint16_t cid) 1458 { 1459 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1460 union spdk_nvme_csts_register csts; 1461 int rc; 1462 1463 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1464 1465 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1466 1467 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1468 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1469 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1470 * completion recursively. 1471 */ 1472 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1473 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1474 if (csts.bits.cfs) { 1475 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1476 _bdev_nvme_reset(nvme_bdev_ctrlr); 1477 return; 1478 } 1479 } 1480 1481 switch (g_opts.action_on_timeout) { 1482 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1483 if (qpair) { 1484 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 1485 nvme_abort_cpl, nvme_bdev_ctrlr); 1486 if (rc == 0) { 1487 return; 1488 } 1489 1490 SPDK_ERRLOG("Unable to send abort. 
Resetting.\n"); 1491 } 1492 1493 /* FALLTHROUGH */ 1494 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 1495 _bdev_nvme_reset(nvme_bdev_ctrlr); 1496 break; 1497 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 1498 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 1499 break; 1500 default: 1501 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 1502 break; 1503 } 1504 } 1505 1506 static void 1507 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns) 1508 { 1509 struct nvme_bdev *bdev; 1510 1511 bdev = nvme_ns->bdev; 1512 if (bdev != NULL) { 1513 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 1514 } 1515 1516 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 1517 } 1518 1519 static void 1520 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns, 1521 struct nvme_async_probe_ctx *ctx) 1522 { 1523 g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx); 1524 } 1525 1526 static void 1527 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns) 1528 { 1529 g_depopulate_namespace_fn[nvme_ns->type](nvme_ns); 1530 } 1531 1532 void 1533 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, 1534 struct nvme_bdev_ns *nvme_ns, int rc) 1535 { 1536 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr; 1537 1538 assert(nvme_bdev_ctrlr != NULL); 1539 1540 if (rc == 0) { 1541 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1542 nvme_bdev_ctrlr->ref++; 1543 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1544 } else { 1545 memset(nvme_ns, 0, sizeof(*nvme_ns)); 1546 } 1547 1548 if (ctx) { 1549 ctx->populates_in_progress--; 1550 if (ctx->populates_in_progress == 0) { 1551 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1552 } 1553 } 1554 } 1555 1556 static void 1557 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1558 struct nvme_async_probe_ctx *ctx) 1559 { 1560 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1561 struct nvme_bdev_ns *nvme_ns; 1562 struct spdk_nvme_ns *ns; 1563 struct nvme_bdev *bdev; 1564 uint32_t i; 1565 int rc; 1566 uint64_t num_sectors; 1567 bool ns_is_active; 1568 1569 if (ctx) { 1570 /* Initialize this count to 1 to handle the populate functions 1571 * calling nvme_ctrlr_populate_namespace_done() immediately. 
1572 */ 1573 ctx->populates_in_progress = 1; 1574 } 1575 1576 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1577 uint32_t nsid = i + 1; 1578 1579 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1580 ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); 1581 1582 if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) { 1583 /* NS is still there but attributes may have changed */ 1584 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1585 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 1586 bdev = nvme_ns->bdev; 1587 assert(bdev != NULL); 1588 if (bdev->disk.blockcnt != num_sectors) { 1589 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 1590 nsid, 1591 bdev->disk.name, 1592 bdev->disk.blockcnt, 1593 num_sectors); 1594 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 1595 if (rc != 0) { 1596 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 1597 bdev->disk.name, rc); 1598 } 1599 } 1600 } 1601 1602 if (!nvme_ns->populated && ns_is_active) { 1603 nvme_ns->id = nsid; 1604 nvme_ns->ctrlr = nvme_bdev_ctrlr; 1605 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 1606 nvme_ns->type = NVME_BDEV_NS_OCSSD; 1607 } else { 1608 nvme_ns->type = NVME_BDEV_NS_STANDARD; 1609 } 1610 1611 nvme_ns->bdev = NULL; 1612 1613 if (ctx) { 1614 ctx->populates_in_progress++; 1615 } 1616 nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx); 1617 } 1618 1619 if (nvme_ns->populated && !ns_is_active) { 1620 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1621 } 1622 } 1623 1624 if (ctx) { 1625 /* Decrement this count now that the loop is over to account 1626 * for the one we started with. If the count is then 0, we 1627 * know any populate_namespace functions completed immediately, 1628 * so we'll kick the callback here. 
1629 */ 1630 ctx->populates_in_progress--; 1631 if (ctx->populates_in_progress == 0) { 1632 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1633 } 1634 } 1635 1636 } 1637 1638 static void 1639 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 1640 { 1641 uint32_t i; 1642 struct nvme_bdev_ns *nvme_ns; 1643 1644 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1645 uint32_t nsid = i + 1; 1646 1647 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1648 if (nvme_ns->populated) { 1649 assert(nvme_ns->id == nsid); 1650 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1651 } 1652 } 1653 } 1654 1655 static void 1656 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1657 { 1658 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 1659 union spdk_nvme_async_event_completion event; 1660 1661 if (spdk_nvme_cpl_is_error(cpl)) { 1662 SPDK_WARNLOG("AER request execute failed"); 1663 return; 1664 } 1665 1666 event.raw = cpl->cdw0; 1667 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1668 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1669 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1670 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && 1671 (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && 1672 spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1673 bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); 1674 } 1675 } 1676 1677 static void 1678 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1679 { 1680 if (ctx->cb_fn) { 1681 ctx->cb_fn(ctx->cb_ctx, count, rc); 1682 } 1683 1684 ctx->namespaces_populated = true; 1685 if (ctx->probe_done) { 1686 /* The probe was already completed, so we need to free the context 1687 * here. This can happen for cases like OCSSD, where we need to 1688 * send additional commands to the SSD after attach. 
1689 */ 1690 free(ctx); 1691 } 1692 } 1693 1694 static int 1695 _nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1696 const char *name, 1697 const struct spdk_nvme_transport_id *trid, 1698 uint32_t prchk_flags, 1699 struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr) 1700 { 1701 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1702 struct nvme_bdev_ctrlr_trid *trid_entry; 1703 uint32_t i; 1704 int rc; 1705 1706 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1707 if (nvme_bdev_ctrlr == NULL) { 1708 SPDK_ERRLOG("Failed to allocate device struct\n"); 1709 return -ENOMEM; 1710 } 1711 1712 rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL); 1713 if (rc != 0) { 1714 goto err_init_mutex; 1715 } 1716 1717 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1718 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1719 if (nvme_bdev_ctrlr->num_ns != 0) { 1720 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1721 if (!nvme_bdev_ctrlr->namespaces) { 1722 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1723 rc = -ENOMEM; 1724 goto err_alloc_namespaces; 1725 } 1726 } 1727 1728 trid_entry = calloc(1, sizeof(*trid_entry)); 1729 if (trid_entry == NULL) { 1730 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1731 rc = -ENOMEM; 1732 goto err_alloc_trid; 1733 } 1734 1735 trid_entry->trid = *trid; 1736 1737 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1738 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1739 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1740 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1741 rc = -ENOMEM; 1742 goto err_alloc_namespace; 1743 } 1744 } 1745 1746 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1747 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1748 nvme_bdev_ctrlr->ctrlr = ctrlr; 1749 nvme_bdev_ctrlr->ref = 1; 1750 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1751 nvme_bdev_ctrlr->name = strdup(name); 1752 if (nvme_bdev_ctrlr->name == NULL) { 1753 rc = -ENOMEM; 1754 goto err_alloc_name; 1755 } 1756 1757 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1758 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1759 if (spdk_unlikely(rc != 0)) { 1760 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1761 goto err_init_ocssd; 1762 } 1763 } 1764 1765 nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1766 1767 spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 1768 sizeof(struct nvme_io_channel), 1769 name); 1770 1771 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1772 g_opts.nvme_adminq_poll_period_us); 1773 1774 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1775 1776 if (g_opts.timeout_us > 0) { 1777 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1778 timeout_cb, nvme_bdev_ctrlr); 1779 } 1780 1781 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1782 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1783 1784 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1785 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1786 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1787 if (nvme_bdev_ctrlr->opal_dev == NULL) { 1788 SPDK_ERRLOG("Failed to initialize Opal\n"); 1789 } 1790 } 1791 1792 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1793 1794 if (_nvme_bdev_ctrlr != NULL) { 1795 *_nvme_bdev_ctrlr = nvme_bdev_ctrlr; 1796 } 1797 return 0; 1798 1799 err_init_ocssd: 1800 
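	/* Error unwind: release resources in the reverse order of their allocation above. */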
free(nvme_bdev_ctrlr->name); 1801 err_alloc_name: 1802 err_alloc_namespace: 1803 for (; i > 0; i--) { 1804 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1805 } 1806 free(trid_entry); 1807 err_alloc_trid: 1808 free(nvme_bdev_ctrlr->namespaces); 1809 err_alloc_namespaces: 1810 pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex); 1811 err_init_mutex: 1812 free(nvme_bdev_ctrlr); 1813 return rc; 1814 } 1815 1816 static void 1817 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1818 const char *name, 1819 const struct spdk_nvme_transport_id *trid, 1820 uint32_t prchk_flags, 1821 struct nvme_async_probe_ctx *ctx) 1822 { 1823 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = NULL; 1824 int rc; 1825 1826 rc = _nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr); 1827 if (rc != 0) { 1828 SPDK_ERRLOG("Failed to create new NVMe controller\n"); 1829 goto err; 1830 } 1831 1832 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 1833 return; 1834 1835 err: 1836 if (ctx != NULL) { 1837 populate_namespaces_cb(ctx, 0, rc); 1838 } 1839 } 1840 1841 static void 1842 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1843 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1844 { 1845 struct nvme_probe_ctx *ctx = cb_ctx; 1846 char *name = NULL; 1847 uint32_t prchk_flags = 0; 1848 size_t i; 1849 1850 if (ctx) { 1851 for (i = 0; i < ctx->count; i++) { 1852 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1853 prchk_flags = ctx->prchk_flags[i]; 1854 name = strdup(ctx->names[i]); 1855 break; 1856 } 1857 } 1858 } else { 1859 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1860 } 1861 if (!name) { 1862 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1863 return; 1864 } 1865 1866 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1867 1868 nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, NULL); 1869 1870 free(name); 1871 } 1872 1873 static void 1874 _nvme_bdev_ctrlr_destruct(void *ctx) 1875 { 1876 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1877 1878 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1879 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1880 } 1881 1882 static int 1883 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug) 1884 { 1885 struct nvme_probe_skip_entry *entry; 1886 1887 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1888 1889 /* The controller's destruction was already started */ 1890 if (nvme_bdev_ctrlr->destruct) { 1891 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1892 return 0; 1893 } 1894 1895 if (!hotplug && 1896 nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1897 entry = calloc(1, sizeof(*entry)); 1898 if (!entry) { 1899 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1900 return -ENOMEM; 1901 } 1902 entry->trid = *nvme_bdev_ctrlr->connected_trid; 1903 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 1904 } 1905 1906 nvme_bdev_ctrlr->destruct = true; 1907 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1908 1909 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1910 1911 return 0; 1912 } 1913 1914 static void 1915 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 1916 { 1917 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 1918 1919 _bdev_nvme_delete(nvme_bdev_ctrlr, true); 1920 } 1921 1922 static int 1923 bdev_nvme_hotplug_probe(void *arg) 1924 { 1925 if (g_hotplug_probe_ctx == NULL) { 1926 spdk_poller_unregister(&g_hotplug_probe_poller); 1927 return SPDK_POLLER_IDLE; 1928 } 1929 1930 if 
(spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 1931 g_hotplug_probe_ctx = NULL; 1932 spdk_poller_unregister(&g_hotplug_probe_poller); 1933 } 1934 1935 return SPDK_POLLER_BUSY; 1936 } 1937 1938 static int 1939 bdev_nvme_hotplug(void *arg) 1940 { 1941 struct spdk_nvme_transport_id trid_pcie; 1942 1943 if (g_hotplug_probe_ctx) { 1944 return SPDK_POLLER_BUSY; 1945 } 1946 1947 memset(&trid_pcie, 0, sizeof(trid_pcie)); 1948 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 1949 1950 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 1951 hotplug_probe_cb, attach_cb, NULL); 1952 1953 if (g_hotplug_probe_ctx) { 1954 assert(g_hotplug_probe_poller == NULL); 1955 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 1956 } 1957 1958 return SPDK_POLLER_BUSY; 1959 } 1960 1961 void 1962 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 1963 { 1964 *opts = g_opts; 1965 } 1966 1967 int 1968 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 1969 { 1970 if (g_bdev_nvme_init_thread != NULL) { 1971 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 1972 return -EPERM; 1973 } 1974 } 1975 1976 g_opts = *opts; 1977 1978 return 0; 1979 } 1980 1981 struct set_nvme_hotplug_ctx { 1982 uint64_t period_us; 1983 bool enabled; 1984 spdk_msg_fn fn; 1985 void *fn_ctx; 1986 }; 1987 1988 static void 1989 set_nvme_hotplug_period_cb(void *_ctx) 1990 { 1991 struct set_nvme_hotplug_ctx *ctx = _ctx; 1992 1993 spdk_poller_unregister(&g_hotplug_poller); 1994 if (ctx->enabled) { 1995 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 1996 } 1997 1998 g_nvme_hotplug_poll_period_us = ctx->period_us; 1999 g_nvme_hotplug_enabled = ctx->enabled; 2000 if (ctx->fn) { 2001 ctx->fn(ctx->fn_ctx); 2002 } 2003 2004 free(ctx); 2005 } 2006 2007 int 2008 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 2009 { 2010 struct set_nvme_hotplug_ctx *ctx; 2011 2012 if (enabled == true && !spdk_process_is_primary()) { 2013 return -EPERM; 2014 } 2015 2016 ctx = calloc(1, sizeof(*ctx)); 2017 if (ctx == NULL) { 2018 return -ENOMEM; 2019 } 2020 2021 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 2022 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 2023 ctx->enabled = enabled; 2024 ctx->fn = cb; 2025 ctx->fn_ctx = cb_ctx; 2026 2027 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 2028 return 0; 2029 } 2030 2031 static void 2032 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2033 struct nvme_async_probe_ctx *ctx) 2034 { 2035 struct nvme_bdev_ns *nvme_ns; 2036 struct nvme_bdev *nvme_bdev; 2037 uint32_t i, nsid; 2038 size_t j; 2039 2040 assert(nvme_bdev_ctrlr != NULL); 2041 2042 /* 2043 * Report the new bdevs that were created in this call. 2044 * There can be more than one bdev per NVMe controller. 2045 */ 2046 j = 0; 2047 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 2048 nsid = i + 1; 2049 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 2050 if (!nvme_ns->populated) { 2051 continue; 2052 } 2053 assert(nvme_ns->id == nsid); 2054 nvme_bdev = nvme_ns->bdev; 2055 if (nvme_bdev == NULL) { 2056 assert(nvme_ns->type == NVME_BDEV_NS_OCSSD); 2057 continue; 2058 } 2059 if (j < ctx->count) { 2060 ctx->names[j] = nvme_bdev->disk.name; 2061 j++; 2062 } else { 2063 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. 
Unable to return all names of created bdevs\n", 2064 ctx->count); 2065 populate_namespaces_cb(ctx, 0, -ERANGE); 2066 return; 2067 } 2068 } 2069 2070 populate_namespaces_cb(ctx, j, 0); 2071 } 2072 2073 static int 2074 bdev_nvme_compare_trids(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2075 struct spdk_nvme_ctrlr *new_ctrlr, 2076 struct spdk_nvme_transport_id *trid) 2077 { 2078 struct nvme_bdev_ctrlr_trid *tmp_trid; 2079 2080 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2081 SPDK_ERRLOG("PCIe failover is not supported.\n"); 2082 return -ENOTSUP; 2083 } 2084 2085 /* Currently we only support failover to the same transport type. */ 2086 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 2087 return -EINVAL; 2088 } 2089 2090 /* Currently we only support failover to the same NQN. */ 2091 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 2092 return -EINVAL; 2093 } 2094 2095 /* Skip all the other checks if we've already registered this path. */ 2096 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 2097 if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) { 2098 return -EEXIST; 2099 } 2100 } 2101 2102 return 0; 2103 } 2104 2105 static bool 2106 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 2107 { 2108 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 2109 2110 nsdata1 = spdk_nvme_ns_get_data(ns1); 2111 nsdata2 = spdk_nvme_ns_get_data(ns2); 2112 2113 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)); 2114 } 2115 2116 static int 2117 bdev_nvme_compare_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2118 struct spdk_nvme_ctrlr *new_ctrlr) 2119 { 2120 uint32_t i, nsid; 2121 struct nvme_bdev_ns *nvme_ns; 2122 struct spdk_nvme_ns *new_ns; 2123 2124 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 2125 return -EINVAL; 2126 } 2127 2128 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 2129 nsid = i + 1; 2130 2131 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 2132 if (!nvme_ns->populated) { 2133 continue; 2134 } 2135 2136 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 2137 assert(new_ns != NULL); 2138 2139 if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) { 2140 return -EINVAL; 2141 } 2142 } 2143 2144 return 0; 2145 } 2146 2147 static int 2148 _bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2149 struct spdk_nvme_transport_id *trid) 2150 { 2151 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 2152 2153 new_trid = calloc(1, sizeof(*new_trid)); 2154 if (new_trid == NULL) { 2155 return -ENOMEM; 2156 } 2157 new_trid->trid = *trid; 2158 new_trid->is_failed = false; 2159 2160 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 2161 if (tmp_trid->is_failed) { 2162 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 2163 return 0; 2164 } 2165 } 2166 2167 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 2168 return 0; 2169 } 2170 2171 /* This is the case that a secondary path is added to an existing 2172 * nvme_bdev_ctrlr for failover. After checking if it can access the same 2173 * namespaces as the primary path, it is disconnected until failover occurs. 
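* In every case the new controller handle is detached below; at most its transport ID
* is recorded on the trids list so the path can be connected again at failover time.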
2174 */ 2175 static void 2176 bdev_nvme_add_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2177 struct spdk_nvme_ctrlr *new_ctrlr, 2178 struct spdk_nvme_transport_id *trid, 2179 struct nvme_async_probe_ctx *ctx) 2180 { 2181 int rc; 2182 2183 assert(nvme_bdev_ctrlr != NULL); 2184 2185 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2186 2187 rc = bdev_nvme_compare_trids(nvme_bdev_ctrlr, new_ctrlr, trid); 2188 if (rc != 0) { 2189 goto exit; 2190 } 2191 2192 rc = bdev_nvme_compare_namespaces(nvme_bdev_ctrlr, new_ctrlr); 2193 if (rc != 0) { 2194 goto exit; 2195 } 2196 2197 rc = _bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, trid); 2198 2199 exit: 2200 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2201 2202 spdk_nvme_detach(new_ctrlr); 2203 2204 if (ctx != NULL) { 2205 populate_namespaces_cb(ctx, 0, rc); 2206 } 2207 } 2208 2209 static void 2210 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 2211 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 2212 { 2213 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 2214 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2215 struct nvme_async_probe_ctx *ctx; 2216 2217 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2218 ctx->ctrlr_attached = true; 2219 2220 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 2221 if (nvme_bdev_ctrlr) { 2222 bdev_nvme_add_secondary_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid, ctx); 2223 return; 2224 } 2225 2226 nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, ctx); 2227 } 2228 2229 static int 2230 bdev_nvme_async_poll(void *arg) 2231 { 2232 struct nvme_async_probe_ctx *ctx = arg; 2233 int rc; 2234 2235 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2236 if (spdk_unlikely(rc != -EAGAIN)) { 2237 ctx->probe_done = true; 2238 spdk_poller_unregister(&ctx->poller); 2239 if (!ctx->ctrlr_attached) { 2240 /* The probe is done, but no controller was attached. 2241 * That means we had a failure, so report -EIO back to 2242 * the caller (usually the RPC). populate_namespaces_cb() 2243 * will take care of freeing the nvme_async_probe_ctx. 2244 */ 2245 populate_namespaces_cb(ctx, 0, -EIO); 2246 } else if (ctx->namespaces_populated) { 2247 /* The namespaces for the attached controller were all 2248 * populated and the response was already sent to the 2249 * caller (usually the RPC). So free the context here. 2250 */ 2251 free(ctx); 2252 } 2253 } 2254 2255 return SPDK_POLLER_BUSY; 2256 } 2257 2258 int 2259 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2260 struct spdk_nvme_host_id *hostid, 2261 const char *base_name, 2262 const char **names, 2263 uint32_t count, 2264 const char *hostnqn, 2265 uint32_t prchk_flags, 2266 spdk_bdev_create_nvme_fn cb_fn, 2267 void *cb_ctx, 2268 struct spdk_nvme_ctrlr_opts *opts) 2269 { 2270 struct nvme_probe_skip_entry *entry, *tmp; 2271 struct nvme_async_probe_ctx *ctx; 2272 2273 /* TODO expand this check to include both the host and target TRIDs. 2274 * Only if both are the same should we fail. 
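* Today any existing controller whose target trid matches is rejected, regardless of the host ID used.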
2275 */ 2276 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2277 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2278 return -EEXIST; 2279 } 2280 2281 ctx = calloc(1, sizeof(*ctx)); 2282 if (!ctx) { 2283 return -ENOMEM; 2284 } 2285 ctx->base_name = base_name; 2286 ctx->names = names; 2287 ctx->count = count; 2288 ctx->cb_fn = cb_fn; 2289 ctx->cb_ctx = cb_ctx; 2290 ctx->prchk_flags = prchk_flags; 2291 ctx->trid = *trid; 2292 2293 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2294 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2295 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2296 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2297 free(entry); 2298 break; 2299 } 2300 } 2301 } 2302 2303 if (opts) { 2304 memcpy(&ctx->opts, opts, sizeof(*opts)); 2305 } else { 2306 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2307 } 2308 2309 ctx->opts.transport_retry_count = g_opts.retry_count; 2310 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2311 2312 if (hostnqn) { 2313 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2314 } 2315 2316 if (hostid->hostaddr[0] != '\0') { 2317 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2318 } 2319 2320 if (hostid->hostsvcid[0] != '\0') { 2321 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2322 } 2323 2324 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2325 if (ctx->probe_ctx == NULL) { 2326 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2327 free(ctx); 2328 return -ENODEV; 2329 } 2330 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2331 2332 return 0; 2333 } 2334 2335 static int 2336 bdev_nvme_delete_secondary_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 2337 const struct spdk_nvme_transport_id *trid) 2338 { 2339 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2340 2341 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2342 return -EBUSY; 2343 } 2344 2345 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2346 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2347 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2348 free(ctrlr_trid); 2349 return 0; 2350 } 2351 } 2352 2353 return -ENXIO; 2354 } 2355 2356 int 2357 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2358 { 2359 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2360 struct nvme_bdev_ctrlr_trid *ctrlr_trid; 2361 2362 if (name == NULL) { 2363 return -EINVAL; 2364 } 2365 2366 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2367 if (nvme_bdev_ctrlr == NULL) { 2368 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2369 return -ENODEV; 2370 } 2371 2372 /* case 1: remove the controller itself. */ 2373 if (trid == NULL) { 2374 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2375 } 2376 2377 /* case 2: we are currently using the path to be removed. */ 2378 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2379 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2380 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2381 /* case 2A: the current path is the only path. */ 2382 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2383 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2384 } 2385 2386 /* case 2B: there is an alternative path. 
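* Fail over to the alternative path instead of deleting the controller.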
*/ 2387 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2388 } 2389 2390 /* case 3: We are not using the specified path. */ 2391 return bdev_nvme_delete_secondary_trid(nvme_bdev_ctrlr, trid); 2392 } 2393 2394 static int 2395 bdev_nvme_library_init(void) 2396 { 2397 g_bdev_nvme_init_thread = spdk_get_thread(); 2398 2399 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2400 bdev_nvme_poll_group_destroy_cb, 2401 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2402 2403 return 0; 2404 } 2405 2406 static void 2407 bdev_nvme_library_fini(void) 2408 { 2409 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2410 struct nvme_probe_skip_entry *entry, *entry_tmp; 2411 2412 spdk_poller_unregister(&g_hotplug_poller); 2413 free(g_hotplug_probe_ctx); 2414 g_hotplug_probe_ctx = NULL; 2415 2416 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2417 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2418 free(entry); 2419 } 2420 2421 pthread_mutex_lock(&g_bdev_nvme_mutex); 2422 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2423 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2424 if (nvme_bdev_ctrlr->destruct) { 2425 /* This controller's destruction was already started 2426 * before the application started shutting down 2427 */ 2428 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2429 continue; 2430 } 2431 nvme_bdev_ctrlr->destruct = true; 2432 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2433 2434 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2435 nvme_bdev_ctrlr); 2436 } 2437 2438 g_bdev_nvme_module_finish = true; 2439 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2440 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2441 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2442 spdk_bdev_module_finish_done(); 2443 return; 2444 } 2445 2446 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2447 } 2448 2449 static void 2450 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2451 { 2452 struct spdk_bdev *bdev = bdev_io->bdev; 2453 struct spdk_dif_ctx dif_ctx; 2454 struct spdk_dif_error err_blk = {}; 2455 int rc; 2456 2457 rc = spdk_dif_ctx_init(&dif_ctx, 2458 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2459 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2460 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2461 if (rc != 0) { 2462 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2463 return; 2464 } 2465 2466 if (bdev->md_interleave) { 2467 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2468 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2469 } else { 2470 struct iovec md_iov = { 2471 .iov_base = bdev_io->u.bdev.md_buf, 2472 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2473 }; 2474 2475 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2476 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2477 } 2478 2479 if (rc != 0) { 2480 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2481 err_blk.err_type, err_blk.err_offset); 2482 } else { 2483 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2484 } 2485 } 2486 2487 static void 2488 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2489 { 2490 struct nvme_bdev_io *bio = ref; 2491 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2492 2493 if (spdk_nvme_cpl_is_success(cpl)) { 2494 /* Run PI verification for read data buffer. 
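* The retried read without PI checking succeeded, so inspect the data in software to locate the PI error.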
*/ 2495 bdev_nvme_verify_pi_error(bdev_io); 2496 } 2497 2498 /* Return original completion status */ 2499 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2500 bio->cpl.status.sc); 2501 } 2502 2503 static void 2504 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2505 { 2506 struct nvme_bdev_io *bio = ref; 2507 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2508 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2509 struct nvme_io_channel *nvme_ch; 2510 struct nvme_bdev_ns *nvme_ns; 2511 struct spdk_nvme_qpair *qpair; 2512 int ret; 2513 2514 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2515 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2516 cpl->status.sct, cpl->status.sc); 2517 2518 /* Save completion status to use after verifying PI error. */ 2519 bio->cpl = *cpl; 2520 2521 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2522 2523 if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 2524 /* Read without PI checking to verify PI error. */ 2525 ret = bdev_nvme_no_pi_readv(nvme_ns->ns, 2526 qpair, 2527 bio, 2528 bdev_io->u.bdev.iovs, 2529 bdev_io->u.bdev.iovcnt, 2530 bdev_io->u.bdev.md_buf, 2531 bdev_io->u.bdev.num_blocks, 2532 bdev_io->u.bdev.offset_blocks); 2533 if (ret == 0) { 2534 return; 2535 } 2536 } 2537 } 2538 2539 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2540 } 2541 2542 static void 2543 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2544 { 2545 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2546 2547 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2548 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2549 cpl->status.sct, cpl->status.sc); 2550 /* Run PI verification for write data buffer if PI error is detected. */ 2551 bdev_nvme_verify_pi_error(bdev_io); 2552 } 2553 2554 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2555 } 2556 2557 static void 2558 bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2559 { 2560 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2561 2562 /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks. 2563 * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error(). 2564 */ 2565 bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0; 2566 2567 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2568 SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n", 2569 cpl->status.sct, cpl->status.sc); 2570 /* Run PI verification for zone append data buffer if PI error is detected. */ 2571 bdev_nvme_verify_pi_error(bdev_io); 2572 } 2573 2574 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2575 } 2576 2577 static void 2578 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2579 { 2580 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2581 2582 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2583 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2584 cpl->status.sct, cpl->status.sc); 2585 /* Run PI verification for compare data buffer if PI error is detected. 
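* The verification is for logging only; the device-reported status is still returned below.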
*/ 2586 bdev_nvme_verify_pi_error(bdev_io); 2587 } 2588 2589 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2590 } 2591 2592 static void 2593 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2594 { 2595 struct nvme_bdev_io *bio = ref; 2596 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2597 2598 /* Compare operation completion */ 2599 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2600 /* Save compare result for write callback */ 2601 bio->cpl = *cpl; 2602 return; 2603 } 2604 2605 /* Write operation completion */ 2606 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2607 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2608 * complete the IO with the compare operation's status. 2609 */ 2610 if (!spdk_nvme_cpl_is_error(cpl)) { 2611 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2612 } 2613 2614 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2615 } else { 2616 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2617 } 2618 } 2619 2620 static void 2621 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2622 { 2623 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2624 2625 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2626 } 2627 2628 static int 2629 fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc) 2630 { 2631 switch (desc->zs) { 2632 case SPDK_NVME_ZONE_STATE_EMPTY: 2633 info->state = SPDK_BDEV_ZONE_STATE_EMPTY; 2634 break; 2635 case SPDK_NVME_ZONE_STATE_IOPEN: 2636 info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN; 2637 break; 2638 case SPDK_NVME_ZONE_STATE_EOPEN: 2639 info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN; 2640 break; 2641 case SPDK_NVME_ZONE_STATE_CLOSED: 2642 info->state = SPDK_BDEV_ZONE_STATE_CLOSED; 2643 break; 2644 case SPDK_NVME_ZONE_STATE_RONLY: 2645 info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY; 2646 break; 2647 case SPDK_NVME_ZONE_STATE_FULL: 2648 info->state = SPDK_BDEV_ZONE_STATE_FULL; 2649 break; 2650 case SPDK_NVME_ZONE_STATE_OFFLINE: 2651 info->state = SPDK_BDEV_ZONE_STATE_OFFLINE; 2652 break; 2653 default: 2654 SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs); 2655 return -EIO; 2656 } 2657 2658 info->zone_id = desc->zslba; 2659 info->write_pointer = desc->wp; 2660 info->capacity = desc->zcap; 2661 2662 return 0; 2663 } 2664 2665 static void 2666 bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl) 2667 { 2668 struct nvme_bdev_io *bio = ref; 2669 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2670 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2671 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 2672 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 2673 uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id; 2674 uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones; 2675 struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf; 2676 enum spdk_bdev_io_status status; 2677 uint64_t max_zones_per_buf, i; 2678 uint32_t zone_report_bufsize; 2679 struct nvme_bdev_ns *nvme_ns; 2680 struct spdk_nvme_qpair *qpair; 2681 int ret; 2682 2683 if (spdk_nvme_cpl_is_error(cpl)) { 2684 goto out_complete_io_nvme_cpl; 2685 } 2686 2687 if (!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair)) { 2688 status = SPDK_BDEV_IO_STATUS_FAILED; 2689 
goto out_complete_io_status; 2690 } 2691 2692 zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(nvme_ns->ns); 2693 max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) / 2694 sizeof(bio->zone_report_buf->descs[0]); 2695 2696 if (bio->zone_report_buf->nr_zones > max_zones_per_buf) { 2697 status = SPDK_BDEV_IO_STATUS_FAILED; 2698 goto out_complete_io_status; 2699 } 2700 2701 if (!bio->zone_report_buf->nr_zones) { 2702 status = SPDK_BDEV_IO_STATUS_FAILED; 2703 goto out_complete_io_status; 2704 } 2705 2706 for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) { 2707 ret = fill_zone_from_report(&info[bio->handled_zones], 2708 &bio->zone_report_buf->descs[i]); 2709 if (ret) { 2710 status = SPDK_BDEV_IO_STATUS_FAILED; 2711 goto out_complete_io_status; 2712 } 2713 bio->handled_zones++; 2714 } 2715 2716 if (bio->handled_zones < zones_to_copy) { 2717 uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(nvme_ns->ns); 2718 uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones); 2719 2720 memset(bio->zone_report_buf, 0, zone_report_bufsize); 2721 ret = spdk_nvme_zns_report_zones(nvme_ns->ns, qpair, 2722 bio->zone_report_buf, zone_report_bufsize, 2723 slba, SPDK_NVME_ZRA_LIST_ALL, true, 2724 bdev_nvme_get_zone_info_done, bio); 2725 if (!ret) { 2726 return; 2727 } else if (ret == -ENOMEM) { 2728 status = SPDK_BDEV_IO_STATUS_NOMEM; 2729 goto out_complete_io_status; 2730 } else { 2731 status = SPDK_BDEV_IO_STATUS_FAILED; 2732 goto out_complete_io_status; 2733 } 2734 } 2735 2736 out_complete_io_nvme_cpl: 2737 free(bio->zone_report_buf); 2738 bio->zone_report_buf = NULL; 2739 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2740 return; 2741 2742 out_complete_io_status: 2743 free(bio->zone_report_buf); 2744 bio->zone_report_buf = NULL; 2745 spdk_bdev_io_complete(bdev_io, status); 2746 } 2747 2748 static void 2749 bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl) 2750 { 2751 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2752 2753 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2754 } 2755 2756 static void 2757 bdev_nvme_admin_passthru_completion(void *ctx) 2758 { 2759 struct nvme_bdev_io *bio = ctx; 2760 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2761 2762 spdk_bdev_io_complete_nvme_status(bdev_io, 2763 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2764 } 2765 2766 static void 2767 bdev_nvme_abort_completion(void *ctx) 2768 { 2769 struct nvme_bdev_io *bio = ctx; 2770 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2771 2772 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2773 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2774 } else { 2775 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2776 } 2777 } 2778 2779 static void 2780 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2781 { 2782 struct nvme_bdev_io *bio = ref; 2783 2784 bio->cpl = *cpl; 2785 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2786 } 2787 2788 static void 2789 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2790 { 2791 struct nvme_bdev_io *bio = ref; 2792 2793 bio->cpl = *cpl; 2794 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2795 } 2796 2797 static void 2798 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2799 { 2800 struct nvme_bdev_io *bio 
= ref; 2801 struct iovec *iov; 2802 2803 bio->iov_offset = sgl_offset; 2804 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2805 iov = &bio->iovs[bio->iovpos]; 2806 if (bio->iov_offset < iov->iov_len) { 2807 break; 2808 } 2809 2810 bio->iov_offset -= iov->iov_len; 2811 } 2812 } 2813 2814 static int 2815 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2816 { 2817 struct nvme_bdev_io *bio = ref; 2818 struct iovec *iov; 2819 2820 assert(bio->iovpos < bio->iovcnt); 2821 2822 iov = &bio->iovs[bio->iovpos]; 2823 2824 *address = iov->iov_base; 2825 *length = iov->iov_len; 2826 2827 if (bio->iov_offset) { 2828 assert(bio->iov_offset <= iov->iov_len); 2829 *address += bio->iov_offset; 2830 *length -= bio->iov_offset; 2831 } 2832 2833 bio->iov_offset += *length; 2834 if (bio->iov_offset == iov->iov_len) { 2835 bio->iovpos++; 2836 bio->iov_offset = 0; 2837 } 2838 2839 return 0; 2840 } 2841 2842 static void 2843 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2844 { 2845 struct nvme_bdev_io *bio = ref; 2846 struct iovec *iov; 2847 2848 bio->fused_iov_offset = sgl_offset; 2849 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2850 iov = &bio->fused_iovs[bio->fused_iovpos]; 2851 if (bio->fused_iov_offset < iov->iov_len) { 2852 break; 2853 } 2854 2855 bio->fused_iov_offset -= iov->iov_len; 2856 } 2857 } 2858 2859 static int 2860 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2861 { 2862 struct nvme_bdev_io *bio = ref; 2863 struct iovec *iov; 2864 2865 assert(bio->fused_iovpos < bio->fused_iovcnt); 2866 2867 iov = &bio->fused_iovs[bio->fused_iovpos]; 2868 2869 *address = iov->iov_base; 2870 *length = iov->iov_len; 2871 2872 if (bio->fused_iov_offset) { 2873 assert(bio->fused_iov_offset <= iov->iov_len); 2874 *address += bio->fused_iov_offset; 2875 *length -= bio->fused_iov_offset; 2876 } 2877 2878 bio->fused_iov_offset += *length; 2879 if (bio->fused_iov_offset == iov->iov_len) { 2880 bio->fused_iovpos++; 2881 bio->fused_iov_offset = 0; 2882 } 2883 2884 return 0; 2885 } 2886 2887 static int 2888 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2889 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2890 void *md, uint64_t lba_count, uint64_t lba) 2891 { 2892 int rc; 2893 2894 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2895 lba_count, lba); 2896 2897 bio->iovs = iov; 2898 bio->iovcnt = iovcnt; 2899 bio->iovpos = 0; 2900 bio->iov_offset = 0; 2901 2902 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2903 bdev_nvme_no_pi_readv_done, bio, 0, 2904 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2905 md, 0, 0); 2906 2907 if (rc != 0 && rc != -ENOMEM) { 2908 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2909 } 2910 return rc; 2911 } 2912 2913 static int 2914 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2915 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2916 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2917 { 2918 int rc; 2919 2920 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2921 lba_count, lba); 2922 2923 bio->iovs = iov; 2924 bio->iovcnt = iovcnt; 2925 bio->iovpos = 0; 2926 bio->iov_offset = 0; 2927 2928 if (iovcnt == 1) { 2929 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 2930 lba_count, 2931 bdev_nvme_readv_done, bio, 2932 flags, 2933 0, 0); 2934 } else { 2935 
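/* More than one iovec: submit the read with the queued SGL callbacks, which walk bio->iovs to build the scattered payload. */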
rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2936 bdev_nvme_readv_done, bio, flags, 2937 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2938 md, 0, 0); 2939 } 2940 2941 if (rc != 0 && rc != -ENOMEM) { 2942 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 2943 } 2944 return rc; 2945 } 2946 2947 static int 2948 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2949 struct nvme_bdev_io *bio, 2950 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2951 uint32_t flags) 2952 { 2953 int rc; 2954 2955 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2956 lba_count, lba); 2957 2958 bio->iovs = iov; 2959 bio->iovcnt = iovcnt; 2960 bio->iovpos = 0; 2961 bio->iov_offset = 0; 2962 2963 if (iovcnt == 1) { 2964 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 2965 lba_count, 2966 bdev_nvme_writev_done, bio, 2967 flags, 2968 0, 0); 2969 } else { 2970 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2971 bdev_nvme_writev_done, bio, flags, 2972 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2973 md, 0, 0); 2974 } 2975 2976 if (rc != 0 && rc != -ENOMEM) { 2977 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 2978 } 2979 return rc; 2980 } 2981 2982 static int 2983 bdev_nvme_zone_appendv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2984 struct nvme_bdev_io *bio, 2985 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t zslba, 2986 uint32_t flags) 2987 { 2988 int rc; 2989 2990 SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n", 2991 lba_count, zslba); 2992 2993 bio->iovs = iov; 2994 bio->iovcnt = iovcnt; 2995 bio->iovpos = 0; 2996 bio->iov_offset = 0; 2997 2998 if (iovcnt == 1) { 2999 rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba, 3000 lba_count, 3001 bdev_nvme_zone_appendv_done, bio, 3002 flags, 3003 0, 0); 3004 } else { 3005 rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count, 3006 bdev_nvme_zone_appendv_done, bio, flags, 3007 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3008 md, 0, 0); 3009 } 3010 3011 if (rc != 0 && rc != -ENOMEM) { 3012 SPDK_ERRLOG("zone append failed: rc = %d\n", rc); 3013 } 3014 return rc; 3015 } 3016 3017 static int 3018 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3019 struct nvme_bdev_io *bio, 3020 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 3021 uint32_t flags) 3022 { 3023 int rc; 3024 3025 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 3026 lba_count, lba); 3027 3028 bio->iovs = iov; 3029 bio->iovcnt = iovcnt; 3030 bio->iovpos = 0; 3031 bio->iov_offset = 0; 3032 3033 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3034 bdev_nvme_comparev_done, bio, flags, 3035 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 3036 md, 0, 0); 3037 3038 if (rc != 0 && rc != -ENOMEM) { 3039 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 3040 } 3041 return rc; 3042 } 3043 3044 static int 3045 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3046 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 3047 struct iovec *write_iov, int write_iovcnt, 3048 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 3049 { 3050 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3051 int rc; 3052 3053 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset 
%#" PRIx64 "\n", 3054 lba_count, lba); 3055 3056 bio->iovs = cmp_iov; 3057 bio->iovcnt = cmp_iovcnt; 3058 bio->iovpos = 0; 3059 bio->iov_offset = 0; 3060 bio->fused_iovs = write_iov; 3061 bio->fused_iovcnt = write_iovcnt; 3062 bio->fused_iovpos = 0; 3063 bio->fused_iov_offset = 0; 3064 3065 if (bdev_io->num_retries == 0) { 3066 bio->first_fused_submitted = false; 3067 } 3068 3069 if (!bio->first_fused_submitted) { 3070 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3071 memset(&bio->cpl, 0, sizeof(bio->cpl)); 3072 3073 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 3074 bdev_nvme_comparev_and_writev_done, bio, flags, 3075 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 3076 if (rc == 0) { 3077 bio->first_fused_submitted = true; 3078 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 3079 } else { 3080 if (rc != -ENOMEM) { 3081 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 3082 } 3083 return rc; 3084 } 3085 } 3086 3087 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 3088 3089 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 3090 bdev_nvme_comparev_and_writev_done, bio, flags, 3091 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 3092 if (rc != 0 && rc != -ENOMEM) { 3093 SPDK_ERRLOG("write failed: rc = %d\n", rc); 3094 rc = 0; 3095 } 3096 3097 return rc; 3098 } 3099 3100 static int 3101 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3102 struct nvme_bdev_io *bio, 3103 uint64_t offset_blocks, 3104 uint64_t num_blocks) 3105 { 3106 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 3107 struct spdk_nvme_dsm_range *range; 3108 uint64_t offset, remaining; 3109 uint64_t num_ranges_u64; 3110 uint16_t num_ranges; 3111 int rc; 3112 3113 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 3114 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3115 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 3116 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 3117 return -EINVAL; 3118 } 3119 num_ranges = (uint16_t)num_ranges_u64; 3120 3121 offset = offset_blocks; 3122 remaining = num_blocks; 3123 range = &dsm_ranges[0]; 3124 3125 /* Fill max-size ranges until the remaining blocks fit into one range */ 3126 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 3127 range->attributes.raw = 0; 3128 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3129 range->starting_lba = offset; 3130 3131 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3132 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 3133 range++; 3134 } 3135 3136 /* Final range describes the remaining blocks */ 3137 range->attributes.raw = 0; 3138 range->length = remaining; 3139 range->starting_lba = offset; 3140 3141 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 3142 SPDK_NVME_DSM_ATTR_DEALLOCATE, 3143 dsm_ranges, num_ranges, 3144 bdev_nvme_queued_done, bio); 3145 3146 return rc; 3147 } 3148 3149 static int 3150 bdev_nvme_get_zone_info(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3151 struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones, 3152 struct spdk_bdev_zone_info *info) 3153 { 3154 uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns); 3155 uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns); 3156 uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns); 3157 3158 if (zone_id % zone_size != 0) { 3159 return -EINVAL; 3160 } 3161 3162 if (num_zones > total_zones || !num_zones) { 3163 
return -EINVAL; 3164 } 3165 3166 assert(!bio->zone_report_buf); 3167 bio->zone_report_buf = calloc(1, zone_report_bufsize); 3168 if (!bio->zone_report_buf) { 3169 return -ENOMEM; 3170 } 3171 3172 bio->handled_zones = 0; 3173 3174 return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize, 3175 zone_id, SPDK_NVME_ZRA_LIST_ALL, true, 3176 bdev_nvme_get_zone_info_done, bio); 3177 } 3178 3179 static int 3180 bdev_nvme_zone_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3181 struct nvme_bdev_io *bio, uint64_t zone_id, 3182 enum spdk_bdev_zone_action action) 3183 { 3184 switch (action) { 3185 case SPDK_BDEV_ZONE_CLOSE: 3186 return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false, 3187 bdev_nvme_zone_management_done, bio); 3188 case SPDK_BDEV_ZONE_FINISH: 3189 return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false, 3190 bdev_nvme_zone_management_done, bio); 3191 case SPDK_BDEV_ZONE_OPEN: 3192 return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false, 3193 bdev_nvme_zone_management_done, bio); 3194 case SPDK_BDEV_ZONE_RESET: 3195 return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false, 3196 bdev_nvme_zone_management_done, bio); 3197 case SPDK_BDEV_ZONE_OFFLINE: 3198 return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false, 3199 bdev_nvme_zone_management_done, bio); 3200 default: 3201 return -EINVAL; 3202 } 3203 } 3204 3205 static int 3206 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 3207 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3208 { 3209 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3210 uint32_t max_xfer_size; 3211 3212 if (!bdev_nvme_find_admin_path(nvme_ch, &nvme_bdev_ctrlr)) { 3213 return -EINVAL; 3214 } 3215 3216 max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_bdev_ctrlr->ctrlr); 3217 3218 if (nbytes > max_xfer_size) { 3219 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3220 return -EINVAL; 3221 } 3222 3223 bio->orig_thread = spdk_get_thread(); 3224 3225 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_bdev_ctrlr->ctrlr, cmd, buf, 3226 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 3227 } 3228 3229 static int 3230 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3231 struct nvme_bdev_io *bio, 3232 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 3233 { 3234 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3235 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3236 3237 if (nbytes > max_xfer_size) { 3238 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3239 return -EINVAL; 3240 } 3241 3242 /* 3243 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3244 * so fill it out automatically. 
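* Any nsid the caller placed in the passthru command is overwritten here.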
3245 */ 3246 cmd->nsid = spdk_nvme_ns_get_id(ns); 3247 3248 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 3249 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 3250 } 3251 3252 static int 3253 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 3254 struct nvme_bdev_io *bio, 3255 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 3256 { 3257 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 3258 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 3259 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 3260 3261 if (nbytes > max_xfer_size) { 3262 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 3263 return -EINVAL; 3264 } 3265 3266 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 3267 SPDK_ERRLOG("invalid meta data buffer size\n"); 3268 return -EINVAL; 3269 } 3270 3271 /* 3272 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 3273 * so fill it out automatically. 3274 */ 3275 cmd->nsid = spdk_nvme_ns_get_id(ns); 3276 3277 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 3278 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 3279 } 3280 3281 static void 3282 bdev_nvme_abort_admin_cmd(void *ctx) 3283 { 3284 struct nvme_bdev_io *bio = ctx; 3285 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 3286 struct nvme_io_channel *nvme_ch; 3287 struct nvme_bdev_io *bio_to_abort; 3288 int rc; 3289 3290 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 3291 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 3292 3293 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 3294 NULL, 3295 bio_to_abort, 3296 bdev_nvme_abort_done, bio); 3297 if (rc == -ENOENT) { 3298 /* If no admin command was found in admin qpair, complete the abort 3299 * request with failure. 3300 */ 3301 bio->cpl.cdw0 |= 1U; 3302 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 3303 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 3304 3305 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 3306 } 3307 } 3308 3309 static int 3310 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 3311 struct nvme_bdev_io *bio_to_abort) 3312 { 3313 int rc; 3314 3315 bio->orig_thread = spdk_get_thread(); 3316 3317 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 3318 nvme_ch->qpair, 3319 bio_to_abort, 3320 bdev_nvme_abort_done, bio); 3321 if (rc == -ENOENT) { 3322 /* If no command was found in I/O qpair, the target command may be 3323 * admin command. Only a single thread tries aborting admin command 3324 * to clean I/O flow. 
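* The abort is reissued on the controller's thread, this time targeting the admin queue pair.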
3325 */ 3326 spdk_thread_send_msg(nvme_ch->ctrlr->thread, 3327 bdev_nvme_abort_admin_cmd, bio); 3328 rc = 0; 3329 } 3330 3331 return rc; 3332 } 3333 3334 static void 3335 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 3336 struct nvme_bdev_ns *nvme_ns) 3337 { 3338 /* nop */ 3339 } 3340 3341 static void 3342 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 3343 { 3344 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 3345 } 3346 3347 static void 3348 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 3349 { 3350 const char *action; 3351 3352 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 3353 action = "reset"; 3354 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 3355 action = "abort"; 3356 } else { 3357 action = "none"; 3358 } 3359 3360 spdk_json_write_object_begin(w); 3361 3362 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 3363 3364 spdk_json_write_named_object_begin(w, "params"); 3365 spdk_json_write_named_string(w, "action_on_timeout", action); 3366 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 3367 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 3368 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 3369 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 3370 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 3371 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 3372 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 3373 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 3374 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 3375 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 3376 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 3377 spdk_json_write_object_end(w); 3378 3379 spdk_json_write_object_end(w); 3380 } 3381 3382 static void 3383 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 3384 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 3385 { 3386 struct spdk_nvme_transport_id *trid; 3387 3388 trid = nvme_bdev_ctrlr->connected_trid; 3389 3390 spdk_json_write_object_begin(w); 3391 3392 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 3393 3394 spdk_json_write_named_object_begin(w, "params"); 3395 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 3396 nvme_bdev_dump_trid_json(trid, w); 3397 spdk_json_write_named_bool(w, "prchk_reftag", 3398 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 3399 spdk_json_write_named_bool(w, "prchk_guard", 3400 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 3401 3402 spdk_json_write_object_end(w); 3403 3404 spdk_json_write_object_end(w); 3405 } 3406 3407 static void 3408 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 3409 { 3410 spdk_json_write_object_begin(w); 3411 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 3412 3413 spdk_json_write_named_object_begin(w, "params"); 3414 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 3415 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 3416 spdk_json_write_object_end(w); 3417 3418 spdk_json_write_object_end(w); 3419 } 3420 3421 static int 
3422 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 3423 { 3424 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 3425 uint32_t nsid; 3426 3427 bdev_nvme_opts_config_json(w); 3428 3429 pthread_mutex_lock(&g_bdev_nvme_mutex); 3430 3431 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 3432 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 3433 3434 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 3435 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 3436 continue; 3437 } 3438 3439 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 3440 } 3441 } 3442 3443 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 3444 * before enabling hotplug poller. 3445 */ 3446 bdev_nvme_hotplug_config_json(w); 3447 3448 pthread_mutex_unlock(&g_bdev_nvme_mutex); 3449 return 0; 3450 } 3451 3452 struct spdk_nvme_ctrlr * 3453 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 3454 { 3455 if (!bdev || bdev->module != &nvme_if) { 3456 return NULL; 3457 } 3458 3459 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 3460 } 3461 3462 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3463