1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "bdev_nvme.h" 37 #include "bdev_ocssd.h" 38 39 #include "spdk/config.h" 40 #include "spdk/endian.h" 41 #include "spdk/bdev.h" 42 #include "spdk/json.h" 43 #include "spdk/nvme.h" 44 #include "spdk/nvme_ocssd.h" 45 #include "spdk/thread.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 49 #include "spdk/bdev_module.h" 50 #include "spdk/log.h" 51 52 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 53 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 54 55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 56 57 struct nvme_bdev_io { 58 /** array of iovecs to transfer. */ 59 struct iovec *iovs; 60 61 /** Number of iovecs in iovs array. */ 62 int iovcnt; 63 64 /** Current iovec position. */ 65 int iovpos; 66 67 /** Offset in current iovec. */ 68 uint32_t iov_offset; 69 70 /** array of iovecs to transfer. */ 71 struct iovec *fused_iovs; 72 73 /** Number of iovecs in iovs array. */ 74 int fused_iovcnt; 75 76 /** Current iovec position. */ 77 int fused_iovpos; 78 79 /** Offset in current iovec. */ 80 uint32_t fused_iov_offset; 81 82 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 83 struct spdk_nvme_cpl cpl; 84 85 /** Originating thread */ 86 struct spdk_thread *orig_thread; 87 88 /** Keeps track if first of fused commands was submitted */ 89 bool first_fused_submitted; 90 }; 91 92 struct nvme_probe_ctx { 93 size_t count; 94 struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; 95 struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; 96 const char *names[NVME_MAX_CONTROLLERS]; 97 uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; 98 const char *hostnqn; 99 }; 100 101 struct nvme_probe_skip_entry { 102 struct spdk_nvme_transport_id trid; 103 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 104 }; 105 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 107 g_skipped_nvme_ctrlrs); 108 109 static struct spdk_bdev_nvme_opts g_opts = { 110 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 111 .timeout_us = 0, 112 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 113 .retry_count = 4, 114 .arbitration_burst = 0, 115 .low_priority_weight = 0, 116 .medium_priority_weight = 0, 117 .high_priority_weight = 0, 118 .nvme_adminq_poll_period_us = 10000ULL, 119 .nvme_ioq_poll_period_us = 0, 120 .io_queue_requests = 0, 121 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 122 }; 123 124 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 125 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 126 127 static int g_hot_insert_nvme_controller_index = 0; 128 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 129 static bool g_nvme_hotplug_enabled = false; 130 static struct spdk_thread *g_bdev_nvme_init_thread; 131 static struct spdk_poller *g_hotplug_poller; 132 static struct spdk_poller *g_hotplug_probe_poller; 133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 134 135 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 136 struct nvme_async_probe_ctx *ctx); 137 static void nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 138 struct nvme_async_probe_ctx *ctx); 139 static int bdev_nvme_library_init(void); 140 static void bdev_nvme_library_fini(void); 141 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 142 struct nvme_bdev_io *bio, 143 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 144 uint32_t flags); 145 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 146 struct nvme_bdev_io *bio, 147 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); 148 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 149 struct nvme_bdev_io *bio, 150 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 151 uint32_t flags); 152 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 153 struct nvme_bdev_io *bio, 154 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 155 uint32_t flags); 156 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, 157 struct spdk_nvme_qpair *qpair, 158 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 159 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 160 uint32_t flags); 161 static int bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, 162 struct nvme_bdev_io *bio, 163 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 164 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 165 struct nvme_bdev_io *bio, 166 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 167 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 168 struct nvme_bdev_io *bio, 169 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); 170 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch, 171 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 172 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio); 173 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove); 174 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 175 176 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 177 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 178 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 179 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 180 181 static populate_namespace_fn g_populate_namespace_fn[] = { 182 NULL, 183 nvme_ctrlr_populate_standard_namespace, 184 bdev_ocssd_populate_namespace, 185 }; 186 187 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns); 188 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns); 189 190 static depopulate_namespace_fn g_depopulate_namespace_fn[] = { 191 NULL, 192 nvme_ctrlr_depopulate_standard_namespace, 193 bdev_ocssd_depopulate_namespace, 194 }; 195 196 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, 197 struct nvme_bdev_ns *nvme_ns); 198 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 199 struct nvme_bdev_ns *nvme_ns); 200 201 static config_json_namespace_fn g_config_json_namespace_fn[] = { 202 NULL, 203 nvme_ctrlr_config_json_standard_namespace, 204 bdev_ocssd_namespace_config_json, 205 }; 206 207 struct spdk_nvme_qpair * 208 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 209 { 210 struct nvme_io_channel *nvme_ch; 211 212 nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 213 214 return nvme_ch->qpair; 215 } 216 217 static int 218 bdev_nvme_get_ctx_size(void) 219 { 220 return sizeof(struct nvme_bdev_io); 221 } 222 223 static struct spdk_bdev_module nvme_if = { 224 .name = "nvme", 225 .async_fini = true, 226 .module_init = bdev_nvme_library_init, 227 .module_fini = bdev_nvme_library_fini, 228 .config_json = bdev_nvme_config_json, 229 .get_ctx_size = bdev_nvme_get_ctx_size, 230 231 }; 232 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 233 234 static void 235 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 236 { 237 int rc; 238 239 SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair); 240 /* 241 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will 242 * reconnect a qpair and we will stop getting a callback for this one. 243 */ 244 rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair); 245 if (rc != 0) { 246 SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc); 247 } 248 } 249 250 static int 251 bdev_nvme_poll(void *arg) 252 { 253 struct nvme_bdev_poll_group *group = arg; 254 int64_t num_completions; 255 256 if (group->collect_spin_stat && group->start_ticks == 0) { 257 group->start_ticks = spdk_get_ticks(); 258 } 259 260 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 261 bdev_nvme_disconnected_qpair_cb); 262 if (group->collect_spin_stat) { 263 if (num_completions > 0) { 264 if (group->end_ticks != 0) { 265 group->spin_ticks += (group->end_ticks - group->start_ticks); 266 group->end_ticks = 0; 267 } 268 group->start_ticks = 0; 269 } else { 270 group->end_ticks = spdk_get_ticks(); 271 } 272 } 273 274 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 275 } 276 277 static int 278 bdev_nvme_poll_adminq(void *arg) 279 { 280 int32_t rc; 281 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 282 283 assert(nvme_bdev_ctrlr != NULL); 284 285 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); 286 if (rc < 0) { 287 bdev_nvme_failover(nvme_bdev_ctrlr, false); 288 } 289 290 return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 291 } 292 293 static int 294 bdev_nvme_destruct(void *ctx) 295 { 296 struct nvme_bdev *nvme_disk = ctx; 297 struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns; 298 299 nvme_ns->bdev = NULL; 300 301 nvme_bdev_ns_detach(nvme_ns); 302 303 free(nvme_disk->disk.name); 304 free(nvme_disk); 305 306 return 0; 307 } 308 309 static int 310 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 311 struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 312 { 313 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); 314 315 return 0; 316 } 317 318 static int 319 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch) 320 { 321 struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr; 322 struct spdk_nvme_io_qpair_opts opts; 323 int rc; 324 325 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 326 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 327 opts.create_only = true; 328 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 329 g_opts.io_queue_requests = opts.io_queue_requests; 330 331 nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); 332 if (nvme_ch->qpair == NULL) { 333 return -1; 334 } 335 336 assert(nvme_ch->group != NULL); 337 338 rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair); 339 if (rc != 0) { 340 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 341 goto err; 342 } 343 344 rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair); 345 if (rc != 0) { 346 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 347 goto err; 348 } 349 350 return 0; 351 352 err: 353 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 354 355 return rc; 356 } 357 358 static void 359 _bdev_nvme_reset_destruct_ctrlr(struct spdk_io_channel_iter *i, int status) 360 { 361 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 362 363 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct, 364 nvme_bdev_ctrlr); 365 } 366 367 static void 368 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 369 { 370 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 371 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 372 struct spdk_bdev_io *bdev_io; 373 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 374 375 /* A NULL ctx means success. */ 376 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 377 status = SPDK_BDEV_IO_STATUS_FAILED; 378 } 379 380 while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) { 381 bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets); 382 TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link); 383 spdk_bdev_io_complete(bdev_io, status); 384 } 385 386 spdk_for_each_channel_continue(i, 0); 387 } 388 389 static void 390 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) 391 { 392 /* we are using the for_each_channel cb_arg like a return code here. */ 393 /* If it's zero, we succeeded, otherwise, the reset failed. */ 394 void *cb_arg = NULL; 395 struct nvme_bdev_ctrlr_trid *curr_trid; 396 bool do_destruct = false; 397 398 if (rc) { 399 cb_arg = (void *)0x1; 400 SPDK_ERRLOG("Resetting controller failed.\n"); 401 } else { 402 SPDK_NOTICELOG("Resetting controller successful.\n"); 403 } 404 405 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 406 nvme_bdev_ctrlr->resetting = false; 407 nvme_bdev_ctrlr->failover_in_progress = false; 408 409 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 410 assert(curr_trid != NULL); 411 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 412 413 curr_trid->is_failed = cb_arg != NULL ? true : false; 414 415 if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) { 416 /* Destruct ctrlr after clearing pending resets. */ 417 do_destruct = true; 418 } 419 420 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 421 422 /* Make sure we clear any pending resets before returning. */ 423 spdk_for_each_channel(nvme_bdev_ctrlr, 424 _bdev_nvme_complete_pending_resets, 425 cb_arg, 426 do_destruct ? _bdev_nvme_reset_destruct_ctrlr : NULL); 427 } 428 429 static void 430 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 431 { 432 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 433 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 434 int rc = SPDK_BDEV_IO_STATUS_SUCCESS; 435 436 if (status) { 437 rc = SPDK_BDEV_IO_STATUS_FAILED; 438 } 439 if (bio) { 440 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc); 441 } 442 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); 443 } 444 445 static void 446 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 447 { 448 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 449 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 450 int rc; 451 452 rc = bdev_nvme_create_qpair(nvme_ch); 453 454 spdk_for_each_channel_continue(i, rc); 455 } 456 457 static void 458 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 459 { 460 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 461 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 462 int rc; 463 464 if (status) { 465 rc = status; 466 goto err; 467 } 468 469 rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr); 470 if (rc != 0) { 471 goto err; 472 } 473 474 /* Recreate all of the I/O queue pairs */ 475 spdk_for_each_channel(nvme_bdev_ctrlr, 476 _bdev_nvme_reset_create_qpair, 477 bio, 478 _bdev_nvme_reset_create_qpairs_done); 479 return; 480 481 err: 482 if (bio) { 483 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); 484 } 485 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc); 486 } 487 488 static void 489 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 490 { 491 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 492 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 493 int rc; 494 495 rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 496 if (!rc) { 497 nvme_ch->qpair = NULL; 498 } 499 500 spdk_for_each_channel_continue(i, rc); 501 } 502 503 static int 504 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, void *ctx) 505 { 506 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 507 if (nvme_bdev_ctrlr->destruct) { 508 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 509 return -EBUSY; 510 } 511 512 if (nvme_bdev_ctrlr->resetting) { 513 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 514 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 515 return -EAGAIN; 516 } 517 518 nvme_bdev_ctrlr->resetting = true; 519 520 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 521 522 /* First, delete all NVMe I/O queue pairs. */ 523 spdk_for_each_channel(nvme_bdev_ctrlr, 524 _bdev_nvme_reset_destroy_qpair, 525 ctx, 526 _bdev_nvme_reset_ctrlr); 527 528 return 0; 529 } 530 531 static int 532 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio) 533 { 534 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 535 int rc; 536 537 rc = _bdev_nvme_reset(nvme_ch->ctrlr, bio); 538 if (rc == -EBUSY) { 539 /* Don't bother resetting if the controller is in the process of being destructed. */ 540 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 541 return 0; 542 } else if (rc == -EAGAIN) { 543 /* 544 * Reset call is queued only if it is from the app framework. This is on purpose so that 545 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 546 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 547 */ 548 TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link); 549 return 0; 550 } else { 551 return rc; 552 } 553 } 554 555 static int 556 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 557 { 558 struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL; 559 int rc = 0; 560 561 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 562 if (nvme_bdev_ctrlr->destruct) { 563 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 564 /* Don't bother resetting if the controller is in the process of being destructed. */ 565 return 0; 566 } 567 568 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 569 assert(curr_trid); 570 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 571 next_trid = TAILQ_NEXT(curr_trid, link); 572 573 if (nvme_bdev_ctrlr->resetting) { 574 if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) { 575 rc = -EAGAIN; 576 } 577 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 578 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 579 return rc; 580 } 581 582 nvme_bdev_ctrlr->resetting = true; 583 curr_trid->is_failed = true; 584 585 if (next_trid) { 586 assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 587 588 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr, 589 curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid); 590 591 nvme_bdev_ctrlr->failover_in_progress = true; 592 spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr); 593 nvme_bdev_ctrlr->connected_trid = &next_trid->trid; 594 rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid); 595 assert(rc == 0); 596 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link); 597 if (!remove) { 598 /** Shuffle the old trid to the end of the list and use the new one. 599 * Allows for round robin through multiple connections. 600 */ 601 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link); 602 } else { 603 free(curr_trid); 604 } 605 } 606 607 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 608 609 /* First, delete all NVMe I/O queue pairs. */ 610 spdk_for_each_channel(nvme_bdev_ctrlr, 611 _bdev_nvme_reset_destroy_qpair, 612 NULL, 613 _bdev_nvme_reset_ctrlr); 614 615 return 0; 616 } 617 618 static int 619 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 620 struct nvme_bdev_io *bio, 621 uint64_t offset_blocks, 622 uint64_t num_blocks); 623 624 static void 625 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 626 bool success) 627 { 628 struct spdk_bdev *bdev = bdev_io->bdev; 629 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 630 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 631 struct nvme_bdev_ns *nvme_ns; 632 struct spdk_nvme_qpair *qpair; 633 int ret; 634 635 if (!success) { 636 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 637 return; 638 } 639 640 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 641 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 642 return; 643 } 644 645 ret = bdev_nvme_readv(nvme_ns->ns, 646 qpair, 647 (struct nvme_bdev_io *)bdev_io->driver_ctx, 648 bdev_io->u.bdev.iovs, 649 bdev_io->u.bdev.iovcnt, 650 bdev_io->u.bdev.md_buf, 651 bdev_io->u.bdev.num_blocks, 652 bdev_io->u.bdev.offset_blocks, 653 bdev->dif_check_flags); 654 655 if (spdk_likely(ret == 0)) { 656 return; 657 } else if (ret == -ENOMEM) { 658 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 659 } else { 660 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 661 } 662 } 663 664 static int 665 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 666 { 667 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 668 struct spdk_bdev *bdev = bdev_io->bdev; 669 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 670 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 671 struct nvme_bdev_io *nbdev_io_to_abort; 672 struct nvme_bdev_ns *nvme_ns; 673 struct spdk_nvme_qpair *qpair; 674 675 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 676 return -1; 677 } 678 679 switch (bdev_io->type) { 680 case SPDK_BDEV_IO_TYPE_READ: 681 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 682 return bdev_nvme_readv(nvme_ns->ns, 683 qpair, 684 nbdev_io, 685 bdev_io->u.bdev.iovs, 686 bdev_io->u.bdev.iovcnt, 687 bdev_io->u.bdev.md_buf, 688 bdev_io->u.bdev.num_blocks, 689 bdev_io->u.bdev.offset_blocks, 690 bdev->dif_check_flags); 691 } else { 692 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 693 bdev_io->u.bdev.num_blocks * bdev->blocklen); 694 return 0; 695 } 696 697 case SPDK_BDEV_IO_TYPE_WRITE: 698 return bdev_nvme_writev(nvme_ns->ns, 699 qpair, 700 nbdev_io, 701 bdev_io->u.bdev.iovs, 702 bdev_io->u.bdev.iovcnt, 703 bdev_io->u.bdev.md_buf, 704 bdev_io->u.bdev.num_blocks, 705 bdev_io->u.bdev.offset_blocks, 706 bdev->dif_check_flags); 707 708 case SPDK_BDEV_IO_TYPE_COMPARE: 709 return bdev_nvme_comparev(nvme_ns->ns, 710 qpair, 711 nbdev_io, 712 bdev_io->u.bdev.iovs, 713 bdev_io->u.bdev.iovcnt, 714 bdev_io->u.bdev.md_buf, 715 bdev_io->u.bdev.num_blocks, 716 bdev_io->u.bdev.offset_blocks, 717 bdev->dif_check_flags); 718 719 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 720 return bdev_nvme_comparev_and_writev(nvme_ns->ns, 721 qpair, 722 nbdev_io, 723 bdev_io->u.bdev.iovs, 724 bdev_io->u.bdev.iovcnt, 725 bdev_io->u.bdev.fused_iovs, 726 bdev_io->u.bdev.fused_iovcnt, 727 bdev_io->u.bdev.md_buf, 728 bdev_io->u.bdev.num_blocks, 729 bdev_io->u.bdev.offset_blocks, 730 bdev->dif_check_flags); 731 732 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 733 return bdev_nvme_unmap(nvme_ns->ns, 734 qpair, 735 nbdev_io, 736 bdev_io->u.bdev.offset_blocks, 737 bdev_io->u.bdev.num_blocks); 738 739 case SPDK_BDEV_IO_TYPE_UNMAP: 740 return bdev_nvme_unmap(nvme_ns->ns, 741 qpair, 742 nbdev_io, 743 bdev_io->u.bdev.offset_blocks, 744 bdev_io->u.bdev.num_blocks); 745 746 case SPDK_BDEV_IO_TYPE_RESET: 747 return bdev_nvme_reset(nvme_ch, nbdev_io); 748 749 case SPDK_BDEV_IO_TYPE_FLUSH: 750 return bdev_nvme_flush(nvme_ns->ns, 751 qpair, 752 nbdev_io, 753 bdev_io->u.bdev.offset_blocks, 754 bdev_io->u.bdev.num_blocks); 755 756 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 757 return bdev_nvme_admin_passthru(nvme_ch, 758 nbdev_io, 759 &bdev_io->u.nvme_passthru.cmd, 760 bdev_io->u.nvme_passthru.buf, 761 bdev_io->u.nvme_passthru.nbytes); 762 763 case SPDK_BDEV_IO_TYPE_NVME_IO: 764 return bdev_nvme_io_passthru(nvme_ns->ns, 765 qpair, 766 nbdev_io, 767 &bdev_io->u.nvme_passthru.cmd, 768 bdev_io->u.nvme_passthru.buf, 769 bdev_io->u.nvme_passthru.nbytes); 770 771 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 772 return bdev_nvme_io_passthru_md(nvme_ns->ns, 773 qpair, 774 nbdev_io, 775 &bdev_io->u.nvme_passthru.cmd, 776 bdev_io->u.nvme_passthru.buf, 777 bdev_io->u.nvme_passthru.nbytes, 778 bdev_io->u.nvme_passthru.md_buf, 779 bdev_io->u.nvme_passthru.md_len); 780 781 case SPDK_BDEV_IO_TYPE_ABORT: 782 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 783 return bdev_nvme_abort(nvme_ch, 784 nbdev_io, 785 nbdev_io_to_abort); 786 787 default: 788 return -EINVAL; 789 } 790 return 0; 791 } 792 793 static void 794 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 795 { 796 int rc = _bdev_nvme_submit_request(ch, bdev_io); 797 798 if (spdk_unlikely(rc != 0)) { 799 if (rc == -ENOMEM) { 800 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 801 } else { 802 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 803 } 804 } 805 } 806 807 static bool 808 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 809 { 810 struct nvme_bdev *nbdev = ctx; 811 struct nvme_bdev_ns *nvme_ns; 812 struct spdk_nvme_ns *ns; 813 struct spdk_nvme_ctrlr *ctrlr; 814 const struct spdk_nvme_ctrlr_data *cdata; 815 816 nvme_ns = nvme_bdev_to_bdev_ns(nbdev); 817 assert(nvme_ns != NULL); 818 ns = nvme_ns->ns; 819 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 820 821 switch (io_type) { 822 case SPDK_BDEV_IO_TYPE_READ: 823 case SPDK_BDEV_IO_TYPE_WRITE: 824 case SPDK_BDEV_IO_TYPE_RESET: 825 case SPDK_BDEV_IO_TYPE_FLUSH: 826 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 827 case SPDK_BDEV_IO_TYPE_NVME_IO: 828 case SPDK_BDEV_IO_TYPE_ABORT: 829 return true; 830 831 case SPDK_BDEV_IO_TYPE_COMPARE: 832 return spdk_nvme_ns_supports_compare(ns); 833 834 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 835 return spdk_nvme_ns_get_md_size(ns) ? true : false; 836 837 case SPDK_BDEV_IO_TYPE_UNMAP: 838 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 839 return cdata->oncs.dsm; 840 841 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 842 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 843 /* 844 * If an NVMe controller guarantees reading unallocated blocks returns zero, 845 * we can implement WRITE_ZEROES as an NVMe deallocate command. 846 */ 847 if (cdata->oncs.dsm && 848 spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) == 849 SPDK_NVME_DEALLOC_READ_00) { 850 return true; 851 } 852 /* 853 * The NVMe controller write_zeroes function is currently not used by our driver. 854 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. 855 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. 856 */ 857 return false; 858 859 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 860 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 861 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 862 return true; 863 } 864 return false; 865 866 default: 867 return false; 868 } 869 } 870 871 static int 872 bdev_nvme_create_cb(void *io_device, void *ctx_buf) 873 { 874 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; 875 struct nvme_io_channel *nvme_ch = ctx_buf; 876 struct spdk_io_channel *pg_ch = NULL; 877 int rc; 878 879 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 880 rc = bdev_ocssd_create_io_channel(nvme_ch); 881 if (rc != 0) { 882 return rc; 883 } 884 } 885 886 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 887 if (!pg_ch) { 888 rc = -1; 889 goto err_pg_ch; 890 } 891 892 nvme_ch->group = spdk_io_channel_get_ctx(pg_ch); 893 894 #ifdef SPDK_CONFIG_VTUNE 895 nvme_ch->group->collect_spin_stat = true; 896 #else 897 nvme_ch->group->collect_spin_stat = false; 898 #endif 899 900 TAILQ_INIT(&nvme_ch->pending_resets); 901 902 nvme_ch->ctrlr = nvme_bdev_ctrlr; 903 904 rc = bdev_nvme_create_qpair(nvme_ch); 905 if (rc != 0) { 906 goto err_qpair; 907 } 908 909 return 0; 910 911 err_qpair: 912 spdk_put_io_channel(pg_ch); 913 err_pg_ch: 914 if (nvme_ch->ocssd_ch) { 915 bdev_ocssd_destroy_io_channel(nvme_ch); 916 } 917 918 return rc; 919 } 920 921 static void 922 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) 923 { 924 struct nvme_io_channel *nvme_ch = ctx_buf; 925 926 assert(nvme_ch->group != NULL); 927 928 if (nvme_ch->ocssd_ch != NULL) { 929 bdev_ocssd_destroy_io_channel(nvme_ch); 930 } 931 932 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 933 934 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group)); 935 } 936 937 static int 938 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) 939 { 940 struct nvme_bdev_poll_group *group = ctx_buf; 941 942 group->group = spdk_nvme_poll_group_create(group); 943 if (group->group == NULL) { 944 return -1; 945 } 946 947 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 948 949 if (group->poller == NULL) { 950 spdk_nvme_poll_group_destroy(group->group); 951 return -1; 952 } 953 954 return 0; 955 } 956 957 static void 958 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) 959 { 960 struct nvme_bdev_poll_group *group = ctx_buf; 961 962 spdk_poller_unregister(&group->poller); 963 if (spdk_nvme_poll_group_destroy(group->group)) { 964 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module."); 965 assert(false); 966 } 967 } 968 969 static struct spdk_io_channel * 970 bdev_nvme_get_io_channel(void *ctx) 971 { 972 struct nvme_bdev *nvme_bdev = ctx; 973 974 return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 975 } 976 977 static void * 978 bdev_nvme_get_module_ctx(void *ctx) 979 { 980 struct nvme_bdev *nvme_bdev = ctx; 981 982 return bdev_nvme_get_ctrlr(&nvme_bdev->disk); 983 } 984 985 static int 986 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 987 { 988 struct nvme_bdev *nvme_bdev = ctx; 989 struct nvme_bdev_ns *nvme_ns; 990 struct spdk_nvme_ns *ns; 991 struct spdk_nvme_ctrlr *ctrlr; 992 const struct spdk_nvme_ctrlr_data *cdata; 993 const struct spdk_nvme_transport_id *trid; 994 union spdk_nvme_vs_register vs; 995 union spdk_nvme_csts_register csts; 996 char buf[128]; 997 998 nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev); 999 assert(nvme_ns != NULL); 1000 ns = nvme_ns->ns; 1001 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 1002 1003 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1004 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1005 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 1006 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1007 1008 spdk_json_write_named_object_begin(w, "nvme"); 1009 1010 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1011 spdk_json_write_named_string(w, "pci_address", trid->traddr); 1012 } 1013 1014 spdk_json_write_named_object_begin(w, "trid"); 1015 1016 nvme_bdev_dump_trid_json(trid, w); 1017 1018 spdk_json_write_object_end(w); 1019 1020 #ifdef SPDK_CONFIG_NVME_CUSE 1021 size_t cuse_name_size = 128; 1022 char cuse_name[cuse_name_size]; 1023 1024 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1025 cuse_name, &cuse_name_size); 1026 if (rc == 0) { 1027 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1028 } 1029 #endif 1030 1031 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1032 1033 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1034 1035 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1036 spdk_str_trim(buf); 1037 spdk_json_write_named_string(w, "model_number", buf); 1038 1039 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1040 spdk_str_trim(buf); 1041 spdk_json_write_named_string(w, "serial_number", buf); 1042 1043 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1044 spdk_str_trim(buf); 1045 spdk_json_write_named_string(w, "firmware_revision", buf); 1046 1047 if (cdata->subnqn[0] != '\0') { 1048 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1049 } 1050 1051 spdk_json_write_named_object_begin(w, "oacs"); 1052 1053 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1054 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1055 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1056 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 1057 1058 spdk_json_write_object_end(w); 1059 1060 spdk_json_write_object_end(w); 1061 1062 spdk_json_write_named_object_begin(w, "vs"); 1063 1064 spdk_json_write_name(w, "nvme_version"); 1065 if (vs.bits.ter) { 1066 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1067 } else { 1068 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1069 } 1070 1071 spdk_json_write_object_end(w); 1072 1073 spdk_json_write_named_object_begin(w, "csts"); 1074 1075 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1076 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1077 1078 spdk_json_write_object_end(w); 1079 1080 spdk_json_write_named_object_begin(w, "ns_data"); 1081 1082 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1083 1084 spdk_json_write_object_end(w); 1085 1086 if (cdata->oacs.security) { 1087 spdk_json_write_named_object_begin(w, "security"); 1088 1089 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1090 1091 spdk_json_write_object_end(w); 1092 } 1093 1094 spdk_json_write_object_end(w); 1095 1096 return 0; 1097 } 1098 1099 static void 1100 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1101 { 1102 /* No config per bdev needed */ 1103 } 1104 1105 static uint64_t 1106 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1107 { 1108 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 1109 struct nvme_bdev_poll_group *group = nvme_ch->group; 1110 uint64_t spin_time; 1111 1112 if (!group || !group->collect_spin_stat) { 1113 return 0; 1114 } 1115 1116 if (group->end_ticks != 0) { 1117 group->spin_ticks += (group->end_ticks - group->start_ticks); 1118 group->end_ticks = 0; 1119 } 1120 1121 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1122 group->start_ticks = 0; 1123 group->spin_ticks = 0; 1124 1125 return spin_time; 1126 } 1127 1128 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1129 .destruct = bdev_nvme_destruct, 1130 .submit_request = bdev_nvme_submit_request, 1131 .io_type_supported = bdev_nvme_io_type_supported, 1132 .get_io_channel = bdev_nvme_get_io_channel, 1133 .dump_info_json = bdev_nvme_dump_info_json, 1134 .write_config_json = bdev_nvme_write_config_json, 1135 .get_spin_time = bdev_nvme_get_spin_time, 1136 .get_module_ctx = bdev_nvme_get_module_ctx, 1137 }; 1138 1139 static int 1140 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1141 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1142 uint32_t prchk_flags, void *ctx) 1143 { 1144 const struct spdk_uuid *uuid; 1145 const struct spdk_nvme_ctrlr_data *cdata; 1146 const struct spdk_nvme_ns_data *nsdata; 1147 int rc; 1148 1149 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1150 1151 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1152 if (!disk->name) { 1153 return -ENOMEM; 1154 } 1155 disk->product_name = "NVMe disk"; 1156 1157 disk->write_cache = 0; 1158 if (cdata->vwc.present) { 1159 /* Enable if the Volatile Write Cache exists */ 1160 disk->write_cache = 1; 1161 } 1162 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1163 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1164 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1165 1166 uuid = spdk_nvme_ns_get_uuid(ns); 1167 if (uuid != NULL) { 1168 disk->uuid = *uuid; 1169 } 1170 1171 nsdata = spdk_nvme_ns_get_data(ns); 1172 1173 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1174 if (disk->md_len != 0) { 1175 disk->md_interleave = nsdata->flbas.extended; 1176 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1177 if (disk->dif_type != SPDK_DIF_DISABLE) { 1178 disk->dif_is_head_of_md = nsdata->dps.md_start; 1179 disk->dif_check_flags = prchk_flags; 1180 } 1181 } 1182 1183 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1184 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1185 disk->acwu = 0; 1186 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1187 disk->acwu = nsdata->nacwu; 1188 } else { 1189 disk->acwu = cdata->acwu; 1190 } 1191 1192 disk->ctxt = ctx; 1193 disk->fn_table = &nvmelib_fn_table; 1194 disk->module = &nvme_if; 1195 rc = spdk_bdev_register(disk); 1196 if (rc) { 1197 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1198 free(disk->name); 1199 return rc; 1200 } 1201 1202 return 0; 1203 } 1204 1205 static int 1206 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1207 { 1208 struct nvme_bdev *bdev; 1209 int rc; 1210 1211 bdev = calloc(1, sizeof(*bdev)); 1212 if (!bdev) { 1213 SPDK_ERRLOG("bdev calloc() failed\n"); 1214 return -ENOMEM; 1215 } 1216 1217 bdev->nvme_ns = nvme_ns; 1218 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1219 1220 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1221 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1222 if (rc != 0) { 1223 SPDK_ERRLOG("Failed to create NVMe disk\n"); 1224 free(bdev); 1225 return rc; 1226 } 1227 1228 nvme_ns->ref++; 1229 nvme_ns->bdev = bdev; 1230 1231 return 0; 1232 } 1233 1234 static void 1235 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1236 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1237 { 1238 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1239 struct spdk_nvme_ns *ns; 1240 int rc = 0; 1241 1242 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1243 if (!ns) { 1244 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1245 rc = -EINVAL; 1246 goto done; 1247 } 1248 1249 nvme_ns->ns = ns; 1250 nvme_ns->ref = 1; 1251 1252 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1253 done: 1254 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1255 } 1256 1257 static bool 1258 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1259 struct spdk_nvme_ctrlr_opts *opts) 1260 { 1261 struct nvme_probe_skip_entry *entry; 1262 1263 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1264 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1265 return false; 1266 } 1267 } 1268 1269 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1270 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1271 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1272 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1273 1274 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1275 1276 return true; 1277 } 1278 1279 static void 1280 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1281 { 1282 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1283 1284 if (spdk_nvme_cpl_is_error(cpl)) { 1285 SPDK_WARNLOG("Abort failed. Resetting controller.\n"); 1286 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1287 } 1288 } 1289 1290 static void 1291 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1292 struct spdk_nvme_qpair *qpair, uint16_t cid) 1293 { 1294 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1295 union spdk_nvme_csts_register csts; 1296 int rc; 1297 1298 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1299 1300 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1301 1302 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1303 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1304 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1305 * completion recursively. 1306 */ 1307 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1308 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1309 if (csts.bits.cfs) { 1310 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1311 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1312 return; 1313 } 1314 } 1315 1316 switch (g_opts.action_on_timeout) { 1317 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1318 if (qpair) { 1319 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 1320 nvme_abort_cpl, nvme_bdev_ctrlr); 1321 if (rc == 0) { 1322 return; 1323 } 1324 1325 SPDK_ERRLOG("Unable to send abort. Resetting.\n"); 1326 } 1327 1328 /* FALLTHROUGH */ 1329 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 1330 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1331 break; 1332 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 1333 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 1334 break; 1335 default: 1336 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 1337 break; 1338 } 1339 } 1340 1341 void 1342 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns) 1343 { 1344 nvme_bdev_ns_detach(nvme_ns); 1345 } 1346 1347 static void 1348 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns) 1349 { 1350 struct nvme_bdev *bdev; 1351 1352 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1353 if (bdev != NULL) { 1354 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 1355 } 1356 1357 nvme_ns->populated = false; 1358 1359 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 1360 } 1361 1362 static void 1363 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns, 1364 struct nvme_async_probe_ctx *ctx) 1365 { 1366 g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx); 1367 } 1368 1369 static void 1370 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns) 1371 { 1372 g_depopulate_namespace_fn[nvme_ns->type](nvme_ns); 1373 } 1374 1375 void 1376 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, 1377 struct nvme_bdev_ns *nvme_ns, int rc) 1378 { 1379 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = nvme_ns->ctrlr; 1380 1381 assert(nvme_bdev_ctrlr != NULL); 1382 1383 if (rc == 0) { 1384 nvme_ns->populated = true; 1385 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1386 nvme_bdev_ctrlr->ref++; 1387 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1388 } else { 1389 memset(nvme_ns, 0, sizeof(*nvme_ns)); 1390 } 1391 1392 if (ctx) { 1393 ctx->populates_in_progress--; 1394 if (ctx->populates_in_progress == 0) { 1395 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1396 } 1397 } 1398 } 1399 1400 static void 1401 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1402 struct nvme_async_probe_ctx *ctx) 1403 { 1404 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1405 struct nvme_bdev_ns *nvme_ns; 1406 struct spdk_nvme_ns *ns; 1407 struct nvme_bdev *bdev; 1408 uint32_t i; 1409 int rc; 1410 uint64_t num_sectors; 1411 bool ns_is_active; 1412 1413 if (ctx) { 1414 /* Initialize this count to 1 to handle the populate functions 1415 * calling nvme_ctrlr_populate_namespace_done() immediately. 1416 */ 1417 ctx->populates_in_progress = 1; 1418 } 1419 1420 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1421 uint32_t nsid = i + 1; 1422 1423 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1424 ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); 1425 1426 if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) { 1427 /* NS is still there but attributes may have changed */ 1428 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1429 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 1430 bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1431 assert(bdev != NULL); 1432 if (bdev->disk.blockcnt != num_sectors) { 1433 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 1434 nsid, 1435 bdev->disk.name, 1436 bdev->disk.blockcnt, 1437 num_sectors); 1438 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 1439 if (rc != 0) { 1440 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 1441 bdev->disk.name, rc); 1442 } 1443 } 1444 } 1445 1446 if (!nvme_ns->populated && ns_is_active) { 1447 nvme_ns->id = nsid; 1448 nvme_ns->ctrlr = nvme_bdev_ctrlr; 1449 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 1450 nvme_ns->type = NVME_BDEV_NS_OCSSD; 1451 } else { 1452 nvme_ns->type = NVME_BDEV_NS_STANDARD; 1453 } 1454 1455 nvme_ns->bdev = NULL; 1456 1457 if (ctx) { 1458 ctx->populates_in_progress++; 1459 } 1460 nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx); 1461 } 1462 1463 if (nvme_ns->populated && !ns_is_active) { 1464 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1465 } 1466 } 1467 1468 if (ctx) { 1469 /* Decrement this count now that the loop is over to account 1470 * for the one we started with. If the count is then 0, we 1471 * know any populate_namespace functions completed immediately, 1472 * so we'll kick the callback here. 1473 */ 1474 ctx->populates_in_progress--; 1475 if (ctx->populates_in_progress == 0) { 1476 nvme_ctrlr_populate_namespaces_done(nvme_bdev_ctrlr, ctx); 1477 } 1478 } 1479 1480 } 1481 1482 static void 1483 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 1484 { 1485 uint32_t i; 1486 struct nvme_bdev_ns *nvme_ns; 1487 1488 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1489 uint32_t nsid = i + 1; 1490 1491 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1492 if (nvme_ns->populated) { 1493 assert(nvme_ns->id == nsid); 1494 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1495 } 1496 } 1497 } 1498 1499 static void 1500 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1501 { 1502 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 1503 union spdk_nvme_async_event_completion event; 1504 1505 if (spdk_nvme_cpl_is_error(cpl)) { 1506 SPDK_WARNLOG("AER request execute failed"); 1507 return; 1508 } 1509 1510 event.raw = cpl->cdw0; 1511 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1512 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1513 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1514 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && 1515 (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && 1516 spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1517 bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); 1518 } 1519 } 1520 1521 static int 1522 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1523 const char *name, 1524 const struct spdk_nvme_transport_id *trid, 1525 uint32_t prchk_flags, 1526 struct nvme_bdev_ctrlr **_nvme_bdev_ctrlr) 1527 { 1528 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1529 struct nvme_bdev_ctrlr_trid *trid_entry; 1530 uint32_t i; 1531 int rc; 1532 1533 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1534 if (nvme_bdev_ctrlr == NULL) { 1535 SPDK_ERRLOG("Failed to allocate device struct\n"); 1536 return -ENOMEM; 1537 } 1538 1539 rc = pthread_mutex_init(&nvme_bdev_ctrlr->mutex, NULL); 1540 if (rc != 0) { 1541 goto err_init_mutex; 1542 } 1543 1544 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1545 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1546 if (nvme_bdev_ctrlr->num_ns != 0) { 1547 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1548 if (!nvme_bdev_ctrlr->namespaces) { 1549 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1550 rc = -ENOMEM; 1551 goto err_alloc_namespaces; 1552 } 1553 } 1554 1555 trid_entry = calloc(1, sizeof(*trid_entry)); 1556 if (trid_entry == NULL) { 1557 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1558 rc = -ENOMEM; 1559 goto err_alloc_trid; 1560 } 1561 1562 trid_entry->trid = *trid; 1563 1564 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1565 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1566 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1567 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1568 rc = -ENOMEM; 1569 goto err_alloc_namespace; 1570 } 1571 } 1572 1573 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1574 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1575 nvme_bdev_ctrlr->ctrlr = ctrlr; 1576 nvme_bdev_ctrlr->ref = 1; 1577 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1578 nvme_bdev_ctrlr->name = strdup(name); 1579 if (nvme_bdev_ctrlr->name == NULL) { 1580 rc = -ENOMEM; 1581 goto err_alloc_name; 1582 } 1583 1584 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1585 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1586 if (spdk_unlikely(rc != 0)) { 1587 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1588 goto err_init_ocssd; 1589 } 1590 } 1591 1592 nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1593 1594 spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 1595 sizeof(struct nvme_io_channel), 1596 name); 1597 1598 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1599 g_opts.nvme_adminq_poll_period_us); 1600 1601 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1602 1603 if (g_opts.timeout_us > 0) { 1604 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1605 timeout_cb, nvme_bdev_ctrlr); 1606 } 1607 1608 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1609 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1610 1611 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1612 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1613 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1614 if (nvme_bdev_ctrlr->opal_dev == NULL) { 1615 SPDK_ERRLOG("Failed to initialize Opal\n"); 1616 } 1617 } 1618 1619 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1620 1621 if (_nvme_bdev_ctrlr != NULL) { 1622 *_nvme_bdev_ctrlr = nvme_bdev_ctrlr; 1623 } 1624 return 0; 1625 1626 err_init_ocssd: 1627 free(nvme_bdev_ctrlr->name); 1628 err_alloc_name: 1629 err_alloc_namespace: 1630 for (; i > 0; i--) { 1631 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1632 } 1633 free(trid_entry); 1634 err_alloc_trid: 1635 free(nvme_bdev_ctrlr->namespaces); 1636 err_alloc_namespaces: 1637 pthread_mutex_destroy(&nvme_bdev_ctrlr->mutex); 1638 err_init_mutex: 1639 free(nvme_bdev_ctrlr); 1640 return rc; 1641 } 1642 1643 static void 1644 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1645 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1646 { 1647 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1648 struct nvme_probe_ctx *ctx = cb_ctx; 1649 char *name = NULL; 1650 uint32_t prchk_flags = 0; 1651 size_t i; 1652 int rc; 1653 1654 if (ctx) { 1655 for (i = 0; i < ctx->count; i++) { 1656 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1657 prchk_flags = ctx->prchk_flags[i]; 1658 name = strdup(ctx->names[i]); 1659 break; 1660 } 1661 } 1662 } else { 1663 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1664 } 1665 if (!name) { 1666 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1667 return; 1668 } 1669 1670 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1671 1672 rc = nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags, &nvme_bdev_ctrlr); 1673 if (rc != 0) { 1674 SPDK_ERRLOG("Failed to create new NVMe controller\n"); 1675 free(name); 1676 return; 1677 } 1678 1679 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1680 1681 free(name); 1682 } 1683 1684 static void 1685 _nvme_bdev_ctrlr_destruct(void *ctx) 1686 { 1687 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1688 1689 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1690 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1691 } 1692 1693 static int 1694 _bdev_nvme_delete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool hotplug) 1695 { 1696 struct nvme_probe_skip_entry *entry; 1697 1698 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1699 1700 /* The controller's destruction was already started */ 1701 if (nvme_bdev_ctrlr->destruct) { 1702 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1703 return 0; 1704 } 1705 1706 if (!hotplug && 1707 nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1708 entry = calloc(1, sizeof(*entry)); 1709 if (!entry) { 1710 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1711 return -ENOMEM; 1712 } 1713 entry->trid = *nvme_bdev_ctrlr->connected_trid; 1714 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 1715 } 1716 1717 nvme_bdev_ctrlr->destruct = true; 1718 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1719 1720 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1721 1722 return 0; 1723 } 1724 1725 static void 1726 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 1727 { 1728 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 1729 1730 _bdev_nvme_delete(nvme_bdev_ctrlr, true); 1731 } 1732 1733 static int 1734 bdev_nvme_hotplug_probe(void *arg) 1735 { 1736 if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 1737 g_hotplug_probe_ctx = NULL; 1738 spdk_poller_unregister(&g_hotplug_probe_poller); 1739 } 1740 1741 return SPDK_POLLER_BUSY; 1742 } 1743 1744 static int 1745 bdev_nvme_hotplug(void *arg) 1746 { 1747 struct spdk_nvme_transport_id trid_pcie; 1748 1749 if (g_hotplug_probe_ctx) { 1750 return SPDK_POLLER_BUSY; 1751 } 1752 1753 memset(&trid_pcie, 0, sizeof(trid_pcie)); 1754 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 1755 1756 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 1757 hotplug_probe_cb, attach_cb, NULL); 1758 1759 if (g_hotplug_probe_ctx) { 1760 assert(g_hotplug_probe_poller == NULL); 1761 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 1762 } 1763 1764 return SPDK_POLLER_BUSY; 1765 } 1766 1767 void 1768 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 1769 { 1770 *opts = g_opts; 1771 } 1772 1773 int 1774 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 1775 { 1776 if (g_bdev_nvme_init_thread != NULL) { 1777 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 1778 return -EPERM; 1779 } 1780 } 1781 1782 g_opts = *opts; 1783 1784 return 0; 1785 } 1786 1787 struct set_nvme_hotplug_ctx { 1788 uint64_t period_us; 1789 bool enabled; 1790 spdk_msg_fn fn; 1791 void *fn_ctx; 1792 }; 1793 1794 static void 1795 set_nvme_hotplug_period_cb(void *_ctx) 1796 { 1797 struct set_nvme_hotplug_ctx *ctx = _ctx; 1798 1799 spdk_poller_unregister(&g_hotplug_poller); 1800 if (ctx->enabled) { 1801 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 1802 } 1803 1804 g_nvme_hotplug_poll_period_us = ctx->period_us; 1805 g_nvme_hotplug_enabled = ctx->enabled; 1806 if (ctx->fn) { 1807 ctx->fn(ctx->fn_ctx); 1808 } 1809 1810 free(ctx); 1811 } 1812 1813 int 1814 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 1815 { 1816 struct set_nvme_hotplug_ctx *ctx; 1817 1818 if (enabled == true && !spdk_process_is_primary()) { 1819 return -EPERM; 1820 } 1821 1822 ctx = calloc(1, sizeof(*ctx)); 1823 if (ctx == NULL) { 1824 return -ENOMEM; 1825 } 1826 1827 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 1828 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 1829 ctx->enabled = enabled; 1830 ctx->fn = cb; 1831 ctx->fn_ctx = cb_ctx; 1832 1833 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 1834 return 0; 1835 } 1836 1837 static void 1838 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1839 { 1840 if (ctx->cb_fn) { 1841 ctx->cb_fn(ctx->cb_ctx, count, rc); 1842 } 1843 1844 ctx->namespaces_populated = true; 1845 if (ctx->probe_done) { 1846 /* The probe was already completed, so we need to free the context 1847 * here. This can happen for cases like OCSSD, where we need to 1848 * send additional commands to the SSD after attach. 1849 */ 1850 free(ctx); 1851 } 1852 } 1853 1854 static void 1855 nvme_ctrlr_populate_namespaces_done(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1856 struct nvme_async_probe_ctx *ctx) 1857 { 1858 struct nvme_bdev_ns *nvme_ns; 1859 struct nvme_bdev *nvme_bdev; 1860 uint32_t i, nsid; 1861 size_t j; 1862 1863 assert(nvme_bdev_ctrlr != NULL); 1864 1865 /* 1866 * Report the new bdevs that were created in this call. 1867 * There can be more than one bdev per NVMe controller. 1868 */ 1869 j = 0; 1870 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1871 nsid = i + 1; 1872 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1873 if (!nvme_ns->populated) { 1874 continue; 1875 } 1876 assert(nvme_ns->id == nsid); 1877 nvme_bdev = nvme_bdev_ns_to_bdev(nvme_ns); 1878 if (nvme_bdev == NULL) { 1879 assert(nvme_ns->type == NVME_BDEV_NS_OCSSD); 1880 continue; 1881 } 1882 if (j < ctx->count) { 1883 ctx->names[j] = nvme_bdev->disk.name; 1884 j++; 1885 } else { 1886 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 1887 ctx->count); 1888 populate_namespaces_cb(ctx, 0, -ERANGE); 1889 return; 1890 } 1891 } 1892 1893 populate_namespaces_cb(ctx, j, 0); 1894 } 1895 1896 static bool 1897 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 1898 { 1899 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 1900 1901 nsdata1 = spdk_nvme_ns_get_data(ns1); 1902 nsdata2 = spdk_nvme_ns_get_data(ns2); 1903 1904 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)); 1905 } 1906 1907 static int 1908 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr, 1909 struct spdk_nvme_transport_id *trid) 1910 { 1911 uint32_t i, nsid; 1912 struct nvme_bdev_ns *nvme_ns; 1913 struct spdk_nvme_ns *new_ns; 1914 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 1915 int rc = 0; 1916 1917 assert(nvme_bdev_ctrlr != NULL); 1918 1919 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1920 SPDK_ERRLOG("PCIe failover is not supported.\n"); 1921 return -ENOTSUP; 1922 } 1923 1924 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 1925 1926 /* Currently we only support failover to the same transport type. */ 1927 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 1928 rc = -EINVAL; 1929 goto exit; 1930 } 1931 1932 /* Currently we only support failover to the same NQN. */ 1933 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 1934 rc = -EINVAL; 1935 goto exit; 1936 } 1937 1938 /* Skip all the other checks if we've already registered this path. */ 1939 TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) { 1940 if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) { 1941 rc = -EEXIST; 1942 goto exit; 1943 } 1944 } 1945 1946 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 1947 rc = -EINVAL; 1948 goto exit; 1949 } 1950 1951 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1952 nsid = i + 1; 1953 1954 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1955 if (!nvme_ns->populated) { 1956 continue; 1957 } 1958 1959 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 1960 assert(new_ns != NULL); 1961 1962 if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) { 1963 rc = -EINVAL; 1964 goto exit; 1965 } 1966 } 1967 1968 new_trid = calloc(1, sizeof(*new_trid)); 1969 if (new_trid == NULL) { 1970 rc = -ENOMEM; 1971 goto exit; 1972 } 1973 new_trid->trid = *trid; 1974 new_trid->is_failed = false; 1975 1976 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 1977 if (tmp_trid->is_failed) { 1978 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 1979 goto exit; 1980 } 1981 } 1982 1983 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 1984 1985 exit: 1986 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 1987 return rc; 1988 } 1989 1990 static void 1991 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1992 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1993 { 1994 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 1995 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1996 struct nvme_async_probe_ctx *ctx; 1997 int rc; 1998 1999 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 2000 ctx->ctrlr_attached = true; 2001 2002 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 2003 if (nvme_bdev_ctrlr) { 2004 /* This is the case that a secondary path is added to an existing 2005 * nvme_bdev_ctrlr for failover. After checking if it can access the same 2006 * namespaces as the primary path, it is disconnected until failover occurs. 2007 */ 2008 rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid); 2009 2010 spdk_nvme_detach(ctrlr); 2011 goto exit; 2012 } 2013 2014 rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags, 2015 &nvme_bdev_ctrlr); 2016 if (rc) { 2017 SPDK_ERRLOG("Failed to create new device\n"); 2018 goto exit; 2019 } 2020 2021 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 2022 return; 2023 2024 exit: 2025 populate_namespaces_cb(ctx, 0, rc); 2026 } 2027 2028 static int 2029 bdev_nvme_async_poll(void *arg) 2030 { 2031 struct nvme_async_probe_ctx *ctx = arg; 2032 int rc; 2033 2034 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 2035 if (spdk_unlikely(rc != -EAGAIN)) { 2036 ctx->probe_done = true; 2037 spdk_poller_unregister(&ctx->poller); 2038 if (!ctx->ctrlr_attached) { 2039 /* The probe is done, but no controller was attached. 2040 * That means we had a failure, so report -EIO back to 2041 * the caller (usually the RPC). populate_namespaces_cb() 2042 * will take care of freeing the nvme_async_probe_ctx. 2043 */ 2044 populate_namespaces_cb(ctx, 0, -EIO); 2045 } else if (ctx->namespaces_populated) { 2046 /* The namespaces for the attached controller were all 2047 * populated and the response was already sent to the 2048 * caller (usually the RPC). So free the context here. 2049 */ 2050 free(ctx); 2051 } 2052 } 2053 2054 return SPDK_POLLER_BUSY; 2055 } 2056 2057 int 2058 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2059 struct spdk_nvme_host_id *hostid, 2060 const char *base_name, 2061 const char **names, 2062 uint32_t count, 2063 const char *hostnqn, 2064 uint32_t prchk_flags, 2065 spdk_bdev_create_nvme_fn cb_fn, 2066 void *cb_ctx, 2067 struct spdk_nvme_ctrlr_opts *opts) 2068 { 2069 struct nvme_probe_skip_entry *entry, *tmp; 2070 struct nvme_async_probe_ctx *ctx; 2071 2072 /* TODO expand this check to include both the host and target TRIDs. 2073 * Only if both are the same should we fail. 2074 */ 2075 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2076 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2077 return -EEXIST; 2078 } 2079 2080 ctx = calloc(1, sizeof(*ctx)); 2081 if (!ctx) { 2082 return -ENOMEM; 2083 } 2084 ctx->base_name = base_name; 2085 ctx->names = names; 2086 ctx->count = count; 2087 ctx->cb_fn = cb_fn; 2088 ctx->cb_ctx = cb_ctx; 2089 ctx->prchk_flags = prchk_flags; 2090 ctx->trid = *trid; 2091 2092 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2093 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2094 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2095 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2096 free(entry); 2097 break; 2098 } 2099 } 2100 } 2101 2102 if (opts) { 2103 memcpy(&ctx->opts, opts, sizeof(*opts)); 2104 } else { 2105 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2106 } 2107 2108 ctx->opts.transport_retry_count = g_opts.retry_count; 2109 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2110 2111 if (hostnqn) { 2112 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2113 } 2114 2115 if (hostid->hostaddr[0] != '\0') { 2116 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2117 } 2118 2119 if (hostid->hostsvcid[0] != '\0') { 2120 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2121 } 2122 2123 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2124 if (ctx->probe_ctx == NULL) { 2125 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2126 free(ctx); 2127 return -ENODEV; 2128 } 2129 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2130 2131 return 0; 2132 } 2133 2134 int 2135 bdev_nvme_delete(const char *name, const struct spdk_nvme_transport_id *trid) 2136 { 2137 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2138 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2139 2140 if (name == NULL) { 2141 return -EINVAL; 2142 } 2143 2144 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2145 if (nvme_bdev_ctrlr == NULL) { 2146 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2147 return -ENODEV; 2148 } 2149 2150 /* case 1: remove the controller itself. */ 2151 if (trid == NULL) { 2152 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2153 } 2154 2155 /* case 2: we are currently using the path to be removed. */ 2156 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2157 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2158 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2159 /* case 2A: the current path is the only path. */ 2160 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2161 return _bdev_nvme_delete(nvme_bdev_ctrlr, false); 2162 } 2163 2164 /* case 1B: there is an alternative path. */ 2165 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2166 } 2167 /* case 3: We are not using the specified path. */ 2168 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2169 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2170 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2171 free(ctrlr_trid); 2172 return 0; 2173 } 2174 } 2175 2176 /* case 3A: The address isn't even in the registered list. */ 2177 return -ENXIO; 2178 } 2179 2180 static int 2181 bdev_nvme_library_init(void) 2182 { 2183 g_bdev_nvme_init_thread = spdk_get_thread(); 2184 2185 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2186 bdev_nvme_poll_group_destroy_cb, 2187 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2188 2189 return 0; 2190 } 2191 2192 static void 2193 bdev_nvme_library_fini(void) 2194 { 2195 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2196 struct nvme_probe_skip_entry *entry, *entry_tmp; 2197 2198 spdk_poller_unregister(&g_hotplug_poller); 2199 free(g_hotplug_probe_ctx); 2200 2201 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2202 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2203 free(entry); 2204 } 2205 2206 pthread_mutex_lock(&g_bdev_nvme_mutex); 2207 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2208 pthread_mutex_lock(&nvme_bdev_ctrlr->mutex); 2209 if (nvme_bdev_ctrlr->destruct) { 2210 /* This controller's destruction was already started 2211 * before the application started shutting down 2212 */ 2213 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2214 continue; 2215 } 2216 nvme_bdev_ctrlr->destruct = true; 2217 pthread_mutex_unlock(&nvme_bdev_ctrlr->mutex); 2218 2219 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2220 nvme_bdev_ctrlr); 2221 } 2222 2223 g_bdev_nvme_module_finish = true; 2224 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2225 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2226 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2227 spdk_bdev_module_finish_done(); 2228 return; 2229 } 2230 2231 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2232 } 2233 2234 static void 2235 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2236 { 2237 struct spdk_bdev *bdev = bdev_io->bdev; 2238 struct spdk_dif_ctx dif_ctx; 2239 struct spdk_dif_error err_blk = {}; 2240 int rc; 2241 2242 rc = spdk_dif_ctx_init(&dif_ctx, 2243 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2244 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2245 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2246 if (rc != 0) { 2247 SPDK_ERRLOG("Initialization of DIF context failed\n"); 2248 return; 2249 } 2250 2251 if (bdev->md_interleave) { 2252 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2253 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2254 } else { 2255 struct iovec md_iov = { 2256 .iov_base = bdev_io->u.bdev.md_buf, 2257 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2258 }; 2259 2260 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2261 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2262 } 2263 2264 if (rc != 0) { 2265 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2266 err_blk.err_type, err_blk.err_offset); 2267 } else { 2268 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2269 } 2270 } 2271 2272 static void 2273 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2274 { 2275 struct nvme_bdev_io *bio = ref; 2276 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2277 2278 if (spdk_nvme_cpl_is_success(cpl)) { 2279 /* Run PI verification for read data buffer. */ 2280 bdev_nvme_verify_pi_error(bdev_io); 2281 } 2282 2283 /* Return original completion status */ 2284 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2285 bio->cpl.status.sc); 2286 } 2287 2288 static void 2289 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2290 { 2291 struct nvme_bdev_io *bio = ref; 2292 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2293 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2294 struct nvme_io_channel *nvme_ch; 2295 struct nvme_bdev_ns *nvme_ns; 2296 struct spdk_nvme_qpair *qpair; 2297 int ret; 2298 2299 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2300 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2301 cpl->status.sct, cpl->status.sc); 2302 2303 /* Save completion status to use after verifying PI error. */ 2304 bio->cpl = *cpl; 2305 2306 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2307 2308 if (spdk_likely(bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 2309 /* Read without PI checking to verify PI error. */ 2310 ret = bdev_nvme_no_pi_readv(nvme_ns->ns, 2311 qpair, 2312 bio, 2313 bdev_io->u.bdev.iovs, 2314 bdev_io->u.bdev.iovcnt, 2315 bdev_io->u.bdev.md_buf, 2316 bdev_io->u.bdev.num_blocks, 2317 bdev_io->u.bdev.offset_blocks); 2318 if (ret == 0) { 2319 return; 2320 } 2321 } 2322 } 2323 2324 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2325 } 2326 2327 static void 2328 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2329 { 2330 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2331 2332 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2333 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2334 cpl->status.sct, cpl->status.sc); 2335 /* Run PI verification for write data buffer if PI error is detected. */ 2336 bdev_nvme_verify_pi_error(bdev_io); 2337 } 2338 2339 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2340 } 2341 2342 static void 2343 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2344 { 2345 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2346 2347 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2348 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2349 cpl->status.sct, cpl->status.sc); 2350 /* Run PI verification for compare data buffer if PI error is detected. */ 2351 bdev_nvme_verify_pi_error(bdev_io); 2352 } 2353 2354 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2355 } 2356 2357 static void 2358 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2359 { 2360 struct nvme_bdev_io *bio = ref; 2361 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2362 2363 /* Compare operation completion */ 2364 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2365 /* Save compare result for write callback */ 2366 bio->cpl = *cpl; 2367 return; 2368 } 2369 2370 /* Write operation completion */ 2371 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2372 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2373 * complete the IO with the compare operation's status. 2374 */ 2375 if (!spdk_nvme_cpl_is_error(cpl)) { 2376 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2377 } 2378 2379 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2380 } else { 2381 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2382 } 2383 } 2384 2385 static void 2386 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2387 { 2388 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2389 2390 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2391 } 2392 2393 static void 2394 bdev_nvme_admin_passthru_completion(void *ctx) 2395 { 2396 struct nvme_bdev_io *bio = ctx; 2397 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2398 2399 spdk_bdev_io_complete_nvme_status(bdev_io, 2400 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2401 } 2402 2403 static void 2404 bdev_nvme_abort_completion(void *ctx) 2405 { 2406 struct nvme_bdev_io *bio = ctx; 2407 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2408 2409 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2410 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2411 } else { 2412 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2413 } 2414 } 2415 2416 static void 2417 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2418 { 2419 struct nvme_bdev_io *bio = ref; 2420 2421 bio->cpl = *cpl; 2422 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2423 } 2424 2425 static void 2426 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2427 { 2428 struct nvme_bdev_io *bio = ref; 2429 2430 bio->cpl = *cpl; 2431 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2432 } 2433 2434 static void 2435 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2436 { 2437 struct nvme_bdev_io *bio = ref; 2438 struct iovec *iov; 2439 2440 bio->iov_offset = sgl_offset; 2441 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2442 iov = &bio->iovs[bio->iovpos]; 2443 if (bio->iov_offset < iov->iov_len) { 2444 break; 2445 } 2446 2447 bio->iov_offset -= iov->iov_len; 2448 } 2449 } 2450 2451 static int 2452 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2453 { 2454 struct nvme_bdev_io *bio = ref; 2455 struct iovec *iov; 2456 2457 assert(bio->iovpos < bio->iovcnt); 2458 2459 iov = &bio->iovs[bio->iovpos]; 2460 2461 *address = iov->iov_base; 2462 *length = iov->iov_len; 2463 2464 if (bio->iov_offset) { 2465 assert(bio->iov_offset <= iov->iov_len); 2466 *address += bio->iov_offset; 2467 *length -= bio->iov_offset; 2468 } 2469 2470 bio->iov_offset += *length; 2471 if (bio->iov_offset == iov->iov_len) { 2472 bio->iovpos++; 2473 bio->iov_offset = 0; 2474 } 2475 2476 return 0; 2477 } 2478 2479 static void 2480 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2481 { 2482 struct nvme_bdev_io *bio = ref; 2483 struct iovec *iov; 2484 2485 bio->fused_iov_offset = sgl_offset; 2486 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2487 iov = &bio->fused_iovs[bio->fused_iovpos]; 2488 if (bio->fused_iov_offset < iov->iov_len) { 2489 break; 2490 } 2491 2492 bio->fused_iov_offset -= iov->iov_len; 2493 } 2494 } 2495 2496 static int 2497 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2498 { 2499 struct nvme_bdev_io *bio = ref; 2500 struct iovec *iov; 2501 2502 assert(bio->fused_iovpos < bio->fused_iovcnt); 2503 2504 iov = &bio->fused_iovs[bio->fused_iovpos]; 2505 2506 *address = iov->iov_base; 2507 *length = iov->iov_len; 2508 2509 if (bio->fused_iov_offset) { 2510 assert(bio->fused_iov_offset <= iov->iov_len); 2511 *address += bio->fused_iov_offset; 2512 *length -= bio->fused_iov_offset; 2513 } 2514 2515 bio->fused_iov_offset += *length; 2516 if (bio->fused_iov_offset == iov->iov_len) { 2517 bio->fused_iovpos++; 2518 bio->fused_iov_offset = 0; 2519 } 2520 2521 return 0; 2522 } 2523 2524 static int 2525 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2526 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2527 void *md, uint64_t lba_count, uint64_t lba) 2528 { 2529 int rc; 2530 2531 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2532 lba_count, lba); 2533 2534 bio->iovs = iov; 2535 bio->iovcnt = iovcnt; 2536 bio->iovpos = 0; 2537 bio->iov_offset = 0; 2538 2539 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2540 bdev_nvme_no_pi_readv_done, bio, 0, 2541 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2542 md, 0, 0); 2543 2544 if (rc != 0 && rc != -ENOMEM) { 2545 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2546 } 2547 return rc; 2548 } 2549 2550 static int 2551 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2552 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2553 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2554 { 2555 int rc; 2556 2557 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2558 lba_count, lba); 2559 2560 bio->iovs = iov; 2561 bio->iovcnt = iovcnt; 2562 bio->iovpos = 0; 2563 bio->iov_offset = 0; 2564 2565 if (iovcnt == 1) { 2566 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 2567 lba_count, 2568 bdev_nvme_readv_done, bio, 2569 flags, 2570 0, 0); 2571 } else { 2572 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2573 bdev_nvme_readv_done, bio, flags, 2574 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2575 md, 0, 0); 2576 } 2577 2578 if (rc != 0 && rc != -ENOMEM) { 2579 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 2580 } 2581 return rc; 2582 } 2583 2584 static int 2585 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2586 struct nvme_bdev_io *bio, 2587 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2588 uint32_t flags) 2589 { 2590 int rc; 2591 2592 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2593 lba_count, lba); 2594 2595 bio->iovs = iov; 2596 bio->iovcnt = iovcnt; 2597 bio->iovpos = 0; 2598 bio->iov_offset = 0; 2599 2600 if (iovcnt == 1) { 2601 rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba, 2602 lba_count, 2603 bdev_nvme_writev_done, bio, 2604 flags, 2605 0, 0); 2606 } else { 2607 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2608 bdev_nvme_writev_done, bio, flags, 2609 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2610 md, 0, 0); 2611 } 2612 2613 if (rc != 0 && rc != -ENOMEM) { 2614 SPDK_ERRLOG("writev failed: rc = %d\n", rc); 2615 } 2616 return rc; 2617 } 2618 2619 static int 2620 bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2621 struct nvme_bdev_io *bio, 2622 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2623 uint32_t flags) 2624 { 2625 int rc; 2626 2627 SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2628 lba_count, lba); 2629 2630 bio->iovs = iov; 2631 bio->iovcnt = iovcnt; 2632 bio->iovpos = 0; 2633 bio->iov_offset = 0; 2634 2635 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2636 bdev_nvme_comparev_done, bio, flags, 2637 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2638 md, 0, 0); 2639 2640 if (rc != 0 && rc != -ENOMEM) { 2641 SPDK_ERRLOG("comparev failed: rc = %d\n", rc); 2642 } 2643 return rc; 2644 } 2645 2646 static int 2647 bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2648 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, 2649 struct iovec *write_iov, int write_iovcnt, 2650 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2651 { 2652 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2653 int rc; 2654 2655 SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2656 lba_count, lba); 2657 2658 bio->iovs = cmp_iov; 2659 bio->iovcnt = cmp_iovcnt; 2660 bio->iovpos = 0; 2661 bio->iov_offset = 0; 2662 bio->fused_iovs = write_iov; 2663 bio->fused_iovcnt = write_iovcnt; 2664 bio->fused_iovpos = 0; 2665 bio->fused_iov_offset = 0; 2666 2667 if (bdev_io->num_retries == 0) { 2668 bio->first_fused_submitted = false; 2669 } 2670 2671 if (!bio->first_fused_submitted) { 2672 flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2673 memset(&bio->cpl, 0, sizeof(bio->cpl)); 2674 2675 rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count, 2676 bdev_nvme_comparev_and_writev_done, bio, flags, 2677 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0); 2678 if (rc == 0) { 2679 bio->first_fused_submitted = true; 2680 flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST; 2681 } else { 2682 if (rc != -ENOMEM) { 2683 SPDK_ERRLOG("compare failed: rc = %d\n", rc); 2684 } 2685 return rc; 2686 } 2687 } 2688 2689 flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND; 2690 2691 rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count, 2692 bdev_nvme_comparev_and_writev_done, bio, flags, 2693 bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0); 2694 if (rc != 0 && rc != -ENOMEM) { 2695 SPDK_ERRLOG("write failed: rc = %d\n", rc); 2696 rc = 0; 2697 } 2698 2699 return rc; 2700 } 2701 2702 static int 2703 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2704 struct nvme_bdev_io *bio, 2705 uint64_t offset_blocks, 2706 uint64_t num_blocks) 2707 { 2708 struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; 2709 struct spdk_nvme_dsm_range *range; 2710 uint64_t offset, remaining; 2711 uint64_t num_ranges_u64; 2712 uint16_t num_ranges; 2713 int rc; 2714 2715 num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 2716 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2717 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 2718 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 2719 return -EINVAL; 2720 } 2721 num_ranges = (uint16_t)num_ranges_u64; 2722 2723 offset = offset_blocks; 2724 remaining = num_blocks; 2725 range = &dsm_ranges[0]; 2726 2727 /* Fill max-size ranges until the remaining blocks fit into one range */ 2728 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 2729 range->attributes.raw = 0; 2730 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2731 range->starting_lba = offset; 2732 2733 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2734 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2735 range++; 2736 } 2737 2738 /* Final range describes the remaining blocks */ 2739 range->attributes.raw = 0; 2740 range->length = remaining; 2741 range->starting_lba = offset; 2742 2743 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 2744 SPDK_NVME_DSM_ATTR_DEALLOCATE, 2745 dsm_ranges, num_ranges, 2746 bdev_nvme_queued_done, bio); 2747 2748 return rc; 2749 } 2750 2751 static int 2752 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2753 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2754 { 2755 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr); 2756 2757 if (nbytes > max_xfer_size) { 2758 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2759 return -EINVAL; 2760 } 2761 2762 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2763 2764 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf, 2765 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 2766 } 2767 2768 static int 2769 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2770 struct nvme_bdev_io *bio, 2771 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2772 { 2773 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2774 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2775 2776 if (nbytes > max_xfer_size) { 2777 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2778 return -EINVAL; 2779 } 2780 2781 /* 2782 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2783 * so fill it out automatically. 2784 */ 2785 cmd->nsid = spdk_nvme_ns_get_id(ns); 2786 2787 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 2788 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 2789 } 2790 2791 static int 2792 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2793 struct nvme_bdev_io *bio, 2794 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 2795 { 2796 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 2797 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2798 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2799 2800 if (nbytes > max_xfer_size) { 2801 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2802 return -EINVAL; 2803 } 2804 2805 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 2806 SPDK_ERRLOG("invalid meta data buffer size\n"); 2807 return -EINVAL; 2808 } 2809 2810 /* 2811 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2812 * so fill it out automatically. 2813 */ 2814 cmd->nsid = spdk_nvme_ns_get_id(ns); 2815 2816 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 2817 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 2818 } 2819 2820 static void 2821 bdev_nvme_abort_admin_cmd(void *ctx) 2822 { 2823 struct nvme_bdev_io *bio = ctx; 2824 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2825 struct nvme_io_channel *nvme_ch; 2826 struct nvme_bdev_io *bio_to_abort; 2827 int rc; 2828 2829 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2830 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2831 2832 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2833 NULL, 2834 bio_to_abort, 2835 bdev_nvme_abort_done, bio); 2836 if (rc == -ENOENT) { 2837 /* If no admin command was found in admin qpair, complete the abort 2838 * request with failure. 2839 */ 2840 bio->cpl.cdw0 |= 1U; 2841 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 2842 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2843 2844 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2845 } 2846 } 2847 2848 static int 2849 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2850 struct nvme_bdev_io *bio_to_abort) 2851 { 2852 int rc; 2853 2854 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2855 2856 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2857 nvme_ch->qpair, 2858 bio_to_abort, 2859 bdev_nvme_abort_done, bio); 2860 if (rc == -ENOENT) { 2861 /* If no command was found in I/O qpair, the target command may be 2862 * admin command. Only a single thread tries aborting admin command 2863 * to clean I/O flow. 2864 */ 2865 spdk_thread_send_msg(nvme_ch->ctrlr->thread, 2866 bdev_nvme_abort_admin_cmd, bio); 2867 rc = 0; 2868 } 2869 2870 return rc; 2871 } 2872 2873 static void 2874 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 2875 struct nvme_bdev_ns *nvme_ns) 2876 { 2877 /* nop */ 2878 } 2879 2880 static void 2881 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 2882 { 2883 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 2884 } 2885 2886 static void 2887 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 2888 { 2889 const char *action; 2890 2891 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 2892 action = "reset"; 2893 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 2894 action = "abort"; 2895 } else { 2896 action = "none"; 2897 } 2898 2899 spdk_json_write_object_begin(w); 2900 2901 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 2902 2903 spdk_json_write_named_object_begin(w, "params"); 2904 spdk_json_write_named_string(w, "action_on_timeout", action); 2905 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 2906 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 2907 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 2908 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 2909 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 2910 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 2911 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 2912 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 2913 spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 2914 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 2915 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 2916 spdk_json_write_object_end(w); 2917 2918 spdk_json_write_object_end(w); 2919 } 2920 2921 static void 2922 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 2923 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 2924 { 2925 struct spdk_nvme_transport_id *trid; 2926 2927 trid = nvme_bdev_ctrlr->connected_trid; 2928 2929 spdk_json_write_object_begin(w); 2930 2931 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 2932 2933 spdk_json_write_named_object_begin(w, "params"); 2934 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 2935 nvme_bdev_dump_trid_json(trid, w); 2936 spdk_json_write_named_bool(w, "prchk_reftag", 2937 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 2938 spdk_json_write_named_bool(w, "prchk_guard", 2939 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 2940 2941 spdk_json_write_object_end(w); 2942 2943 spdk_json_write_object_end(w); 2944 } 2945 2946 static void 2947 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 2948 { 2949 spdk_json_write_object_begin(w); 2950 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 2951 2952 spdk_json_write_named_object_begin(w, "params"); 2953 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 2954 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 2955 spdk_json_write_object_end(w); 2956 2957 spdk_json_write_object_end(w); 2958 } 2959 2960 static int 2961 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 2962 { 2963 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2964 uint32_t nsid; 2965 2966 bdev_nvme_opts_config_json(w); 2967 2968 pthread_mutex_lock(&g_bdev_nvme_mutex); 2969 2970 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 2971 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 2972 2973 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 2974 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 2975 continue; 2976 } 2977 2978 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 2979 } 2980 } 2981 2982 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 2983 * before enabling hotplug poller. 2984 */ 2985 bdev_nvme_hotplug_config_json(w); 2986 2987 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2988 return 0; 2989 } 2990 2991 struct spdk_nvme_ctrlr * 2992 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 2993 { 2994 if (!bdev || bdev->module != &nvme_if) { 2995 return NULL; 2996 } 2997 2998 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 2999 } 3000 3001 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 3002