1 /*- 2 * BSD LICENSE 3 * 4 * Copyright (c) Intel Corporation. All rights reserved. 5 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * * Neither the name of Intel Corporation nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #include "spdk/stdinc.h" 35 36 #include "bdev_nvme.h" 37 #include "bdev_ocssd.h" 38 39 #include "spdk/config.h" 40 #include "spdk/endian.h" 41 #include "spdk/bdev.h" 42 #include "spdk/json.h" 43 #include "spdk/nvme.h" 44 #include "spdk/nvme_ocssd.h" 45 #include "spdk/thread.h" 46 #include "spdk/string.h" 47 #include "spdk/util.h" 48 49 #include "spdk/bdev_module.h" 50 #include "spdk/log.h" 51 52 #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true 53 #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000) 54 55 static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); 56 57 struct nvme_bdev_io { 58 /** array of iovecs to transfer. */ 59 struct iovec *iovs; 60 61 /** Number of iovecs in iovs array. */ 62 int iovcnt; 63 64 /** Current iovec position. */ 65 int iovpos; 66 67 /** Offset in current iovec. */ 68 uint32_t iov_offset; 69 70 /** array of iovecs to transfer. */ 71 struct iovec *fused_iovs; 72 73 /** Number of iovecs in iovs array. */ 74 int fused_iovcnt; 75 76 /** Current iovec position. */ 77 int fused_iovpos; 78 79 /** Offset in current iovec. 
*/ 80 uint32_t fused_iov_offset; 81 82 /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */ 83 struct spdk_nvme_cpl cpl; 84 85 /** Originating thread */ 86 struct spdk_thread *orig_thread; 87 88 /** Keeps track if first of fused commands was submitted */ 89 bool first_fused_submitted; 90 }; 91 92 struct nvme_probe_ctx { 93 size_t count; 94 struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; 95 struct spdk_nvme_host_id hostids[NVME_MAX_CONTROLLERS]; 96 const char *names[NVME_MAX_CONTROLLERS]; 97 uint32_t prchk_flags[NVME_MAX_CONTROLLERS]; 98 const char *hostnqn; 99 }; 100 101 struct nvme_probe_skip_entry { 102 struct spdk_nvme_transport_id trid; 103 TAILQ_ENTRY(nvme_probe_skip_entry) tailq; 104 }; 105 /* All the controllers deleted by users via RPC are skipped by hotplug monitor */ 106 static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER( 107 g_skipped_nvme_ctrlrs); 108 109 static struct spdk_bdev_nvme_opts g_opts = { 110 .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, 111 .timeout_us = 0, 112 .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS, 113 .retry_count = 4, 114 .arbitration_burst = 0, 115 .low_priority_weight = 0, 116 .medium_priority_weight = 0, 117 .high_priority_weight = 0, 118 .nvme_adminq_poll_period_us = 10000ULL, 119 .nvme_ioq_poll_period_us = 0, 120 .io_queue_requests = 0, 121 .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT, 122 }; 123 124 #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL 125 #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL 126 127 static int g_hot_insert_nvme_controller_index = 0; 128 static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; 129 static bool g_nvme_hotplug_enabled = false; 130 static struct spdk_thread *g_bdev_nvme_init_thread; 131 static struct spdk_poller *g_hotplug_poller; 132 static struct spdk_poller *g_hotplug_probe_poller; 133 static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx; 134 135 static void nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 136 struct nvme_async_probe_ctx *ctx); 137 static void nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx); 138 static int bdev_nvme_library_init(void); 139 static void bdev_nvme_library_fini(void); 140 static int bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 141 struct nvme_bdev_io *bio, 142 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 143 uint32_t flags); 144 static int bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 145 struct nvme_bdev_io *bio, 146 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba); 147 static int bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 148 struct nvme_bdev_io *bio, 149 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 150 uint32_t flags); 151 static int bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 152 struct nvme_bdev_io *bio, 153 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 154 uint32_t flags); 155 static int bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, 156 struct spdk_nvme_qpair *qpair, 157 struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov, 158 int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba, 159 uint32_t flags); 160 static int bdev_nvme_admin_passthru(struct 
nvme_io_channel *nvme_ch, 161 struct nvme_bdev_io *bio, 162 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 163 static int bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 164 struct nvme_bdev_io *bio, 165 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); 166 static int bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 167 struct nvme_bdev_io *bio, 168 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len); 169 static int bdev_nvme_abort(struct nvme_io_channel *nvme_ch, 170 struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort); 171 static int bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio); 172 static int bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove); 173 static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr); 174 175 typedef void (*populate_namespace_fn)(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 176 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 177 static void nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 178 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx); 179 180 static populate_namespace_fn g_populate_namespace_fn[] = { 181 NULL, 182 nvme_ctrlr_populate_standard_namespace, 183 bdev_ocssd_populate_namespace, 184 }; 185 186 typedef void (*depopulate_namespace_fn)(struct nvme_bdev_ns *nvme_ns); 187 static void nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns); 188 189 static depopulate_namespace_fn g_depopulate_namespace_fn[] = { 190 NULL, 191 nvme_ctrlr_depopulate_standard_namespace, 192 bdev_ocssd_depopulate_namespace, 193 }; 194 195 typedef void (*config_json_namespace_fn)(struct spdk_json_write_ctx *w, 196 struct nvme_bdev_ns *nvme_ns); 197 static void nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 198 struct nvme_bdev_ns *nvme_ns); 199 200 static config_json_namespace_fn g_config_json_namespace_fn[] = { 201 NULL, 202 nvme_ctrlr_config_json_standard_namespace, 203 bdev_ocssd_namespace_config_json, 204 }; 205 206 struct spdk_nvme_qpair * 207 bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) 208 { 209 struct nvme_io_channel *nvme_ch; 210 211 nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); 212 213 return nvme_ch->qpair; 214 } 215 216 static int 217 bdev_nvme_get_ctx_size(void) 218 { 219 return sizeof(struct nvme_bdev_io); 220 } 221 222 static struct spdk_bdev_module nvme_if = { 223 .name = "nvme", 224 .async_fini = true, 225 .module_init = bdev_nvme_library_init, 226 .module_fini = bdev_nvme_library_fini, 227 .config_json = bdev_nvme_config_json, 228 .get_ctx_size = bdev_nvme_get_ctx_size, 229 230 }; 231 SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if) 232 233 static void 234 bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) 235 { 236 int rc; 237 238 SPDK_DEBUGLOG(bdev_nvme, "qpair %p is disconnected, attempting reconnect.\n", qpair); 239 /* 240 * Currently, just try to reconnect indefinitely. If we are doing a reset, the reset will 241 * reconnect a qpair and we will stop getting a callback for this one. 
242 */ 243 rc = spdk_nvme_ctrlr_reconnect_io_qpair(qpair); 244 if (rc != 0) { 245 SPDK_WARNLOG("Failed to reconnect to qpair %p, errno %d\n", qpair, -rc); 246 } 247 } 248 249 static int 250 bdev_nvme_poll(void *arg) 251 { 252 struct nvme_bdev_poll_group *group = arg; 253 int64_t num_completions; 254 255 if (group->collect_spin_stat && group->start_ticks == 0) { 256 group->start_ticks = spdk_get_ticks(); 257 } 258 259 num_completions = spdk_nvme_poll_group_process_completions(group->group, 0, 260 bdev_nvme_disconnected_qpair_cb); 261 if (group->collect_spin_stat) { 262 if (num_completions > 0) { 263 if (group->end_ticks != 0) { 264 group->spin_ticks += (group->end_ticks - group->start_ticks); 265 group->end_ticks = 0; 266 } 267 group->start_ticks = 0; 268 } else { 269 group->end_ticks = spdk_get_ticks(); 270 } 271 } 272 273 return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE; 274 } 275 276 static int 277 bdev_nvme_poll_adminq(void *arg) 278 { 279 int32_t rc; 280 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 281 282 assert(nvme_bdev_ctrlr != NULL); 283 284 rc = spdk_nvme_ctrlr_process_admin_completions(nvme_bdev_ctrlr->ctrlr); 285 if (rc < 0) { 286 bdev_nvme_failover(nvme_bdev_ctrlr, false); 287 } 288 289 return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY; 290 } 291 292 static int 293 bdev_nvme_destruct(void *ctx) 294 { 295 struct nvme_bdev *nvme_disk = ctx; 296 struct nvme_bdev_ns *nvme_ns = nvme_disk->nvme_ns; 297 298 pthread_mutex_lock(&g_bdev_nvme_mutex); 299 TAILQ_REMOVE(&nvme_ns->bdevs, nvme_disk, tailq); 300 pthread_mutex_unlock(&g_bdev_nvme_mutex); 301 302 nvme_bdev_ns_detach(nvme_ns); 303 304 free(nvme_disk->disk.name); 305 free(nvme_disk); 306 307 return 0; 308 } 309 310 static int 311 bdev_nvme_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 312 struct nvme_bdev_io *bio, uint64_t offset, uint64_t nbytes) 313 { 314 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); 315 316 return 0; 317 } 318 319 static int 320 bdev_nvme_create_qpair(struct nvme_io_channel *nvme_ch) 321 { 322 struct spdk_nvme_ctrlr *ctrlr = nvme_ch->ctrlr->ctrlr; 323 struct spdk_nvme_io_qpair_opts opts; 324 int rc; 325 326 spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); 327 opts.delay_cmd_submit = g_opts.delay_cmd_submit; 328 opts.create_only = true; 329 opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests); 330 g_opts.io_queue_requests = opts.io_queue_requests; 331 332 nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts)); 333 if (nvme_ch->qpair == NULL) { 334 return -1; 335 } 336 337 assert(nvme_ch->group != NULL); 338 339 rc = spdk_nvme_poll_group_add(nvme_ch->group->group, nvme_ch->qpair); 340 if (rc != 0) { 341 SPDK_ERRLOG("Unable to begin polling on NVMe Channel.\n"); 342 goto err; 343 } 344 345 rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, nvme_ch->qpair); 346 if (rc != 0) { 347 SPDK_ERRLOG("Unable to connect I/O qpair.\n"); 348 goto err; 349 } 350 351 return 0; 352 353 err: 354 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 355 356 return rc; 357 } 358 359 static void 360 _bdev_nvme_reset_destruct_ctrlr(struct spdk_io_channel_iter *i, int status) 361 { 362 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 363 364 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, nvme_bdev_ctrlr_do_destruct, 365 nvme_bdev_ctrlr); 366 } 367 368 static void 369 _bdev_nvme_complete_pending_resets(struct spdk_io_channel_iter *i) 370 { 371 struct spdk_io_channel *_ch = 
spdk_io_channel_iter_get_channel(i); 372 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 373 struct spdk_bdev_io *bdev_io; 374 enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; 375 376 /* A NULL ctx means success. */ 377 if (spdk_io_channel_iter_get_ctx(i) != NULL) { 378 status = SPDK_BDEV_IO_STATUS_FAILED; 379 } 380 381 while (!TAILQ_EMPTY(&nvme_ch->pending_resets)) { 382 bdev_io = TAILQ_FIRST(&nvme_ch->pending_resets); 383 TAILQ_REMOVE(&nvme_ch->pending_resets, bdev_io, module_link); 384 spdk_bdev_io_complete(bdev_io, status); 385 } 386 387 spdk_for_each_channel_continue(i, 0); 388 } 389 390 static void 391 _bdev_nvme_reset_complete(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, int rc) 392 { 393 /* we are using the for_each_channel cb_arg like a return code here. */ 394 /* If it's zero, we succeeded, otherwise, the reset failed. */ 395 void *cb_arg = NULL; 396 struct nvme_bdev_ctrlr_trid *curr_trid; 397 bool do_destruct = false; 398 399 if (rc) { 400 cb_arg = (void *)0x1; 401 SPDK_ERRLOG("Resetting controller failed.\n"); 402 } else { 403 SPDK_NOTICELOG("Resetting controller successful.\n"); 404 } 405 406 pthread_mutex_lock(&g_bdev_nvme_mutex); 407 nvme_bdev_ctrlr->resetting = false; 408 nvme_bdev_ctrlr->failover_in_progress = false; 409 410 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 411 assert(curr_trid != NULL); 412 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 413 414 curr_trid->is_failed = cb_arg != NULL ? true : false; 415 416 if (nvme_bdev_ctrlr->ref == 0 && nvme_bdev_ctrlr->destruct) { 417 /* Destruct ctrlr after clearing pending resets. */ 418 do_destruct = true; 419 } 420 421 pthread_mutex_unlock(&g_bdev_nvme_mutex); 422 /* Make sure we clear any pending resets before returning. */ 423 spdk_for_each_channel(nvme_bdev_ctrlr, 424 _bdev_nvme_complete_pending_resets, 425 cb_arg, 426 do_destruct ? 
_bdev_nvme_reset_destruct_ctrlr : NULL); 427 } 428 429 static void 430 _bdev_nvme_reset_create_qpairs_done(struct spdk_io_channel_iter *i, int status) 431 { 432 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 433 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 434 int rc = SPDK_BDEV_IO_STATUS_SUCCESS; 435 436 if (status) { 437 rc = SPDK_BDEV_IO_STATUS_FAILED; 438 } 439 if (bio) { 440 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), rc); 441 } 442 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, status); 443 } 444 445 static void 446 _bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) 447 { 448 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 449 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); 450 int rc; 451 452 rc = bdev_nvme_create_qpair(nvme_ch); 453 454 spdk_for_each_channel_continue(i, rc); 455 } 456 457 static void 458 _bdev_nvme_reset_ctrlr(struct spdk_io_channel_iter *i, int status) 459 { 460 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = spdk_io_channel_iter_get_io_device(i); 461 struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); 462 int rc; 463 464 if (status) { 465 rc = status; 466 goto err; 467 } 468 469 rc = spdk_nvme_ctrlr_reset(nvme_bdev_ctrlr->ctrlr); 470 if (rc != 0) { 471 goto err; 472 } 473 474 /* Recreate all of the I/O queue pairs */ 475 spdk_for_each_channel(nvme_bdev_ctrlr, 476 _bdev_nvme_reset_create_qpair, 477 bio, 478 _bdev_nvme_reset_create_qpairs_done); 479 return; 480 481 err: 482 if (bio) { 483 spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); 484 } 485 _bdev_nvme_reset_complete(nvme_bdev_ctrlr, rc); 486 } 487 488 static void 489 _bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) 490 { 491 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 492 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 493 int rc; 494 495 rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 496 if (!rc) { 497 nvme_ch->qpair = NULL; 498 } 499 500 spdk_for_each_channel_continue(i, rc); 501 } 502 503 static int 504 _bdev_nvme_reset(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, void *ctx) 505 { 506 pthread_mutex_lock(&g_bdev_nvme_mutex); 507 if (nvme_bdev_ctrlr->destruct) { 508 pthread_mutex_unlock(&g_bdev_nvme_mutex); 509 return -EBUSY; 510 } 511 512 if (nvme_bdev_ctrlr->resetting) { 513 pthread_mutex_unlock(&g_bdev_nvme_mutex); 514 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 515 return -EAGAIN; 516 } 517 518 nvme_bdev_ctrlr->resetting = true; 519 520 pthread_mutex_unlock(&g_bdev_nvme_mutex); 521 /* First, delete all NVMe I/O queue pairs. */ 522 spdk_for_each_channel(nvme_bdev_ctrlr, 523 _bdev_nvme_reset_destroy_qpair, 524 ctx, 525 _bdev_nvme_reset_ctrlr); 526 527 return 0; 528 } 529 530 static int 531 bdev_nvme_reset(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio) 532 { 533 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 534 int rc; 535 536 rc = _bdev_nvme_reset(nvme_ch->ctrlr, bio); 537 if (rc == -EBUSY) { 538 /* Don't bother resetting if the controller is in the process of being destructed. */ 539 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 540 return 0; 541 } else if (rc == -EAGAIN) { 542 /* 543 * Reset call is queued only if it is from the app framework. This is on purpose so that 544 * we don't interfere with the app framework reset strategy. i.e. we are deferring to the 545 * upper level. If they are in the middle of a reset, we won't try to schedule another one. 
546 */ 547 TAILQ_INSERT_TAIL(&nvme_ch->pending_resets, bdev_io, module_link); 548 return 0; 549 } else { 550 return rc; 551 } 552 } 553 554 static int 555 bdev_nvme_failover(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, bool remove) 556 { 557 struct nvme_bdev_ctrlr_trid *curr_trid = NULL, *next_trid = NULL; 558 int rc = 0; 559 560 pthread_mutex_lock(&g_bdev_nvme_mutex); 561 if (nvme_bdev_ctrlr->destruct) { 562 pthread_mutex_unlock(&g_bdev_nvme_mutex); 563 /* Don't bother resetting if the controller is in the process of being destructed. */ 564 return 0; 565 } 566 567 curr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 568 assert(curr_trid); 569 assert(&curr_trid->trid == nvme_bdev_ctrlr->connected_trid); 570 next_trid = TAILQ_NEXT(curr_trid, link); 571 572 if (nvme_bdev_ctrlr->resetting) { 573 if (next_trid && !nvme_bdev_ctrlr->failover_in_progress) { 574 rc = -EAGAIN; 575 } 576 pthread_mutex_unlock(&g_bdev_nvme_mutex); 577 SPDK_NOTICELOG("Unable to perform reset, already in progress.\n"); 578 return rc; 579 } 580 581 nvme_bdev_ctrlr->resetting = true; 582 curr_trid->is_failed = true; 583 584 if (next_trid) { 585 assert(curr_trid->trid.trtype != SPDK_NVME_TRANSPORT_PCIE); 586 587 SPDK_NOTICELOG("Start failover from %s:%s to %s:%s\n", curr_trid->trid.traddr, 588 curr_trid->trid.trsvcid, next_trid->trid.traddr, next_trid->trid.trsvcid); 589 590 nvme_bdev_ctrlr->failover_in_progress = true; 591 spdk_nvme_ctrlr_fail(nvme_bdev_ctrlr->ctrlr); 592 nvme_bdev_ctrlr->connected_trid = &next_trid->trid; 593 rc = spdk_nvme_ctrlr_set_trid(nvme_bdev_ctrlr->ctrlr, &next_trid->trid); 594 assert(rc == 0); 595 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, curr_trid, link); 596 if (!remove) { 597 /** Shuffle the old trid to the end of the list and use the new one. 598 * Allows for round robin through multiple connections. 599 */ 600 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, curr_trid, link); 601 } else { 602 free(curr_trid); 603 } 604 } 605 606 pthread_mutex_unlock(&g_bdev_nvme_mutex); 607 /* First, delete all NVMe I/O queue pairs. 
*/ 608 spdk_for_each_channel(nvme_bdev_ctrlr, 609 _bdev_nvme_reset_destroy_qpair, 610 NULL, 611 _bdev_nvme_reset_ctrlr); 612 613 return 0; 614 } 615 616 static int 617 bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 618 struct nvme_bdev_io *bio, 619 uint64_t offset_blocks, 620 uint64_t num_blocks); 621 622 static void 623 bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 624 bool success) 625 { 626 struct spdk_bdev *bdev = bdev_io->bdev; 627 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 628 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 629 struct nvme_bdev_ns *nvme_ns; 630 struct spdk_nvme_qpair *qpair; 631 int ret; 632 633 if (!success) { 634 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 635 return; 636 } 637 638 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 639 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 640 return; 641 } 642 643 ret = bdev_nvme_readv(nvme_ns->ns, 644 qpair, 645 (struct nvme_bdev_io *)bdev_io->driver_ctx, 646 bdev_io->u.bdev.iovs, 647 bdev_io->u.bdev.iovcnt, 648 bdev_io->u.bdev.md_buf, 649 bdev_io->u.bdev.num_blocks, 650 bdev_io->u.bdev.offset_blocks, 651 bdev->dif_check_flags); 652 653 if (spdk_likely(ret == 0)) { 654 return; 655 } else if (ret == -ENOMEM) { 656 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 657 } else { 658 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 659 } 660 } 661 662 static int 663 _bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 664 { 665 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 666 struct spdk_bdev *bdev = bdev_io->bdev; 667 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev->ctxt; 668 struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx; 669 struct nvme_bdev_io *nbdev_io_to_abort; 670 struct nvme_bdev_ns *nvme_ns; 671 struct spdk_nvme_qpair *qpair; 672 673 if (spdk_unlikely(!bdev_nvme_find_io_path(nbdev, nvme_ch, &nvme_ns, &qpair))) { 674 return -1; 675 } 676 677 switch (bdev_io->type) { 678 case SPDK_BDEV_IO_TYPE_READ: 679 if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) { 680 return bdev_nvme_readv(nvme_ns->ns, 681 qpair, 682 nbdev_io, 683 bdev_io->u.bdev.iovs, 684 bdev_io->u.bdev.iovcnt, 685 bdev_io->u.bdev.md_buf, 686 bdev_io->u.bdev.num_blocks, 687 bdev_io->u.bdev.offset_blocks, 688 bdev->dif_check_flags); 689 } else { 690 spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, 691 bdev_io->u.bdev.num_blocks * bdev->blocklen); 692 return 0; 693 } 694 695 case SPDK_BDEV_IO_TYPE_WRITE: 696 return bdev_nvme_writev(nvme_ns->ns, 697 qpair, 698 nbdev_io, 699 bdev_io->u.bdev.iovs, 700 bdev_io->u.bdev.iovcnt, 701 bdev_io->u.bdev.md_buf, 702 bdev_io->u.bdev.num_blocks, 703 bdev_io->u.bdev.offset_blocks, 704 bdev->dif_check_flags); 705 706 case SPDK_BDEV_IO_TYPE_COMPARE: 707 return bdev_nvme_comparev(nvme_ns->ns, 708 qpair, 709 nbdev_io, 710 bdev_io->u.bdev.iovs, 711 bdev_io->u.bdev.iovcnt, 712 bdev_io->u.bdev.md_buf, 713 bdev_io->u.bdev.num_blocks, 714 bdev_io->u.bdev.offset_blocks, 715 bdev->dif_check_flags); 716 717 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 718 return bdev_nvme_comparev_and_writev(nvme_ns->ns, 719 qpair, 720 nbdev_io, 721 bdev_io->u.bdev.iovs, 722 bdev_io->u.bdev.iovcnt, 723 bdev_io->u.bdev.fused_iovs, 724 bdev_io->u.bdev.fused_iovcnt, 725 bdev_io->u.bdev.md_buf, 726 bdev_io->u.bdev.num_blocks, 727 bdev_io->u.bdev.offset_blocks, 728 bdev->dif_check_flags); 729 730 case 
SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 731 return bdev_nvme_unmap(nvme_ns->ns, 732 qpair, 733 nbdev_io, 734 bdev_io->u.bdev.offset_blocks, 735 bdev_io->u.bdev.num_blocks); 736 737 case SPDK_BDEV_IO_TYPE_UNMAP: 738 return bdev_nvme_unmap(nvme_ns->ns, 739 qpair, 740 nbdev_io, 741 bdev_io->u.bdev.offset_blocks, 742 bdev_io->u.bdev.num_blocks); 743 744 case SPDK_BDEV_IO_TYPE_RESET: 745 return bdev_nvme_reset(nvme_ch, nbdev_io); 746 747 case SPDK_BDEV_IO_TYPE_FLUSH: 748 return bdev_nvme_flush(nvme_ns->ns, 749 qpair, 750 nbdev_io, 751 bdev_io->u.bdev.offset_blocks, 752 bdev_io->u.bdev.num_blocks); 753 754 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 755 return bdev_nvme_admin_passthru(nvme_ch, 756 nbdev_io, 757 &bdev_io->u.nvme_passthru.cmd, 758 bdev_io->u.nvme_passthru.buf, 759 bdev_io->u.nvme_passthru.nbytes); 760 761 case SPDK_BDEV_IO_TYPE_NVME_IO: 762 return bdev_nvme_io_passthru(nvme_ns->ns, 763 qpair, 764 nbdev_io, 765 &bdev_io->u.nvme_passthru.cmd, 766 bdev_io->u.nvme_passthru.buf, 767 bdev_io->u.nvme_passthru.nbytes); 768 769 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 770 return bdev_nvme_io_passthru_md(nvme_ns->ns, 771 qpair, 772 nbdev_io, 773 &bdev_io->u.nvme_passthru.cmd, 774 bdev_io->u.nvme_passthru.buf, 775 bdev_io->u.nvme_passthru.nbytes, 776 bdev_io->u.nvme_passthru.md_buf, 777 bdev_io->u.nvme_passthru.md_len); 778 779 case SPDK_BDEV_IO_TYPE_ABORT: 780 nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 781 return bdev_nvme_abort(nvme_ch, 782 nbdev_io, 783 nbdev_io_to_abort); 784 785 default: 786 return -EINVAL; 787 } 788 return 0; 789 } 790 791 static void 792 bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 793 { 794 int rc = _bdev_nvme_submit_request(ch, bdev_io); 795 796 if (spdk_unlikely(rc != 0)) { 797 if (rc == -ENOMEM) { 798 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); 799 } else { 800 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 801 } 802 } 803 } 804 805 static bool 806 bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) 807 { 808 struct nvme_bdev *nbdev = ctx; 809 struct nvme_bdev_ns *nvme_ns; 810 struct spdk_nvme_ns *ns; 811 struct spdk_nvme_ctrlr *ctrlr; 812 const struct spdk_nvme_ctrlr_data *cdata; 813 814 nvme_ns = nvme_bdev_to_bdev_ns(nbdev); 815 assert(nvme_ns != NULL); 816 ns = nvme_ns->ns; 817 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 818 819 switch (io_type) { 820 case SPDK_BDEV_IO_TYPE_READ: 821 case SPDK_BDEV_IO_TYPE_WRITE: 822 case SPDK_BDEV_IO_TYPE_RESET: 823 case SPDK_BDEV_IO_TYPE_FLUSH: 824 case SPDK_BDEV_IO_TYPE_NVME_ADMIN: 825 case SPDK_BDEV_IO_TYPE_NVME_IO: 826 case SPDK_BDEV_IO_TYPE_ABORT: 827 return true; 828 829 case SPDK_BDEV_IO_TYPE_COMPARE: 830 return spdk_nvme_ns_supports_compare(ns); 831 832 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 833 return spdk_nvme_ns_get_md_size(ns) ? true : false; 834 835 case SPDK_BDEV_IO_TYPE_UNMAP: 836 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 837 return cdata->oncs.dsm; 838 839 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 840 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 841 /* 842 * If an NVMe controller guarantees reading unallocated blocks returns zero, 843 * we can implement WRITE_ZEROES as an NVMe deallocate command. 844 */ 845 if (cdata->oncs.dsm && 846 spdk_nvme_ns_get_dealloc_logical_block_read_value(ns) == 847 SPDK_NVME_DEALLOC_READ_00) { 848 return true; 849 } 850 /* 851 * The NVMe controller write_zeroes function is currently not used by our driver. 
852 * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. 853 * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. 854 */ 855 return false; 856 857 case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE: 858 if (spdk_nvme_ctrlr_get_flags(ctrlr) & 859 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) { 860 return true; 861 } 862 return false; 863 864 default: 865 return false; 866 } 867 } 868 869 static int 870 bdev_nvme_create_cb(void *io_device, void *ctx_buf) 871 { 872 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = io_device; 873 struct nvme_io_channel *nvme_ch = ctx_buf; 874 struct spdk_io_channel *pg_ch = NULL; 875 int rc; 876 877 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 878 rc = bdev_ocssd_create_io_channel(nvme_ch); 879 if (rc != 0) { 880 return rc; 881 } 882 } 883 884 pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs); 885 if (!pg_ch) { 886 rc = -1; 887 goto err_pg_ch; 888 } 889 890 nvme_ch->group = spdk_io_channel_get_ctx(pg_ch); 891 892 #ifdef SPDK_CONFIG_VTUNE 893 nvme_ch->group->collect_spin_stat = true; 894 #else 895 nvme_ch->group->collect_spin_stat = false; 896 #endif 897 898 TAILQ_INIT(&nvme_ch->pending_resets); 899 900 nvme_ch->ctrlr = nvme_bdev_ctrlr; 901 902 rc = bdev_nvme_create_qpair(nvme_ch); 903 if (rc != 0) { 904 goto err_qpair; 905 } 906 907 return 0; 908 909 err_qpair: 910 spdk_put_io_channel(pg_ch); 911 err_pg_ch: 912 if (nvme_ch->ocssd_ch) { 913 bdev_ocssd_destroy_io_channel(nvme_ch); 914 } 915 916 return rc; 917 } 918 919 static void 920 bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) 921 { 922 struct nvme_io_channel *nvme_ch = ctx_buf; 923 924 assert(nvme_ch->group != NULL); 925 926 if (nvme_ch->ocssd_ch != NULL) { 927 bdev_ocssd_destroy_io_channel(nvme_ch); 928 } 929 930 spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); 931 932 spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_ch->group)); 933 } 934 935 static int 936 bdev_nvme_poll_group_create_cb(void *io_device, void *ctx_buf) 937 { 938 struct nvme_bdev_poll_group *group = ctx_buf; 939 940 group->group = spdk_nvme_poll_group_create(group); 941 if (group->group == NULL) { 942 return -1; 943 } 944 945 group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, g_opts.nvme_ioq_poll_period_us); 946 947 if (group->poller == NULL) { 948 spdk_nvme_poll_group_destroy(group->group); 949 return -1; 950 } 951 952 return 0; 953 } 954 955 static void 956 bdev_nvme_poll_group_destroy_cb(void *io_device, void *ctx_buf) 957 { 958 struct nvme_bdev_poll_group *group = ctx_buf; 959 960 spdk_poller_unregister(&group->poller); 961 if (spdk_nvme_poll_group_destroy(group->group)) { 962 SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module."); 963 assert(false); 964 } 965 } 966 967 static struct spdk_io_channel * 968 bdev_nvme_get_io_channel(void *ctx) 969 { 970 struct nvme_bdev *nvme_bdev = ctx; 971 972 return spdk_get_io_channel(nvme_bdev->nvme_ns->ctrlr); 973 } 974 975 static void * 976 bdev_nvme_get_module_ctx(void *ctx) 977 { 978 struct nvme_bdev *nvme_bdev = ctx; 979 980 return bdev_nvme_get_ctrlr(&nvme_bdev->disk); 981 } 982 983 static int 984 bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) 985 { 986 struct nvme_bdev *nvme_bdev = ctx; 987 struct nvme_bdev_ns *nvme_ns; 988 struct spdk_nvme_ns *ns; 989 struct spdk_nvme_ctrlr *ctrlr; 990 const struct spdk_nvme_ctrlr_data *cdata; 991 const struct spdk_nvme_transport_id *trid; 992 union spdk_nvme_vs_register vs; 993 union 
spdk_nvme_csts_register csts; 994 char buf[128]; 995 996 nvme_ns = nvme_bdev_to_bdev_ns(nvme_bdev); 997 assert(nvme_ns != NULL); 998 ns = nvme_ns->ns; 999 ctrlr = spdk_nvme_ns_get_ctrlr(ns); 1000 1001 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1002 trid = spdk_nvme_ctrlr_get_transport_id(ctrlr); 1003 vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr); 1004 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1005 1006 spdk_json_write_named_object_begin(w, "nvme"); 1007 1008 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1009 spdk_json_write_named_string(w, "pci_address", trid->traddr); 1010 } 1011 1012 spdk_json_write_named_object_begin(w, "trid"); 1013 1014 nvme_bdev_dump_trid_json(trid, w); 1015 1016 spdk_json_write_object_end(w); 1017 1018 #ifdef SPDK_CONFIG_NVME_CUSE 1019 size_t cuse_name_size = 128; 1020 char cuse_name[cuse_name_size]; 1021 1022 int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns), 1023 cuse_name, &cuse_name_size); 1024 if (rc == 0) { 1025 spdk_json_write_named_string(w, "cuse_device", cuse_name); 1026 } 1027 #endif 1028 1029 spdk_json_write_named_object_begin(w, "ctrlr_data"); 1030 1031 spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); 1032 1033 snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); 1034 spdk_str_trim(buf); 1035 spdk_json_write_named_string(w, "model_number", buf); 1036 1037 snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); 1038 spdk_str_trim(buf); 1039 spdk_json_write_named_string(w, "serial_number", buf); 1040 1041 snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); 1042 spdk_str_trim(buf); 1043 spdk_json_write_named_string(w, "firmware_revision", buf); 1044 1045 if (cdata->subnqn[0] != '\0') { 1046 spdk_json_write_named_string(w, "subnqn", cdata->subnqn); 1047 } 1048 1049 spdk_json_write_named_object_begin(w, "oacs"); 1050 1051 spdk_json_write_named_uint32(w, "security", cdata->oacs.security); 1052 spdk_json_write_named_uint32(w, "format", cdata->oacs.format); 1053 spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); 1054 spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); 1055 1056 spdk_json_write_object_end(w); 1057 1058 spdk_json_write_object_end(w); 1059 1060 spdk_json_write_named_object_begin(w, "vs"); 1061 1062 spdk_json_write_name(w, "nvme_version"); 1063 if (vs.bits.ter) { 1064 spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); 1065 } else { 1066 spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); 1067 } 1068 1069 spdk_json_write_object_end(w); 1070 1071 spdk_json_write_named_object_begin(w, "csts"); 1072 1073 spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); 1074 spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); 1075 1076 spdk_json_write_object_end(w); 1077 1078 spdk_json_write_named_object_begin(w, "ns_data"); 1079 1080 spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); 1081 1082 spdk_json_write_object_end(w); 1083 1084 if (cdata->oacs.security) { 1085 spdk_json_write_named_object_begin(w, "security"); 1086 1087 spdk_json_write_named_bool(w, "opal", nvme_bdev->opal); 1088 1089 spdk_json_write_object_end(w); 1090 } 1091 1092 spdk_json_write_object_end(w); 1093 1094 return 0; 1095 } 1096 1097 static void 1098 bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1099 { 1100 /* No config per bdev needed */ 1101 } 1102 1103 static uint64_t 1104 bdev_nvme_get_spin_time(struct spdk_io_channel *ch) 1105 { 1106 struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); 1107 struct 
nvme_bdev_poll_group *group = nvme_ch->group; 1108 uint64_t spin_time; 1109 1110 if (!group || !group->collect_spin_stat) { 1111 return 0; 1112 } 1113 1114 if (group->end_ticks != 0) { 1115 group->spin_ticks += (group->end_ticks - group->start_ticks); 1116 group->end_ticks = 0; 1117 } 1118 1119 spin_time = (group->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); 1120 group->start_ticks = 0; 1121 group->spin_ticks = 0; 1122 1123 return spin_time; 1124 } 1125 1126 static const struct spdk_bdev_fn_table nvmelib_fn_table = { 1127 .destruct = bdev_nvme_destruct, 1128 .submit_request = bdev_nvme_submit_request, 1129 .io_type_supported = bdev_nvme_io_type_supported, 1130 .get_io_channel = bdev_nvme_get_io_channel, 1131 .dump_info_json = bdev_nvme_dump_info_json, 1132 .write_config_json = bdev_nvme_write_config_json, 1133 .get_spin_time = bdev_nvme_get_spin_time, 1134 .get_module_ctx = bdev_nvme_get_module_ctx, 1135 }; 1136 1137 static int 1138 nvme_disk_create(struct spdk_bdev *disk, const char *base_name, 1139 struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns, 1140 uint32_t prchk_flags, void *ctx) 1141 { 1142 const struct spdk_uuid *uuid; 1143 const struct spdk_nvme_ctrlr_data *cdata; 1144 const struct spdk_nvme_ns_data *nsdata; 1145 int rc; 1146 1147 cdata = spdk_nvme_ctrlr_get_data(ctrlr); 1148 1149 disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns)); 1150 if (!disk->name) { 1151 return -ENOMEM; 1152 } 1153 disk->product_name = "NVMe disk"; 1154 1155 disk->write_cache = 0; 1156 if (cdata->vwc.present) { 1157 /* Enable if the Volatile Write Cache exists */ 1158 disk->write_cache = 1; 1159 } 1160 disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns); 1161 disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns); 1162 disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); 1163 1164 uuid = spdk_nvme_ns_get_uuid(ns); 1165 if (uuid != NULL) { 1166 disk->uuid = *uuid; 1167 } 1168 1169 nsdata = spdk_nvme_ns_get_data(ns); 1170 1171 disk->md_len = spdk_nvme_ns_get_md_size(ns); 1172 if (disk->md_len != 0) { 1173 disk->md_interleave = nsdata->flbas.extended; 1174 disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns); 1175 if (disk->dif_type != SPDK_DIF_DISABLE) { 1176 disk->dif_is_head_of_md = nsdata->dps.md_start; 1177 disk->dif_check_flags = prchk_flags; 1178 } 1179 } 1180 1181 if (!(spdk_nvme_ctrlr_get_flags(ctrlr) & 1182 SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) { 1183 disk->acwu = 0; 1184 } else if (nsdata->nsfeat.ns_atomic_write_unit) { 1185 disk->acwu = nsdata->nacwu; 1186 } else { 1187 disk->acwu = cdata->acwu; 1188 } 1189 1190 disk->ctxt = ctx; 1191 disk->fn_table = &nvmelib_fn_table; 1192 disk->module = &nvme_if; 1193 rc = spdk_bdev_register(disk); 1194 if (rc) { 1195 SPDK_ERRLOG("spdk_bdev_register() failed\n"); 1196 free(disk->name); 1197 return rc; 1198 } 1199 1200 return 0; 1201 } 1202 1203 static int 1204 nvme_bdev_create(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct nvme_bdev_ns *nvme_ns) 1205 { 1206 struct nvme_bdev *bdev; 1207 int rc; 1208 1209 bdev = calloc(1, sizeof(*bdev)); 1210 if (!bdev) { 1211 SPDK_ERRLOG("bdev calloc() failed\n"); 1212 return -ENOMEM; 1213 } 1214 1215 bdev->nvme_ns = nvme_ns; 1216 bdev->opal = nvme_bdev_ctrlr->opal_dev != NULL; 1217 1218 rc = nvme_disk_create(&bdev->disk, nvme_bdev_ctrlr->name, nvme_bdev_ctrlr->ctrlr, 1219 nvme_ns->ns, nvme_bdev_ctrlr->prchk_flags, bdev); 1220 if (rc != 0) { 1221 SPDK_ERRLOG("Failed to create NVMe disk\n"); 1222 free(bdev); 1223 return rc; 1224 } 1225 1226 nvme_ns->ref++; 
1227 TAILQ_INSERT_TAIL(&nvme_ns->bdevs, bdev, tailq); 1228 1229 return 0; 1230 } 1231 1232 static void 1233 nvme_ctrlr_populate_standard_namespace(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1234 struct nvme_bdev_ns *nvme_ns, struct nvme_async_probe_ctx *ctx) 1235 { 1236 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1237 struct spdk_nvme_ns *ns; 1238 int rc = 0; 1239 1240 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id); 1241 if (!ns) { 1242 SPDK_DEBUGLOG(bdev_nvme, "Invalid NS %d\n", nvme_ns->id); 1243 rc = -EINVAL; 1244 goto done; 1245 } 1246 1247 nvme_ns->ns = ns; 1248 nvme_ns->ref = 1; 1249 1250 rc = nvme_bdev_create(nvme_bdev_ctrlr, nvme_ns); 1251 done: 1252 nvme_ctrlr_populate_namespace_done(ctx, nvme_ns, rc); 1253 } 1254 1255 static bool 1256 hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1257 struct spdk_nvme_ctrlr_opts *opts) 1258 { 1259 struct nvme_probe_skip_entry *entry; 1260 1261 TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) { 1262 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 1263 return false; 1264 } 1265 } 1266 1267 opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst; 1268 opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight; 1269 opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight; 1270 opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight; 1271 1272 SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr); 1273 1274 return true; 1275 } 1276 1277 static void 1278 nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) 1279 { 1280 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1281 1282 if (spdk_nvme_cpl_is_error(cpl)) { 1283 SPDK_WARNLOG("Abort failed. Resetting controller.\n"); 1284 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1285 } 1286 } 1287 1288 static void 1289 timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, 1290 struct spdk_nvme_qpair *qpair, uint16_t cid) 1291 { 1292 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_arg; 1293 union spdk_nvme_csts_register csts; 1294 int rc; 1295 1296 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1297 1298 SPDK_WARNLOG("Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); 1299 1300 /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O 1301 * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we 1302 * would submit another fabrics cmd on the admin queue to read CSTS and check for its 1303 * completion recursively. 1304 */ 1305 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) { 1306 csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); 1307 if (csts.bits.cfs) { 1308 SPDK_ERRLOG("Controller Fatal Status, reset required\n"); 1309 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1310 return; 1311 } 1312 } 1313 1314 switch (g_opts.action_on_timeout) { 1315 case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: 1316 if (qpair) { 1317 rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, 1318 nvme_abort_cpl, nvme_bdev_ctrlr); 1319 if (rc == 0) { 1320 return; 1321 } 1322 1323 SPDK_ERRLOG("Unable to send abort. 
Resetting.\n"); 1324 } 1325 1326 /* FALLTHROUGH */ 1327 case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: 1328 _bdev_nvme_reset(nvme_bdev_ctrlr, NULL); 1329 break; 1330 case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: 1331 SPDK_DEBUGLOG(bdev_nvme, "No action for nvme controller timeout.\n"); 1332 break; 1333 default: 1334 SPDK_ERRLOG("An invalid timeout action value is found.\n"); 1335 break; 1336 } 1337 } 1338 1339 void 1340 nvme_ctrlr_depopulate_namespace_done(struct nvme_bdev_ns *nvme_ns) 1341 { 1342 nvme_bdev_ns_detach(nvme_ns); 1343 } 1344 1345 static void 1346 nvme_ctrlr_depopulate_standard_namespace(struct nvme_bdev_ns *nvme_ns) 1347 { 1348 struct nvme_bdev *bdev, *tmp; 1349 1350 TAILQ_FOREACH_SAFE(bdev, &nvme_ns->bdevs, tailq, tmp) { 1351 spdk_bdev_unregister(&bdev->disk, NULL, NULL); 1352 } 1353 1354 nvme_ns->populated = false; 1355 1356 nvme_ctrlr_depopulate_namespace_done(nvme_ns); 1357 } 1358 1359 static void 1360 nvme_ctrlr_populate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns, 1361 struct nvme_async_probe_ctx *ctx) 1362 { 1363 g_populate_namespace_fn[nvme_ns->type](ctrlr, nvme_ns, ctx); 1364 } 1365 1366 static void 1367 nvme_ctrlr_depopulate_namespace(struct nvme_bdev_ctrlr *ctrlr, struct nvme_bdev_ns *nvme_ns) 1368 { 1369 g_depopulate_namespace_fn[nvme_ns->type](nvme_ns); 1370 } 1371 1372 void 1373 nvme_ctrlr_populate_namespace_done(struct nvme_async_probe_ctx *ctx, 1374 struct nvme_bdev_ns *nvme_ns, int rc) 1375 { 1376 if (rc == 0) { 1377 nvme_ns->populated = true; 1378 pthread_mutex_lock(&g_bdev_nvme_mutex); 1379 nvme_ns->ctrlr->ref++; 1380 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1381 } else { 1382 memset(nvme_ns, 0, sizeof(*nvme_ns)); 1383 } 1384 1385 if (ctx) { 1386 ctx->populates_in_progress--; 1387 if (ctx->populates_in_progress == 0) { 1388 nvme_ctrlr_populate_namespaces_done(ctx); 1389 } 1390 } 1391 } 1392 1393 static void 1394 nvme_ctrlr_populate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, 1395 struct nvme_async_probe_ctx *ctx) 1396 { 1397 struct spdk_nvme_ctrlr *ctrlr = nvme_bdev_ctrlr->ctrlr; 1398 struct nvme_bdev_ns *nvme_ns; 1399 struct spdk_nvme_ns *ns; 1400 struct nvme_bdev *bdev; 1401 uint32_t i; 1402 int rc; 1403 uint64_t num_sectors; 1404 bool ns_is_active; 1405 1406 if (ctx) { 1407 /* Initialize this count to 1 to handle the populate functions 1408 * calling nvme_ctrlr_populate_namespace_done() immediately. 
1409 */ 1410 ctx->populates_in_progress = 1; 1411 } 1412 1413 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1414 uint32_t nsid = i + 1; 1415 1416 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1417 ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); 1418 1419 if (nvme_ns->populated && ns_is_active && nvme_ns->type == NVME_BDEV_NS_STANDARD) { 1420 /* NS is still there but attributes may have changed */ 1421 ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 1422 num_sectors = spdk_nvme_ns_get_num_sectors(ns); 1423 bdev = TAILQ_FIRST(&nvme_ns->bdevs); 1424 if (bdev->disk.blockcnt != num_sectors) { 1425 SPDK_NOTICELOG("NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n", 1426 nsid, 1427 bdev->disk.name, 1428 bdev->disk.blockcnt, 1429 num_sectors); 1430 rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors); 1431 if (rc != 0) { 1432 SPDK_ERRLOG("Could not change num blocks for nvme bdev: name %s, errno: %d.\n", 1433 bdev->disk.name, rc); 1434 } 1435 } 1436 } 1437 1438 if (!nvme_ns->populated && ns_is_active) { 1439 nvme_ns->id = nsid; 1440 nvme_ns->ctrlr = nvme_bdev_ctrlr; 1441 if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) { 1442 nvme_ns->type = NVME_BDEV_NS_OCSSD; 1443 } else { 1444 nvme_ns->type = NVME_BDEV_NS_STANDARD; 1445 } 1446 1447 TAILQ_INIT(&nvme_ns->bdevs); 1448 1449 if (ctx) { 1450 ctx->populates_in_progress++; 1451 } 1452 nvme_ctrlr_populate_namespace(nvme_bdev_ctrlr, nvme_ns, ctx); 1453 } 1454 1455 if (nvme_ns->populated && !ns_is_active) { 1456 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1457 } 1458 } 1459 1460 if (ctx) { 1461 /* Decrement this count now that the loop is over to account 1462 * for the one we started with. If the count is then 0, we 1463 * know any populate_namespace functions completed immediately, 1464 * so we'll kick the callback here. 
1465 */ 1466 ctx->populates_in_progress--; 1467 if (ctx->populates_in_progress == 0) { 1468 nvme_ctrlr_populate_namespaces_done(ctx); 1469 } 1470 } 1471 1472 } 1473 1474 static void 1475 nvme_ctrlr_depopulate_namespaces(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 1476 { 1477 uint32_t i; 1478 struct nvme_bdev_ns *nvme_ns; 1479 1480 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1481 uint32_t nsid = i + 1; 1482 1483 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1484 if (nvme_ns->populated) { 1485 assert(nvme_ns->id == nsid); 1486 nvme_ctrlr_depopulate_namespace(nvme_bdev_ctrlr, nvme_ns); 1487 } 1488 } 1489 } 1490 1491 static void 1492 aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) 1493 { 1494 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = arg; 1495 union spdk_nvme_async_event_completion event; 1496 1497 if (spdk_nvme_cpl_is_error(cpl)) { 1498 SPDK_WARNLOG("AER request execute failed"); 1499 return; 1500 } 1501 1502 event.raw = cpl->cdw0; 1503 if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && 1504 (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { 1505 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1506 } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_VENDOR) && 1507 (event.bits.log_page_identifier == SPDK_OCSSD_LOG_CHUNK_NOTIFICATION) && 1508 spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1509 bdev_ocssd_handle_chunk_notification(nvme_bdev_ctrlr); 1510 } 1511 } 1512 1513 static int 1514 nvme_bdev_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr, 1515 const char *name, 1516 const struct spdk_nvme_transport_id *trid, 1517 uint32_t prchk_flags) 1518 { 1519 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1520 struct nvme_bdev_ctrlr_trid *trid_entry; 1521 uint32_t i; 1522 int rc; 1523 1524 nvme_bdev_ctrlr = calloc(1, sizeof(*nvme_bdev_ctrlr)); 1525 if (nvme_bdev_ctrlr == NULL) { 1526 SPDK_ERRLOG("Failed to allocate device struct\n"); 1527 return -ENOMEM; 1528 } 1529 1530 TAILQ_INIT(&nvme_bdev_ctrlr->trids); 1531 nvme_bdev_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 1532 nvme_bdev_ctrlr->namespaces = calloc(nvme_bdev_ctrlr->num_ns, sizeof(struct nvme_bdev_ns *)); 1533 if (!nvme_bdev_ctrlr->namespaces) { 1534 SPDK_ERRLOG("Failed to allocate block namespaces pointer\n"); 1535 rc = -ENOMEM; 1536 goto err_alloc_namespaces; 1537 } 1538 1539 trid_entry = calloc(1, sizeof(*trid_entry)); 1540 if (trid_entry == NULL) { 1541 SPDK_ERRLOG("Failed to allocate trid entry pointer\n"); 1542 rc = -ENOMEM; 1543 goto err_alloc_trid; 1544 } 1545 1546 trid_entry->trid = *trid; 1547 1548 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1549 nvme_bdev_ctrlr->namespaces[i] = calloc(1, sizeof(struct nvme_bdev_ns)); 1550 if (nvme_bdev_ctrlr->namespaces[i] == NULL) { 1551 SPDK_ERRLOG("Failed to allocate block namespace struct\n"); 1552 rc = -ENOMEM; 1553 goto err_alloc_namespace; 1554 } 1555 } 1556 1557 nvme_bdev_ctrlr->thread = spdk_get_thread(); 1558 nvme_bdev_ctrlr->adminq_timer_poller = NULL; 1559 nvme_bdev_ctrlr->ctrlr = ctrlr; 1560 nvme_bdev_ctrlr->ref = 1; 1561 nvme_bdev_ctrlr->connected_trid = &trid_entry->trid; 1562 nvme_bdev_ctrlr->name = strdup(name); 1563 if (nvme_bdev_ctrlr->name == NULL) { 1564 rc = -ENOMEM; 1565 goto err_alloc_name; 1566 } 1567 1568 if (spdk_nvme_ctrlr_is_ocssd_supported(nvme_bdev_ctrlr->ctrlr)) { 1569 rc = bdev_ocssd_init_ctrlr(nvme_bdev_ctrlr); 1570 if (spdk_unlikely(rc != 0)) { 1571 SPDK_ERRLOG("Unable to initialize OCSSD controller\n"); 1572 goto err_init_ocssd; 1573 } 1574 } 1575 1576 
nvme_bdev_ctrlr->prchk_flags = prchk_flags; 1577 1578 spdk_io_device_register(nvme_bdev_ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, 1579 sizeof(struct nvme_io_channel), 1580 name); 1581 1582 nvme_bdev_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_bdev_ctrlr, 1583 g_opts.nvme_adminq_poll_period_us); 1584 1585 TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nvme_bdev_ctrlr, tailq); 1586 1587 if (g_opts.timeout_us > 0) { 1588 spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, 1589 timeout_cb, nvme_bdev_ctrlr); 1590 } 1591 1592 spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_bdev_ctrlr); 1593 spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_bdev_ctrlr); 1594 1595 if (spdk_nvme_ctrlr_get_flags(nvme_bdev_ctrlr->ctrlr) & 1596 SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) { 1597 nvme_bdev_ctrlr->opal_dev = spdk_opal_dev_construct(nvme_bdev_ctrlr->ctrlr); 1598 if (nvme_bdev_ctrlr->opal_dev == NULL) { 1599 SPDK_ERRLOG("Failed to initialize Opal\n"); 1600 } 1601 } 1602 1603 TAILQ_INSERT_HEAD(&nvme_bdev_ctrlr->trids, trid_entry, link); 1604 return 0; 1605 1606 err_init_ocssd: 1607 free(nvme_bdev_ctrlr->name); 1608 err_alloc_name: 1609 err_alloc_namespace: 1610 for (; i > 0; i--) { 1611 free(nvme_bdev_ctrlr->namespaces[i - 1]); 1612 } 1613 free(trid_entry); 1614 err_alloc_trid: 1615 free(nvme_bdev_ctrlr->namespaces); 1616 err_alloc_namespaces: 1617 free(nvme_bdev_ctrlr); 1618 return rc; 1619 } 1620 1621 static void 1622 attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1623 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1624 { 1625 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1626 struct nvme_probe_ctx *ctx = cb_ctx; 1627 char *name = NULL; 1628 uint32_t prchk_flags = 0; 1629 size_t i; 1630 1631 if (ctx) { 1632 for (i = 0; i < ctx->count; i++) { 1633 if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { 1634 prchk_flags = ctx->prchk_flags[i]; 1635 name = strdup(ctx->names[i]); 1636 break; 1637 } 1638 } 1639 } else { 1640 name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); 1641 } 1642 if (!name) { 1643 SPDK_ERRLOG("Failed to assign name to NVMe device\n"); 1644 return; 1645 } 1646 1647 SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name); 1648 1649 nvme_bdev_ctrlr_create(ctrlr, name, trid, prchk_flags); 1650 1651 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(trid); 1652 if (!nvme_bdev_ctrlr) { 1653 SPDK_ERRLOG("Failed to find new NVMe controller\n"); 1654 free(name); 1655 return; 1656 } 1657 1658 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, NULL); 1659 1660 free(name); 1661 } 1662 1663 static void 1664 _nvme_bdev_ctrlr_destruct(void *ctx) 1665 { 1666 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = ctx; 1667 1668 nvme_ctrlr_depopulate_namespaces(nvme_bdev_ctrlr); 1669 nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1670 } 1671 1672 static void 1673 remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) 1674 { 1675 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr = cb_ctx; 1676 1677 pthread_mutex_lock(&g_bdev_nvme_mutex); 1678 assert(nvme_bdev_ctrlr->ctrlr == ctrlr); 1679 /* The controller's destruction was already started */ 1680 if (nvme_bdev_ctrlr->destruct) { 1681 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1682 return; 1683 } 1684 nvme_bdev_ctrlr->destruct = true; 1685 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1686 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 1687 } 1688 1689 static int 1690 bdev_nvme_hotplug_probe(void *arg) 1691 { 1692 if 
(spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) { 1693 g_hotplug_probe_ctx = NULL; 1694 spdk_poller_unregister(&g_hotplug_probe_poller); 1695 } 1696 1697 return SPDK_POLLER_BUSY; 1698 } 1699 1700 static int 1701 bdev_nvme_hotplug(void *arg) 1702 { 1703 struct spdk_nvme_transport_id trid_pcie; 1704 1705 if (g_hotplug_probe_ctx) { 1706 return SPDK_POLLER_BUSY; 1707 } 1708 1709 memset(&trid_pcie, 0, sizeof(trid_pcie)); 1710 spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); 1711 1712 g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL, 1713 hotplug_probe_cb, attach_cb, NULL); 1714 1715 if (g_hotplug_probe_ctx) { 1716 assert(g_hotplug_probe_poller == NULL); 1717 g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000); 1718 } 1719 1720 return SPDK_POLLER_BUSY; 1721 } 1722 1723 void 1724 bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) 1725 { 1726 *opts = g_opts; 1727 } 1728 1729 int 1730 bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) 1731 { 1732 if (g_bdev_nvme_init_thread != NULL) { 1733 if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 1734 return -EPERM; 1735 } 1736 } 1737 1738 g_opts = *opts; 1739 1740 return 0; 1741 } 1742 1743 struct set_nvme_hotplug_ctx { 1744 uint64_t period_us; 1745 bool enabled; 1746 spdk_msg_fn fn; 1747 void *fn_ctx; 1748 }; 1749 1750 static void 1751 set_nvme_hotplug_period_cb(void *_ctx) 1752 { 1753 struct set_nvme_hotplug_ctx *ctx = _ctx; 1754 1755 spdk_poller_unregister(&g_hotplug_poller); 1756 if (ctx->enabled) { 1757 g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us); 1758 } 1759 1760 g_nvme_hotplug_poll_period_us = ctx->period_us; 1761 g_nvme_hotplug_enabled = ctx->enabled; 1762 if (ctx->fn) { 1763 ctx->fn(ctx->fn_ctx); 1764 } 1765 1766 free(ctx); 1767 } 1768 1769 int 1770 bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx) 1771 { 1772 struct set_nvme_hotplug_ctx *ctx; 1773 1774 if (enabled == true && !spdk_process_is_primary()) { 1775 return -EPERM; 1776 } 1777 1778 ctx = calloc(1, sizeof(*ctx)); 1779 if (ctx == NULL) { 1780 return -ENOMEM; 1781 } 1782 1783 period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; 1784 ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); 1785 ctx->enabled = enabled; 1786 ctx->fn = cb; 1787 ctx->fn_ctx = cb_ctx; 1788 1789 spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); 1790 return 0; 1791 } 1792 1793 static void 1794 populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, size_t count, int rc) 1795 { 1796 if (ctx->cb_fn) { 1797 ctx->cb_fn(ctx->cb_ctx, count, rc); 1798 } 1799 1800 ctx->namespaces_populated = true; 1801 if (ctx->probe_done) { 1802 /* The probe was already completed, so we need to free the context 1803 * here. This can happen for cases like OCSSD, where we need to 1804 * send additional commands to the SSD after attach. 1805 */ 1806 free(ctx); 1807 } 1808 } 1809 1810 static void 1811 nvme_ctrlr_populate_namespaces_done(struct nvme_async_probe_ctx *ctx) 1812 { 1813 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1814 struct nvme_bdev_ns *nvme_ns; 1815 struct nvme_bdev *nvme_bdev, *tmp; 1816 uint32_t i, nsid; 1817 size_t j; 1818 1819 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 1820 assert(nvme_bdev_ctrlr != NULL); 1821 1822 /* 1823 * Report the new bdevs that were created in this call. 1824 * There can be more than one bdev per NVMe controller. 
1825 */ 1826 j = 0; 1827 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1828 nsid = i + 1; 1829 nvme_ns = nvme_bdev_ctrlr->namespaces[nsid - 1]; 1830 if (!nvme_ns->populated) { 1831 continue; 1832 } 1833 assert(nvme_ns->id == nsid); 1834 TAILQ_FOREACH_SAFE(nvme_bdev, &nvme_ns->bdevs, tailq, tmp) { 1835 if (j < ctx->count) { 1836 ctx->names[j] = nvme_bdev->disk.name; 1837 j++; 1838 } else { 1839 SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %du. Unable to return all names of created bdevs\n", 1840 ctx->count); 1841 populate_namespaces_cb(ctx, 0, -ERANGE); 1842 return; 1843 } 1844 } 1845 } 1846 1847 populate_namespaces_cb(ctx, j, 0); 1848 } 1849 1850 static bool 1851 bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2) 1852 { 1853 const struct spdk_nvme_ns_data *nsdata1, *nsdata2; 1854 1855 nsdata1 = spdk_nvme_ns_get_data(ns1); 1856 nsdata2 = spdk_nvme_ns_get_data(ns2); 1857 1858 return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)); 1859 } 1860 1861 static int 1862 bdev_nvme_add_trid(struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, struct spdk_nvme_ctrlr *new_ctrlr, 1863 struct spdk_nvme_transport_id *trid) 1864 { 1865 uint32_t i, nsid; 1866 struct nvme_bdev_ns *nvme_ns; 1867 struct spdk_nvme_ns *new_ns; 1868 struct nvme_bdev_ctrlr_trid *new_trid, *tmp_trid; 1869 int rc = 0; 1870 1871 assert(nvme_bdev_ctrlr != NULL); 1872 1873 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 1874 SPDK_ERRLOG("PCIe failover is not supported.\n"); 1875 return -ENOTSUP; 1876 } 1877 1878 pthread_mutex_lock(&g_bdev_nvme_mutex); 1879 1880 /* Currently we only support failover to the same transport type. */ 1881 if (nvme_bdev_ctrlr->connected_trid->trtype != trid->trtype) { 1882 rc = -EINVAL; 1883 goto exit; 1884 } 1885 1886 /* Currently we only support failover to the same NQN. */ 1887 if (strncmp(trid->subnqn, nvme_bdev_ctrlr->connected_trid->subnqn, SPDK_NVMF_NQN_MAX_LEN)) { 1888 rc = -EINVAL; 1889 goto exit; 1890 } 1891 1892 /* Skip all the other checks if we've already registered this path. 
*/ 1893 TAILQ_FOREACH(new_trid, &nvme_bdev_ctrlr->trids, link) { 1894 if (!spdk_nvme_transport_id_compare(&new_trid->trid, trid)) { 1895 rc = -EEXIST; 1896 goto exit; 1897 } 1898 } 1899 1900 if (spdk_nvme_ctrlr_get_num_ns(new_ctrlr) != nvme_bdev_ctrlr->num_ns) { 1901 rc = -EINVAL; 1902 goto exit; 1903 } 1904 1905 for (i = 0; i < nvme_bdev_ctrlr->num_ns; i++) { 1906 nsid = i + 1; 1907 1908 nvme_ns = nvme_bdev_ctrlr->namespaces[i]; 1909 if (!nvme_ns->populated) { 1910 continue; 1911 } 1912 1913 new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nsid); 1914 assert(new_ns != NULL); 1915 1916 if (bdev_nvme_compare_ns(nvme_ns->ns, new_ns) != 0) { 1917 rc = -EINVAL; 1918 goto exit; 1919 } 1920 } 1921 1922 new_trid = calloc(1, sizeof(*new_trid)); 1923 if (new_trid == NULL) { 1924 rc = -ENOMEM; 1925 goto exit; 1926 } 1927 new_trid->trid = *trid; 1928 new_trid->is_failed = false; 1929 1930 TAILQ_FOREACH(tmp_trid, &nvme_bdev_ctrlr->trids, link) { 1931 if (tmp_trid->is_failed) { 1932 TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link); 1933 goto exit; 1934 } 1935 } 1936 1937 TAILQ_INSERT_TAIL(&nvme_bdev_ctrlr->trids, new_trid, link); 1938 1939 exit: 1940 pthread_mutex_unlock(&g_bdev_nvme_mutex); 1941 return rc; 1942 } 1943 1944 static void 1945 connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, 1946 struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) 1947 { 1948 struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx; 1949 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 1950 struct nvme_async_probe_ctx *ctx; 1951 int rc; 1952 1953 ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, opts); 1954 ctx->ctrlr_attached = true; 1955 1956 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(ctx->base_name); 1957 if (nvme_bdev_ctrlr) { 1958 /* This is the case that a secondary path is added to an existing 1959 * nvme_bdev_ctrlr for failover. After checking if it can access the same 1960 * namespaces as the primary path, it is disconnected until failover occurs. 1961 */ 1962 rc = bdev_nvme_add_trid(nvme_bdev_ctrlr, ctrlr, &ctx->trid); 1963 1964 spdk_nvme_detach(ctrlr); 1965 goto exit; 1966 } 1967 1968 rc = nvme_bdev_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx->prchk_flags); 1969 if (rc) { 1970 SPDK_ERRLOG("Failed to create new device\n"); 1971 goto exit; 1972 } 1973 1974 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get(&ctx->trid); 1975 assert(nvme_bdev_ctrlr != NULL); 1976 1977 nvme_ctrlr_populate_namespaces(nvme_bdev_ctrlr, ctx); 1978 return; 1979 1980 exit: 1981 populate_namespaces_cb(ctx, 0, rc); 1982 } 1983 1984 static int 1985 bdev_nvme_async_poll(void *arg) 1986 { 1987 struct nvme_async_probe_ctx *ctx = arg; 1988 int rc; 1989 1990 rc = spdk_nvme_probe_poll_async(ctx->probe_ctx); 1991 if (spdk_unlikely(rc != -EAGAIN)) { 1992 ctx->probe_done = true; 1993 spdk_poller_unregister(&ctx->poller); 1994 if (!ctx->ctrlr_attached) { 1995 /* The probe is done, but no controller was attached. 1996 * That means we had a failure, so report -EIO back to 1997 * the caller (usually the RPC). populate_namespaces_cb() 1998 * will take care of freeing the nvme_async_probe_ctx. 1999 */ 2000 populate_namespaces_cb(ctx, 0, -EIO); 2001 } else if (ctx->namespaces_populated) { 2002 /* The namespaces for the attached controller were all 2003 * populated and the response was already sent to the 2004 * caller (usually the RPC). So free the context here. 
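			 * If the namespaces are not yet populated, the context is
			 * instead freed later by populate_namespaces_cb() once it
			 * observes probe_done == true.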
2005 */ 2006 free(ctx); 2007 } 2008 } 2009 2010 return SPDK_POLLER_BUSY; 2011 } 2012 2013 int 2014 bdev_nvme_create(struct spdk_nvme_transport_id *trid, 2015 struct spdk_nvme_host_id *hostid, 2016 const char *base_name, 2017 const char **names, 2018 uint32_t count, 2019 const char *hostnqn, 2020 uint32_t prchk_flags, 2021 spdk_bdev_create_nvme_fn cb_fn, 2022 void *cb_ctx) 2023 { 2024 struct nvme_probe_skip_entry *entry, *tmp; 2025 struct nvme_async_probe_ctx *ctx; 2026 2027 /* TODO expand this check to include both the host and target TRIDs. 2028 * Only if both are the same should we fail. 2029 */ 2030 if (nvme_bdev_ctrlr_get(trid) != NULL) { 2031 SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); 2032 return -EEXIST; 2033 } 2034 2035 ctx = calloc(1, sizeof(*ctx)); 2036 if (!ctx) { 2037 return -ENOMEM; 2038 } 2039 ctx->base_name = base_name; 2040 ctx->names = names; 2041 ctx->count = count; 2042 ctx->cb_fn = cb_fn; 2043 ctx->cb_ctx = cb_ctx; 2044 ctx->prchk_flags = prchk_flags; 2045 ctx->trid = *trid; 2046 2047 if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2048 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) { 2049 if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) { 2050 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2051 free(entry); 2052 break; 2053 } 2054 } 2055 } 2056 2057 spdk_nvme_ctrlr_get_default_ctrlr_opts(&ctx->opts, sizeof(ctx->opts)); 2058 ctx->opts.transport_retry_count = g_opts.retry_count; 2059 ctx->opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms; 2060 2061 if (hostnqn) { 2062 snprintf(ctx->opts.hostnqn, sizeof(ctx->opts.hostnqn), "%s", hostnqn); 2063 } 2064 2065 if (hostid->hostaddr[0] != '\0') { 2066 snprintf(ctx->opts.src_addr, sizeof(ctx->opts.src_addr), "%s", hostid->hostaddr); 2067 } 2068 2069 if (hostid->hostsvcid[0] != '\0') { 2070 snprintf(ctx->opts.src_svcid, sizeof(ctx->opts.src_svcid), "%s", hostid->hostsvcid); 2071 } 2072 2073 ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->opts, connect_attach_cb); 2074 if (ctx->probe_ctx == NULL) { 2075 SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr); 2076 free(ctx); 2077 return -ENODEV; 2078 } 2079 ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000); 2080 2081 return 0; 2082 } 2083 2084 int 2085 bdev_nvme_remove_trid(const char *name, struct spdk_nvme_transport_id *trid) 2086 { 2087 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2088 struct nvme_bdev_ctrlr_trid *ctrlr_trid, *tmp_trid; 2089 2090 if (name == NULL) { 2091 return -EINVAL; 2092 } 2093 2094 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2095 if (nvme_bdev_ctrlr == NULL) { 2096 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2097 return -ENODEV; 2098 } 2099 2100 /* case 1: we are currently using the path to be removed. */ 2101 if (!spdk_nvme_transport_id_compare(trid, nvme_bdev_ctrlr->connected_trid)) { 2102 ctrlr_trid = TAILQ_FIRST(&nvme_bdev_ctrlr->trids); 2103 assert(nvme_bdev_ctrlr->connected_trid == &ctrlr_trid->trid); 2104 /* case 1A: the current path is the only path. */ 2105 if (!TAILQ_NEXT(ctrlr_trid, link)) { 2106 return bdev_nvme_delete(name); 2107 } 2108 2109 /* case 1B: there is an alternative path. */ 2110 return bdev_nvme_failover(nvme_bdev_ctrlr, true); 2111 } 2112 /* case 2: We are not using the specified path. 
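	 * Simply remove it from the list of registered trids; no failover or
	 * controller teardown is needed.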
*/ 2113 TAILQ_FOREACH_SAFE(ctrlr_trid, &nvme_bdev_ctrlr->trids, link, tmp_trid) { 2114 if (!spdk_nvme_transport_id_compare(&ctrlr_trid->trid, trid)) { 2115 TAILQ_REMOVE(&nvme_bdev_ctrlr->trids, ctrlr_trid, link); 2116 free(ctrlr_trid); 2117 return 0; 2118 } 2119 } 2120 2121 /* case 2A: The address isn't even in the registered list. */ 2122 return -ENXIO; 2123 } 2124 2125 int 2126 bdev_nvme_delete(const char *name) 2127 { 2128 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2129 struct nvme_probe_skip_entry *entry; 2130 2131 if (name == NULL) { 2132 return -EINVAL; 2133 } 2134 2135 pthread_mutex_lock(&g_bdev_nvme_mutex); 2136 2137 nvme_bdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name); 2138 if (nvme_bdev_ctrlr == NULL) { 2139 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2140 SPDK_ERRLOG("Failed to find NVMe controller\n"); 2141 return -ENODEV; 2142 } 2143 2144 /* The controller's destruction was already started */ 2145 if (nvme_bdev_ctrlr->destruct) { 2146 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2147 return 0; 2148 } 2149 2150 if (nvme_bdev_ctrlr->connected_trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { 2151 entry = calloc(1, sizeof(*entry)); 2152 if (!entry) { 2153 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2154 return -ENOMEM; 2155 } 2156 entry->trid = *nvme_bdev_ctrlr->connected_trid; 2157 TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq); 2158 } 2159 2160 nvme_bdev_ctrlr->destruct = true; 2161 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2162 2163 _nvme_bdev_ctrlr_destruct(nvme_bdev_ctrlr); 2164 2165 return 0; 2166 } 2167 2168 static int 2169 bdev_nvme_library_init(void) 2170 { 2171 g_bdev_nvme_init_thread = spdk_get_thread(); 2172 2173 spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_poll_group_create_cb, 2174 bdev_nvme_poll_group_destroy_cb, 2175 sizeof(struct nvme_bdev_poll_group), "bdev_nvme_poll_groups"); 2176 2177 return 0; 2178 } 2179 2180 static void 2181 bdev_nvme_library_fini(void) 2182 { 2183 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr, *tmp; 2184 struct nvme_probe_skip_entry *entry, *entry_tmp; 2185 2186 spdk_poller_unregister(&g_hotplug_poller); 2187 free(g_hotplug_probe_ctx); 2188 2189 TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) { 2190 TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq); 2191 free(entry); 2192 } 2193 2194 pthread_mutex_lock(&g_bdev_nvme_mutex); 2195 TAILQ_FOREACH_SAFE(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq, tmp) { 2196 if (nvme_bdev_ctrlr->destruct) { 2197 /* This controller's destruction was already started 2198 * before the application started shutting down 2199 */ 2200 continue; 2201 } 2202 nvme_bdev_ctrlr->destruct = true; 2203 2204 spdk_thread_send_msg(nvme_bdev_ctrlr->thread, _nvme_bdev_ctrlr_destruct, 2205 nvme_bdev_ctrlr); 2206 } 2207 2208 g_bdev_nvme_module_finish = true; 2209 if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) { 2210 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2211 spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL); 2212 spdk_bdev_module_finish_done(); 2213 return; 2214 } 2215 2216 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2217 } 2218 2219 static void 2220 bdev_nvme_verify_pi_error(struct spdk_bdev_io *bdev_io) 2221 { 2222 struct spdk_bdev *bdev = bdev_io->bdev; 2223 struct spdk_dif_ctx dif_ctx; 2224 struct spdk_dif_error err_blk = {}; 2225 int rc; 2226 2227 rc = spdk_dif_ctx_init(&dif_ctx, 2228 bdev->blocklen, bdev->md_len, bdev->md_interleave, 2229 bdev->dif_is_head_of_md, bdev->dif_type, bdev->dif_check_flags, 2230 bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0); 2231 if (rc != 0) { 2232 SPDK_ERRLOG("Initialization of DIF 
context failed\n"); 2233 return; 2234 } 2235 2236 if (bdev->md_interleave) { 2237 rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2238 bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2239 } else { 2240 struct iovec md_iov = { 2241 .iov_base = bdev_io->u.bdev.md_buf, 2242 .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len, 2243 }; 2244 2245 rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 2246 &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk); 2247 } 2248 2249 if (rc != 0) { 2250 SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n", 2251 err_blk.err_type, err_blk.err_offset); 2252 } else { 2253 SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n"); 2254 } 2255 } 2256 2257 static void 2258 bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2259 { 2260 struct nvme_bdev_io *bio = ref; 2261 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2262 2263 if (spdk_nvme_cpl_is_success(cpl)) { 2264 /* Run PI verification for read data buffer. */ 2265 bdev_nvme_verify_pi_error(bdev_io); 2266 } 2267 2268 /* Return original completion status */ 2269 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, 2270 bio->cpl.status.sc); 2271 } 2272 2273 static void 2274 bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl) 2275 { 2276 struct nvme_bdev_io *bio = ref; 2277 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2278 struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt; 2279 struct nvme_io_channel *nvme_ch; 2280 int ret; 2281 2282 if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) { 2283 SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n", 2284 cpl->status.sct, cpl->status.sc); 2285 2286 /* Save completion status to use after verifying PI error. */ 2287 bio->cpl = *cpl; 2288 2289 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2290 2291 /* Read without PI checking to verify PI error. */ 2292 ret = bdev_nvme_no_pi_readv(nbdev->nvme_ns->ns, 2293 nvme_ch->qpair, 2294 bio, 2295 bdev_io->u.bdev.iovs, 2296 bdev_io->u.bdev.iovcnt, 2297 bdev_io->u.bdev.md_buf, 2298 bdev_io->u.bdev.num_blocks, 2299 bdev_io->u.bdev.offset_blocks); 2300 if (ret == 0) { 2301 return; 2302 } 2303 } 2304 2305 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2306 } 2307 2308 static void 2309 bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2310 { 2311 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2312 2313 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2314 SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n", 2315 cpl->status.sct, cpl->status.sc); 2316 /* Run PI verification for write data buffer if PI error is detected. */ 2317 bdev_nvme_verify_pi_error(bdev_io); 2318 } 2319 2320 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2321 } 2322 2323 static void 2324 bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2325 { 2326 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2327 2328 if (spdk_nvme_cpl_is_pi_error(cpl)) { 2329 SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n", 2330 cpl->status.sct, cpl->status.sc); 2331 /* Run PI verification for compare data buffer if PI error is detected. 
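		 * bdev_nvme_verify_pi_error() only logs the failing block; the
		 * original completion status is still returned to the bdev layer below.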
*/ 2332 bdev_nvme_verify_pi_error(bdev_io); 2333 } 2334 2335 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2336 } 2337 2338 static void 2339 bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl) 2340 { 2341 struct nvme_bdev_io *bio = ref; 2342 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2343 2344 /* Compare operation completion */ 2345 if ((cpl->cdw0 & 0xFF) == SPDK_NVME_OPC_COMPARE) { 2346 /* Save compare result for write callback */ 2347 bio->cpl = *cpl; 2348 return; 2349 } 2350 2351 /* Write operation completion */ 2352 if (spdk_nvme_cpl_is_error(&bio->cpl)) { 2353 /* If bio->cpl is already an error, it means the compare operation failed. In that case, 2354 * complete the IO with the compare operation's status. 2355 */ 2356 if (!spdk_nvme_cpl_is_error(cpl)) { 2357 SPDK_ERRLOG("Unexpected write success after compare failure.\n"); 2358 } 2359 2360 spdk_bdev_io_complete_nvme_status(bdev_io, bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2361 } else { 2362 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2363 } 2364 } 2365 2366 static void 2367 bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) 2368 { 2369 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); 2370 2371 spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc); 2372 } 2373 2374 static void 2375 bdev_nvme_admin_passthru_completion(void *ctx) 2376 { 2377 struct nvme_bdev_io *bio = ctx; 2378 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2379 2380 spdk_bdev_io_complete_nvme_status(bdev_io, 2381 bio->cpl.cdw0, bio->cpl.status.sct, bio->cpl.status.sc); 2382 } 2383 2384 static void 2385 bdev_nvme_abort_completion(void *ctx) 2386 { 2387 struct nvme_bdev_io *bio = ctx; 2388 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2389 2390 if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) { 2391 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2392 } else { 2393 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2394 } 2395 } 2396 2397 static void 2398 bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl) 2399 { 2400 struct nvme_bdev_io *bio = ref; 2401 2402 bio->cpl = *cpl; 2403 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2404 } 2405 2406 static void 2407 bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) 2408 { 2409 struct nvme_bdev_io *bio = ref; 2410 2411 bio->cpl = *cpl; 2412 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); 2413 } 2414 2415 static void 2416 bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) 2417 { 2418 struct nvme_bdev_io *bio = ref; 2419 struct iovec *iov; 2420 2421 bio->iov_offset = sgl_offset; 2422 for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { 2423 iov = &bio->iovs[bio->iovpos]; 2424 if (bio->iov_offset < iov->iov_len) { 2425 break; 2426 } 2427 2428 bio->iov_offset -= iov->iov_len; 2429 } 2430 } 2431 2432 static int 2433 bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) 2434 { 2435 struct nvme_bdev_io *bio = ref; 2436 struct iovec *iov; 2437 2438 assert(bio->iovpos < bio->iovcnt); 2439 2440 iov = &bio->iovs[bio->iovpos]; 2441 2442 *address = iov->iov_base; 2443 *length = iov->iov_len; 2444 2445 if (bio->iov_offset) { 2446 assert(bio->iov_offset <= iov->iov_len); 2447 *address += bio->iov_offset; 2448 *length -= bio->iov_offset; 2449 } 
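	/* Advance past the bytes returned in this SGE and move to the next iovec
	 * once the current one has been fully consumed.
	 */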
2450 2451 bio->iov_offset += *length; 2452 if (bio->iov_offset == iov->iov_len) { 2453 bio->iovpos++; 2454 bio->iov_offset = 0; 2455 } 2456 2457 return 0; 2458 } 2459 2460 static void 2461 bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset) 2462 { 2463 struct nvme_bdev_io *bio = ref; 2464 struct iovec *iov; 2465 2466 bio->fused_iov_offset = sgl_offset; 2467 for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) { 2468 iov = &bio->fused_iovs[bio->fused_iovpos]; 2469 if (bio->fused_iov_offset < iov->iov_len) { 2470 break; 2471 } 2472 2473 bio->fused_iov_offset -= iov->iov_len; 2474 } 2475 } 2476 2477 static int 2478 bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length) 2479 { 2480 struct nvme_bdev_io *bio = ref; 2481 struct iovec *iov; 2482 2483 assert(bio->fused_iovpos < bio->fused_iovcnt); 2484 2485 iov = &bio->fused_iovs[bio->fused_iovpos]; 2486 2487 *address = iov->iov_base; 2488 *length = iov->iov_len; 2489 2490 if (bio->fused_iov_offset) { 2491 assert(bio->fused_iov_offset <= iov->iov_len); 2492 *address += bio->fused_iov_offset; 2493 *length -= bio->fused_iov_offset; 2494 } 2495 2496 bio->fused_iov_offset += *length; 2497 if (bio->fused_iov_offset == iov->iov_len) { 2498 bio->fused_iovpos++; 2499 bio->fused_iov_offset = 0; 2500 } 2501 2502 return 0; 2503 } 2504 2505 static int 2506 bdev_nvme_no_pi_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2507 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2508 void *md, uint64_t lba_count, uint64_t lba) 2509 { 2510 int rc; 2511 2512 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n", 2513 lba_count, lba); 2514 2515 bio->iovs = iov; 2516 bio->iovcnt = iovcnt; 2517 bio->iovpos = 0; 2518 bio->iov_offset = 0; 2519 2520 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2521 bdev_nvme_no_pi_readv_done, bio, 0, 2522 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2523 md, 0, 0); 2524 2525 if (rc != 0 && rc != -ENOMEM) { 2526 SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc); 2527 } 2528 return rc; 2529 } 2530 2531 static int 2532 bdev_nvme_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2533 struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt, 2534 void *md, uint64_t lba_count, uint64_t lba, uint32_t flags) 2535 { 2536 int rc; 2537 2538 SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2539 lba_count, lba); 2540 2541 bio->iovs = iov; 2542 bio->iovcnt = iovcnt; 2543 bio->iovpos = 0; 2544 bio->iov_offset = 0; 2545 2546 if (iovcnt == 1) { 2547 rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base, md, lba, 2548 lba_count, 2549 bdev_nvme_readv_done, bio, 2550 flags, 2551 0, 0); 2552 } else { 2553 rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count, 2554 bdev_nvme_readv_done, bio, flags, 2555 bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, 2556 md, 0, 0); 2557 } 2558 2559 if (rc != 0 && rc != -ENOMEM) { 2560 SPDK_ERRLOG("readv failed: rc = %d\n", rc); 2561 } 2562 return rc; 2563 } 2564 2565 static int 2566 bdev_nvme_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2567 struct nvme_bdev_io *bio, 2568 struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba, 2569 uint32_t flags) 2570 { 2571 int rc; 2572 2573 SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n", 2574 lba_count, lba); 2575 2576 bio->iovs = iov; 2577 bio->iovcnt = iovcnt; 2578 bio->iovpos = 0; 2579 bio->iov_offset 
= 0;

	if (iovcnt == 1) {
		rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base, md, lba,
						    lba_count,
						    bdev_nvme_writev_done, bio,
						    flags,
						    0, 0);
	} else {
		rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
						     bdev_nvme_writev_done, bio, flags,
						     bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
						     md, 0, 0);
	}

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("writev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		   struct nvme_bdev_io *bio,
		   struct iovec *iov, int iovcnt, void *md, uint64_t lba_count, uint64_t lba,
		   uint32_t flags)
{
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = iov;
	bio->iovcnt = iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;

	rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
					       bdev_nvme_comparev_done, bio, flags,
					       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
					       md, 0, 0);

	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
	}
	return rc;
}

static int
bdev_nvme_comparev_and_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
			      struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
			      struct iovec *write_iov, int write_iovcnt,
			      void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
{
	struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
	int rc;

	SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
		      lba_count, lba);

	bio->iovs = cmp_iov;
	bio->iovcnt = cmp_iovcnt;
	bio->iovpos = 0;
	bio->iov_offset = 0;
	bio->fused_iovs = write_iov;
	bio->fused_iovcnt = write_iovcnt;
	bio->fused_iovpos = 0;
	bio->fused_iov_offset = 0;

	if (bdev_io->num_retries == 0) {
		bio->first_fused_submitted = false;
	}

	if (!bio->first_fused_submitted) {
		flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		memset(&bio->cpl, 0, sizeof(bio->cpl));

		rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
						       bdev_nvme_comparev_and_writev_done, bio, flags,
						       bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
		if (rc == 0) {
			bio->first_fused_submitted = true;
			flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
		} else {
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("compare failed: rc = %d\n", rc);
			}
			return rc;
		}
	}

	flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;

	rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
					     bdev_nvme_comparev_and_writev_done, bio, flags,
					     bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
	if (rc != 0 && rc != -ENOMEM) {
		SPDK_ERRLOG("write failed: rc = %d\n", rc);
		rc = 0;
	}

	return rc;
}

static int
bdev_nvme_unmap(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
		struct nvme_bdev_io *bio,
		uint64_t offset_blocks,
		uint64_t num_blocks)
{
	struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
	struct spdk_nvme_dsm_range *range;
	uint64_t offset, remaining;
	uint64_t num_ranges_u64;
	uint16_t num_ranges;
	int rc;

	num_ranges_u64 =
(num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / 2697 SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2698 if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { 2699 SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); 2700 return -EINVAL; 2701 } 2702 num_ranges = (uint16_t)num_ranges_u64; 2703 2704 offset = offset_blocks; 2705 remaining = num_blocks; 2706 range = &dsm_ranges[0]; 2707 2708 /* Fill max-size ranges until the remaining blocks fit into one range */ 2709 while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { 2710 range->attributes.raw = 0; 2711 range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2712 range->starting_lba = offset; 2713 2714 offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2715 remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; 2716 range++; 2717 } 2718 2719 /* Final range describes the remaining blocks */ 2720 range->attributes.raw = 0; 2721 range->length = remaining; 2722 range->starting_lba = offset; 2723 2724 rc = spdk_nvme_ns_cmd_dataset_management(ns, qpair, 2725 SPDK_NVME_DSM_ATTR_DEALLOCATE, 2726 dsm_ranges, num_ranges, 2727 bdev_nvme_queued_done, bio); 2728 2729 return rc; 2730 } 2731 2732 static int 2733 bdev_nvme_admin_passthru(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2734 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2735 { 2736 uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ch->ctrlr->ctrlr); 2737 2738 if (nbytes > max_xfer_size) { 2739 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2740 return -EINVAL; 2741 } 2742 2743 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2744 2745 return spdk_nvme_ctrlr_cmd_admin_raw(nvme_ch->ctrlr->ctrlr, cmd, buf, 2746 (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); 2747 } 2748 2749 static int 2750 bdev_nvme_io_passthru(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2751 struct nvme_bdev_io *bio, 2752 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) 2753 { 2754 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2755 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2756 2757 if (nbytes > max_xfer_size) { 2758 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2759 return -EINVAL; 2760 } 2761 2762 /* 2763 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2764 * so fill it out automatically. 2765 */ 2766 cmd->nsid = spdk_nvme_ns_get_id(ns); 2767 2768 return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf, 2769 (uint32_t)nbytes, bdev_nvme_queued_done, bio); 2770 } 2771 2772 static int 2773 bdev_nvme_io_passthru_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, 2774 struct nvme_bdev_io *bio, 2775 struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) 2776 { 2777 size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns); 2778 uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns); 2779 struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns); 2780 2781 if (nbytes > max_xfer_size) { 2782 SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); 2783 return -EINVAL; 2784 } 2785 2786 if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) { 2787 SPDK_ERRLOG("invalid meta data buffer size\n"); 2788 return -EINVAL; 2789 } 2790 2791 /* 2792 * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, 2793 * so fill it out automatically. 
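	 * The metadata buffer length was already validated against the namespace
	 * metadata size above.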
2794 */ 2795 cmd->nsid = spdk_nvme_ns_get_id(ns); 2796 2797 return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf, 2798 (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); 2799 } 2800 2801 static void 2802 bdev_nvme_abort_admin_cmd(void *ctx) 2803 { 2804 struct nvme_bdev_io *bio = ctx; 2805 struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); 2806 struct nvme_io_channel *nvme_ch; 2807 struct nvme_bdev_io *bio_to_abort; 2808 int rc; 2809 2810 nvme_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io)); 2811 bio_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx; 2812 2813 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2814 NULL, 2815 bio_to_abort, 2816 bdev_nvme_abort_done, bio); 2817 if (rc == -ENOENT) { 2818 /* If no admin command was found in admin qpair, complete the abort 2819 * request with failure. 2820 */ 2821 bio->cpl.cdw0 |= 1U; 2822 bio->cpl.status.sc = SPDK_NVME_SC_SUCCESS; 2823 bio->cpl.status.sct = SPDK_NVME_SCT_GENERIC; 2824 2825 spdk_thread_send_msg(bio->orig_thread, bdev_nvme_abort_completion, bio); 2826 } 2827 } 2828 2829 static int 2830 bdev_nvme_abort(struct nvme_io_channel *nvme_ch, struct nvme_bdev_io *bio, 2831 struct nvme_bdev_io *bio_to_abort) 2832 { 2833 int rc; 2834 2835 bio->orig_thread = spdk_io_channel_get_thread(spdk_io_channel_from_ctx(nvme_ch)); 2836 2837 rc = spdk_nvme_ctrlr_cmd_abort_ext(nvme_ch->ctrlr->ctrlr, 2838 nvme_ch->qpair, 2839 bio_to_abort, 2840 bdev_nvme_abort_done, bio); 2841 if (rc == -ENOENT) { 2842 /* If no command was found in I/O qpair, the target command may be 2843 * admin command. Only a single thread tries aborting admin command 2844 * to clean I/O flow. 2845 */ 2846 spdk_thread_send_msg(nvme_ch->ctrlr->thread, 2847 bdev_nvme_abort_admin_cmd, bio); 2848 rc = 0; 2849 } 2850 2851 return rc; 2852 } 2853 2854 static void 2855 nvme_ctrlr_config_json_standard_namespace(struct spdk_json_write_ctx *w, 2856 struct nvme_bdev_ns *nvme_ns) 2857 { 2858 /* nop */ 2859 } 2860 2861 static void 2862 nvme_namespace_config_json(struct spdk_json_write_ctx *w, struct nvme_bdev_ns *nvme_ns) 2863 { 2864 g_config_json_namespace_fn[nvme_ns->type](w, nvme_ns); 2865 } 2866 2867 static void 2868 bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w) 2869 { 2870 const char *action; 2871 2872 if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { 2873 action = "reset"; 2874 } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { 2875 action = "abort"; 2876 } else { 2877 action = "none"; 2878 } 2879 2880 spdk_json_write_object_begin(w); 2881 2882 spdk_json_write_named_string(w, "method", "bdev_nvme_set_options"); 2883 2884 spdk_json_write_named_object_begin(w, "params"); 2885 spdk_json_write_named_string(w, "action_on_timeout", action); 2886 spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); 2887 spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms); 2888 spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); 2889 spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst); 2890 spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight); 2891 spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight); 2892 spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight); 2893 spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); 2894 spdk_json_write_named_uint64(w, 
"nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us); 2895 spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests); 2896 spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit); 2897 spdk_json_write_object_end(w); 2898 2899 spdk_json_write_object_end(w); 2900 } 2901 2902 static void 2903 nvme_bdev_ctrlr_config_json(struct spdk_json_write_ctx *w, 2904 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr) 2905 { 2906 struct spdk_nvme_transport_id *trid; 2907 2908 trid = nvme_bdev_ctrlr->connected_trid; 2909 2910 spdk_json_write_object_begin(w); 2911 2912 spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller"); 2913 2914 spdk_json_write_named_object_begin(w, "params"); 2915 spdk_json_write_named_string(w, "name", nvme_bdev_ctrlr->name); 2916 nvme_bdev_dump_trid_json(trid, w); 2917 spdk_json_write_named_bool(w, "prchk_reftag", 2918 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0); 2919 spdk_json_write_named_bool(w, "prchk_guard", 2920 (nvme_bdev_ctrlr->prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0); 2921 2922 spdk_json_write_object_end(w); 2923 2924 spdk_json_write_object_end(w); 2925 } 2926 2927 static void 2928 bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w) 2929 { 2930 spdk_json_write_object_begin(w); 2931 spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug"); 2932 2933 spdk_json_write_named_object_begin(w, "params"); 2934 spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); 2935 spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); 2936 spdk_json_write_object_end(w); 2937 2938 spdk_json_write_object_end(w); 2939 } 2940 2941 static int 2942 bdev_nvme_config_json(struct spdk_json_write_ctx *w) 2943 { 2944 struct nvme_bdev_ctrlr *nvme_bdev_ctrlr; 2945 uint32_t nsid; 2946 2947 bdev_nvme_opts_config_json(w); 2948 2949 pthread_mutex_lock(&g_bdev_nvme_mutex); 2950 2951 TAILQ_FOREACH(nvme_bdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) { 2952 nvme_bdev_ctrlr_config_json(w, nvme_bdev_ctrlr); 2953 2954 for (nsid = 0; nsid < nvme_bdev_ctrlr->num_ns; ++nsid) { 2955 if (!nvme_bdev_ctrlr->namespaces[nsid]->populated) { 2956 continue; 2957 } 2958 2959 nvme_namespace_config_json(w, nvme_bdev_ctrlr->namespaces[nsid]); 2960 } 2961 } 2962 2963 /* Dump as last parameter to give all NVMe bdevs chance to be constructed 2964 * before enabling hotplug poller. 2965 */ 2966 bdev_nvme_hotplug_config_json(w); 2967 2968 pthread_mutex_unlock(&g_bdev_nvme_mutex); 2969 return 0; 2970 } 2971 2972 struct spdk_nvme_ctrlr * 2973 bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) 2974 { 2975 if (!bdev || bdev->module != &nvme_if) { 2976 return NULL; 2977 } 2978 2979 return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ns->ctrlr->ctrlr; 2980 } 2981 2982 SPDK_LOG_REGISTER_COMPONENT(bdev_nvme) 2983